# In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.svm import SVC

# NLTK resources used by preprocess_text: the English stop-word list, held
# as a set for fast membership tests, and a WordNet lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text) -> str:
    """Normalize one raw text value into a space-separated string of lemmas.

    Steps, in order: lowercase; remove URLs (any run of non-whitespace
    characters beginning with ``http`` or ``www``); remove every character
    that is not an ASCII letter or whitespace (digits and punctuation are
    dropped, not replaced by a space); remove stop words; lemmatize the
    remaining tokens.

    Args:
        text: Raw text. Non-string values, such as the ``None``/``NaN``
            that pandas uses for missing cells, are treated as empty text.

    Returns:
        The cleaned text, or ``''`` when nothing survives the filtering.
    """
    # A missing cell arrives as None or float('nan'); calling .lower() on it
    # would raise AttributeError and abort the whole Series.apply().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Load datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preprocess text data.
# pandas reads an empty 'text' cell as NaN (a float). preprocess_text starts
# with text.lower(), so substitute an empty string first; otherwise a single
# blank row would abort the whole run with AttributeError.
train_data['clean_text'] = train_data['text'].fillna('').apply(preprocess_text)
test_data['clean_text'] = test_data['text'].fillna('').apply(preprocess_text)

# Split training data into train/validation sets.
# stratify=y keeps the class proportions the same in both parts, so the
# validation F1/precision/recall are not skewed by an unlucky split.
X = train_data['clean_text']
y = train_data['target']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TFIDF Vectorizer.
# max_features and ngram_range here are only starting values: every search
# space in this script overrides them through tfidf__max_features and
# tfidf__ngram_range.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),  # unigrams up to trigrams
    stop_words='english'
)

# Feature selection.
# With k='all' SelectKBest keeps every feature, so this step is currently a
# pass-through; it is kept as a pipeline slot so k can be tuned later.
feature_selector = SelectKBest(chi2, k='all')

# Define models
ridge_model = RidgeClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
# The default max_iter=100 is frequently too low for the solver to converge
# on thousands of sparse TF-IDF features, especially with the weak
# regularization the search tries (C up to 1e3). An unconverged fit emits a
# ConvergenceWarning and can be scored below its true quality, so give the
# solver more room.
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
svc_model = SVC(random_state=42)

# Create a pipeline with a placeholder classifier; each search space swaps in
# its own model through the 'classifier' parameter.
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('feature_selection', feature_selector),
    ('classifier', ridge_model)  # Placeholder
])

# Search spaces for RandomizedSearchCV.
# The TF-IDF options are the same for every model, so they are declared once
# and merged into each model-specific space.
tfidf_search_space = {
    'tfidf__max_features': [3000, 4000, 5000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
}

# Ridge: regularization strength alpha from 1e-3 to 1e3
ridge_param_distributions = {
    **tfidf_search_space,
    'classifier': [ridge_model],
    'classifier__alpha': np.logspace(-3, 3, 7),
}

# Gradient boosting: number of trees and learning rate
gb_param_distributions = {
    **tfidf_search_space,
    'classifier': [gb_model],
    'classifier__n_estimators': [100, 200, 300, 400],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Logistic regression: inverse regularization strength C from 1e-3 to 1e3
logreg_param_distributions = {
    **tfidf_search_space,
    'classifier': [logreg_model],
    'classifier__C': np.logspace(-3, 3, 7),
}

# SVM: regularization parameter C and kernel type
svc_param_distributions = {
    **tfidf_search_space,
    'classifier': [svc_model],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
}

# Settings shared by every randomized search: 20 sampled candidates,
# 5-fold cross-validation, F1 scoring, and a fixed seed.
search_settings = dict(n_iter=20, cv=5, scoring='f1', verbose=2, random_state=42)

# One search per model family, all built on the same pipeline
ridge_randomized_search = RandomizedSearchCV(pipeline, ridge_param_distributions, **search_settings)
gb_randomized_search = RandomizedSearchCV(pipeline, gb_param_distributions, **search_settings)
logreg_randomized_search = RandomizedSearchCV(pipeline, logreg_param_distributions, **search_settings)
svc_randomized_search = RandomizedSearchCV(pipeline, svc_param_distributions, **search_settings)

# Fit the searches one after another on the training split
all_searches = (
    ridge_randomized_search,
    gb_randomized_search,
    logreg_randomized_search,
    svc_randomized_search,
)
for current_search in all_searches:
    current_search.fit(X_train, y_train)

# Keep the best pipeline found by each search
ridge_best_model, gb_best_model, logreg_best_model, svc_best_model = (
    finished_search.best_estimator_ for finished_search in all_searches
)

# Stacking ensemble: the four tuned pipelines are the base learners and a
# logistic regression is the meta-model.
base_estimators = [
    ('ridge', ridge_best_model),
    ('gb', gb_best_model),
    ('logreg', logreg_best_model),
    ('svc', svc_best_model),
]
meta_model = LogisticRegression()
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
)

# Train the ensemble on the training split
stacking_model.fit(X_train, y_train)

# Score the ensemble on the held-out validation split
y_val_pred = stacking_model.predict(X_val)
f1, precision, recall = (
    metric(y_val, y_val_pred)
    for metric in (f1_score, precision_score, recall_score)
)
print(f"Validation F1 Score: {f1}, Precision: {precision}, Recall: {recall}")

# Label the test set with the ensemble's predictions
test_predictions = stacking_model.predict(test_data['clean_text'])
test_data['target'] = test_predictions

# Write the id/target pairs out as the submission file
submission_path = 'submission_optimized_stacking.csv'
submission = test_data[['id', 'target']]
submission.to_csv(submission_path, index=False)
print(f"Submission file created: {submission_path}")
