# Step 1 - Testing all possible models

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Load the data
nltk_data = pd.read_csv('nltk_embeddings.csv')
spacy_data = pd.read_csv('spacy_embeddings.csv')

# Drop unnecessary columns
columns_to_drop = ['id', 'qid1', 'qid2', 'clean_question1', 'clean_question2']
nltk_data.drop(columns=columns_to_drop, inplace=True)
spacy_data.drop(columns=columns_to_drop, inplace=True)

# Combining NLTK and spaCy embeddings
combined_data = pd.concat([nltk_data.drop(columns='is_duplicate'), spacy_data.drop(columns='is_duplicate')], axis=1)
target = spacy_data['is_duplicate']  # Assuming is_duplicate columns are the same in both dataframes

# Separate features and target
X = combined_data
y = target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a list to store results
results = []

# Function to evaluate models
def evaluate_model(model, name):
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='binary')
    results.append((name, accuracy, f1))

# Models to evaluate
models = [
    (LogisticRegression(), "Logistic Regression"),
    (RandomForestClassifier(), "Random Forest"),
    (GradientBoostingClassifier(), "Gradient Boosting"),
    (SVC(), "SVC"),
    (MLPClassifier(max_iter=300), "MLP Neural Network"),
    (GaussianNB(), "Gaussian Naive Bayes"),
    (KNeighborsClassifier(), "KNN")
]

# Evaluate models
for model, name in models:
    evaluate_model(model, name)

# Display results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score'])
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                  Model  Accuracy  F1 Score
0   Logistic Regression    0.7130  0.591168
1         Random Forest    0.7305  0.586973
2     Gradient Boosting    0.7480  0.642553
3                   SVC    0.7560  0.647399
4    MLP Neural Network    0.7370  0.651194
5  Gaussian Naive Bayes    0.6600  0.578686
6                   KNN    0.7115  0.578524


# Step 2 - Fine tuning top 3 models

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define pipelines for each model with StandardScaler and the model
pipelines = {
    'SVC': Pipeline([('scaler', StandardScaler()), ('SVC', SVC())]),
    'MLP': Pipeline([('scaler', StandardScaler()), ('MLP', MLPClassifier(max_iter=300))]),
    'GradientBoosting': Pipeline([('scaler', StandardScaler()), ('GradientBoosting', GradientBoostingClassifier())]),
}

# Define parameter grids for each model
param_grids = {
    'SVC': {
        'SVC__C': [0.1, 1, 10],
        'SVC__kernel': ['linear', 'rbf'],
        'SVC__gamma': ['scale', 'auto']
    },
    'MLP': {
        'MLP__hidden_layer_sizes': [(50,), (100,), (50,50)],
        'MLP__activation': ['relu', 'tanh'],
        'MLP__solver': ['adam', 'sgd'],
        'MLP__learning_rate_init': [0.001, 0.01],
    },
    'GradientBoosting': {
        'GradientBoosting__n_estimators': [100, 200],
        'GradientBoosting__learning_rate': [0.01, 0.1],
        'GradientBoosting__max_depth': [3, 5, 7]
    }
}

# Results dictionary
tuning_results = {}

# Perform grid search for each model
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    tuning_results[model_name] = {'Best Model': best_model, 'Best Params': best_params, 'Best Score': best_score}

# Display fine-tuning results
for model_name, results in tuning_results.items():
    print(f"Model: {model_name}")
    print(f"Best Score: {results['Best Score']}")
    print(f"Best Parameters: {results['Best Params']}\n")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits




Fitting 5 folds for each of 12 candidates, totalling 60 fits
Model: SVC
Best Score: 0.75725
Best Parameters: {'SVC__C': 1, 'SVC__gamma': 'scale', 'SVC__kernel': 'rbf'}

Model: MLP
Best Score: 0.7415
Best Parameters: {'MLP__activation': 'relu', 'MLP__hidden_layer_sizes': (50,), 'MLP__learning_rate_init': 0.001, 'MLP__solver': 'sgd'}

Model: GradientBoosting
Best Score: 0.7565000000000001
Best Parameters: {'GradientBoosting__learning_rate': 0.1, 'GradientBoosting__max_depth': 5, 'GradientBoosting__n_estimators': 200}

