# Modelling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def load_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['id', 'qid1', 'qid2', 'is_duplicate', 'clean_question1', 'clean_question2'])
    y = data['is_duplicate']
    return X, y

def train_model(X_train, y_train, model_pipeline):
    model_pipeline.fit(X_train, y_train)
    return model_pipeline

def evaluate_model(model_pipeline, X_test, y_test):
    predictions = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    return accuracy, f1, precision, recall

def run_model_pipeline(data_paths):
    results = pd.DataFrame(columns=['Dataset', 'Model', 'Accuracy', 'F1', 'Precision', 'Recall'])
    top_models = {
        'SVC': Pipeline([('scaler', StandardScaler()), ('SVC', SVC())]),
        'LogisticRegression': Pipeline([('scaler', StandardScaler()), ('LogisticRegression', LogisticRegression())]),
        'NaiveBayes': Pipeline([('scaler', StandardScaler()), ('NaiveBayes', GaussianNB())]),
        'KNN': Pipeline([('scaler', StandardScaler()), ('KNN', KNeighborsClassifier())]),
        'MLP': Pipeline([('scaler', StandardScaler()), ('MLP', MLPClassifier())]),
        'GradientBoosting': Pipeline([('scaler', StandardScaler()), ('GradientBoosting', GradientBoostingClassifier())]),
    }

    for model_choice in top_models.keys():
        for data_path in data_paths:
            X, y = load_data(data_path)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            model_pipeline = top_models[model_choice]
            trained_model = train_model(X_train, y_train, model_pipeline)
            accuracy, f1, precision, recall = evaluate_model(trained_model, X_test, y_test)
            new_result = pd.DataFrame({'Dataset': [data_path], 'Model': [model_choice], 'Accuracy': [accuracy], 'F1': [f1], 'Precision': [precision], 'Recall': [recall]})
            results = pd.concat([results, new_result], ignore_index=True)

    return results.groupby('Model').mean()

# Example usage
data_paths = ['nltk_embeddings.csv', 'spacy_embeddings.csv', 'nltk_embeddings_bert.csv', 'spacy_embeddings_bert.csv']
run_model_pipeline(data_paths)


  return results.groupby('Model').mean()


Unnamed: 0_level_0,Accuracy,F1,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoosting,0.703,0.589698,0.602802,0.577365
KNN,0.593,0.524799,0.466831,0.6125
LogisticRegression,0.668625,0.539385,0.556679,0.525
MLP,0.685625,0.582382,0.572579,0.593243
NaiveBayes,0.57575,0.588404,0.461285,0.815203
SVC,0.68475,0.548456,0.584668,0.520608


### We oberve that the top 3 models are GradientBoosting, MLP Classifier and SVC. So we would use grid search to perform hyperparamter  optimizaion of these.

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define pipelines for each model with StandardScaler and the model
pipelines = {
    'SVC': Pipeline([('scaler', StandardScaler()), ('SVC', SVC())]),
    'MLP': Pipeline([('scaler', StandardScaler()), ('MLP', MLPClassifier(max_iter=300))]),
    'GradientBoosting': Pipeline([('scaler', StandardScaler()), ('GradientBoosting', GradientBoostingClassifier())]),
}

# Define parameter grids for each model
param_grids = {
    'SVC': {
        'SVC__C': [0.1, 1, 10],
        'SVC__kernel': ['linear', 'rbf'],
        'SVC__gamma': ['scale', 'auto']
    },
    'MLP': {
        'MLP__hidden_layer_sizes': [(50,), (100,), (50,50)],
        'MLP__activation': ['relu', 'tanh'],
        'MLP__solver': ['adam', 'sgd'],
        'MLP__learning_rate_init': [0.001, 0.01],
    },
    'GradientBoosting': {
        'GradientBoosting__n_estimators': [100, 200],
        'GradientBoosting__learning_rate': [0.01, 0.1],
        'GradientBoosting__max_depth': [3, 5, 7]
    }
}

# Results dictionary
tuning_results = {}

# Load data and split it
X, y = load_data(data_paths[0])  # using the first dataset for hyperparameter tuning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search for each model
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    tuning_results[model_name] = {'Best Model': best_model, 'Best Params': best_params, 'Best Score': best_score}

# Display fine-tuning results
for model_name, results in tuning_results.items():
    print(f"Model: {model_name}")
    print(f"Best Score: {results['Best Score']}")
    print(f"Best Parameters: {results['Best Params']}\n")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits




Fitting 5 folds for each of 12 candidates, totalling 60 fits
Model: SVC
Best Score: 0.681875
Best Parameters: {'SVC__C': 10, 'SVC__gamma': 'scale', 'SVC__kernel': 'linear'}

Model: MLP
Best Score: 0.701875
Best Parameters: {'MLP__activation': 'relu', 'MLP__hidden_layer_sizes': (100,), 'MLP__learning_rate_init': 0.001, 'MLP__solver': 'sgd'}

Model: GradientBoosting
Best Score: 0.729
Best Parameters: {'GradientBoosting__learning_rate': 0.1, 'GradientBoosting__max_depth': 5, 'GradientBoosting__n_estimators': 200}



In [6]:

files = [
    'nltk_embeddings.csv',
    'spacy_embeddings.csv',
    'nltk_embeddings_bert.csv',
    'spacy_embeddings_bert.csv',
]

# Define a function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    return accuracy, f1, precision, recall

# Loop over each file
for file in files:
    print(f"Running models on file {file}")
    
    # Load data and split it
    X, y = load_data(file)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Loop over each model in tuning results
    for model_name, model_info in tuning_results.items():
        print(f"Training model {model_name} with best parameters")
        
        # Get the best parameters from tuning results
        best_params = model_info['Best Params']
        
        # Remove the model name prefix from the best parameters
        best_params = {k.replace(model_name+'__', ''): v for k, v in best_params.items()}
        
        # Initialize the model with the best parameters
        if model_name == 'SVC':
            model = SVC(**best_params)
        elif model_name == 'MLP':
            model = MLPClassifier(**best_params)
        elif model_name == 'GradientBoosting':
            model = GradientBoostingClassifier(**best_params)
        
        # Fit the model on the training data
        model.fit(X_train, y_train)
        
        # Evaluate the model on the test data
        accuracy, f1, precision, recall = evaluate_model(model, X_test, y_test)
        
        # Print the results
        print(f"Results for model {model_name} on file {file}:")
        print(f"Accuracy: {accuracy}")
        print(f"F1 Score: {f1}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}\n")


Running models on file nltk_embeddings.csv
Training model SVC with best parameters
Results for model SVC on file nltk_embeddings.csv:
Accuracy: 0.6845
F1 Score: 0.6847175264908864
Precision: 0.6849436081178641
Recall: 0.6845

Training model MLP with best parameters
Results for model MLP on file nltk_embeddings.csv:
Accuracy: 0.6775
F1 Score: 0.6826586487799645
Precision: 0.7236948607868243
Recall: 0.6775

Training model GradientBoosting with best parameters
Results for model GradientBoosting on file nltk_embeddings.csv:
Accuracy: 0.7135
F1 Score: 0.7127953344500032
Precision: 0.7121952601132279
Recall: 0.7135

Running models on file spacy_embeddings.csv
Training model SVC with best parameters
Results for model SVC on file spacy_embeddings.csv:
Accuracy: 0.682
F1 Score: 0.6820882841918752
Precision: 0.6821779318560673
Recall: 0.682

Training model MLP with best parameters
Results for model MLP on file spacy_embeddings.csv:
Accuracy: 0.6665
F1 Score: 0.6721961425046898
Precision: 0.70330

| Model Name        | Dataset Name            | Accuracy | F1 Score   | Precision  | Recall |
|-------------------|-------------------------|----------|------------|------------|--------|
| SVC               | nltk_embeddings.csv     | 0.6845   | 0.6847     | 0.6849     | 0.6845 |
| MLP               | nltk_embeddings.csv     | 0.6775   | 0.6827     | 0.7237     | 0.6775 |
| GradientBoosting  | nltk_embeddings.csv     | 0.7135   | 0.7128     | 0.7122     | 0.7135 |
| SVC               | spacy_embeddings.csv    | 0.682    | 0.6821     | 0.6822     | 0.682  |
| MLP               | spacy_embeddings.csv    | 0.6665   | 0.6722     | 0.7033     | 0.6665 |
| GradientBoosting  | spacy_embeddings.csv    | 0.715    | 0.7141     | 0.7133     | 0.715  |
| SVC               | nltk_embeddings_bert.csv| 0.6565   | 0.6600     | 0.6662     | 0.6565 |
| MLP               | nltk_embeddings_bert.csv| 0.6995   | 0.7024     | 0.7076     | 0.6995 |
| GradientBoosting  | nltk_embeddings_bert.csv| 0.702    | 0.6978     | 0.6962     | 0.702  |
| SVC               | spacy_embeddings_bert.csv| 0.6585  | 0.6623     | 0.6696     | 0.6585 |
| MLP               | spacy_embeddings_bert.csv| 0.688   | 0.6835     | 0.6817     | 0.688  |
| GradientBoosting  | spacy_embeddings_bert.csv| 0.7145  | 0.7119     | 0.7106     | 0.7145 |


We can observe that the GradientBoosting Cassifier is working the best with the spacy embedings, it is the highest f1 score


In [8]:
## Saving the GradientBoosting model trained on the spacy_embeddings.csv dataset into a joblib file but now on the entire dataset
from joblib import dump

# Load data
X, y = load_data('spacy_embeddings.csv')

# Initialize the model with the best parameters
best_params = tuning_results['GradientBoosting']['Best Params']
best_params = {k.replace(model_name+'__', ''): v for k, v in best_params.items()}
model = GradientBoostingClassifier(**best_params)

# Fit the model on the entire dataset
model.fit(X, y)

# Save the model to a file
dump(model, 'gbmodel.joblib')

['gbmodel.joblib']