In [None]:
# Import Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, log_loss
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Load preprocessed data
#
# The folder holding the cleaned CSVs is kept in a single constant so the
# notebook can be pointed at a different location by editing one line.
DATA_DIR = '/content/drive/MyDrive'

train_df = pd.read_csv(f'{DATA_DIR}/train_cleaned.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test_cleaned.csv')

In [None]:
# Print the column index of the training frame, then of the test frame,
# so the two schemas can be compared by eye.
for frame in (train_df, test_df):
    print(frame.columns)

Index(['qid1', 'qid2', 'question1', 'question2', 'question1_len',
       'question2_len', 'question_len_diff', 'is_duplicate', 'tokens_q1',
       'pos_tags_q1', 'tokens_q2', 'pos_tags_q2', 'tokens_q1_len',
       'tokens_q2_len', 'tokens_len_diff', 'word_overlap', 'bigram_similarity',
       'trigram_similarity', 'jaccard_similarity', 'levenshtein_distance',
       'fuzzy_ratio', 'token_set_ratio', 'token_sort_ratio',
       'cosine_similarity', 'word_similarity'],
      dtype='object')
Index(['qid1', 'qid2', 'question1', 'question2', 'question1_len',
       'question2_len', 'question_len_diff', 'is_duplicate', 'tokens_q1',
       'pos_tags_q1', 'tokens_q2', 'pos_tags_q2', 'tokens_q1_len',
       'tokens_q2_len', 'tokens_len_diff', 'word_overlap', 'bigram_similarity',
       'trigram_similarity', 'jaccard_similarity', 'levenshtein_distance',
       'fuzzy_ratio', 'token_set_ratio', 'token_sort_ratio',
       'cosine_similarity', 'word_similarity'],
      dtype='object')


In [None]:
# (rows, columns) of the train and test frames, displayed as the cell output.
(train_df.shape, test_df.shape)

((282898, 25), (121246, 25))

In [None]:
# Assign X and y for training and test data
#
# Columns removed from the feature matrix: the question ids, the raw question
# text, the token / POS-tag columns and the target itself. The list is defined
# once so the train and test splits are always reduced in exactly the same way.
NON_FEATURE_COLUMNS = [
    'qid1', 'qid2', 'is_duplicate', 'question1', 'question2',
    'tokens_q1', 'pos_tags_q1', 'tokens_q2', 'pos_tags_q2',
]

x_train = train_df.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_df['is_duplicate']

x_test = test_df.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_df['is_duplicate']

# The models are fitted on x_train and scored on x_test, so both must expose
# the same features in the same order.
assert list(x_train.columns) == list(x_test.columns), \
    "train and test feature columns differ"

# Print the shapes of the resulting sets
print("Training set shape:", x_train.shape)
print("Test set shape:", x_test.shape)

Training set shape: (282898, 16)
Test set shape: (121246, 16)


In [None]:
# Handling imbalanced data
#
# SMOTE chooses the neighbours it interpolates between by Euclidean distance.
# On raw features, columns with a large numeric range would dominate that
# distance, so the features are min-max scaled for the resampling step only.
resample_scaler = MinMaxScaler().fit(x_train)
x_train_unit = pd.DataFrame(
    resample_scaler.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)

smote = SMOTE(random_state=42)
x_balanced_unit, y_train_balanced = smote.fit_resample(x_train_unit, y_train)

# SMOTE's synthetic rows are linear interpolations between neighbours, so
# undoing the (affine) scaling gives valid rows in the original feature units.
# This keeps `x_train_balanced` a DataFrame in raw units, as later cells expect.
x_train_balanced = pd.DataFrame(
    resample_scaler.inverse_transform(x_balanced_unit),
    columns=x_train.columns,
)

# Class counts after resampling (displayed as the cell output).
y_train_balanced.value_counts()

0    178417
1    178417
Name: is_duplicate, dtype: int64

In [None]:
# Min-max scaling: learn each feature's min/max from the balanced training
# data, which maps the training features into the range [0, 1].
scaler = MinMaxScaler()
scaler.fit(x_train_balanced)

# The test data reuses the ranges learned from training, so test values that
# fall outside the training min/max can land outside [0, 1].
X_train = scaler.transform(x_train_balanced)
X_test = scaler.transform(x_test)

# **Modeling**

---



### **XGBoost Classifier**

In [None]:
# Build an XGBoost classifier with default hyper-parameters and train it on the
# scaled, balanced training data. fit() returns the estimator itself, so the
# fitted model is bound directly and nothing is displayed for this cell.
# NOTE(review): `xgb` is also the alias given to the xgboost module in the
# import cell; this assignment rebinds that name to the fitted model.
xgb = XGBClassifier().fit(X_train, y_train_balanced)

Evaluation on the train set

In [None]:
# Score the fitted XGBoost model on the balanced training data.
y_train_pred1 = xgb.predict(X_train)              # hard class labels
y_train_pred_proba1 = xgb.predict_proba(X_train)  # per-class probabilities

# Compute every metric first, then report them together.
train_accuracy1 = accuracy_score(y_train_balanced, y_train_pred1)
train_log_loss_score1 = log_loss(y_train_balanced, y_train_pred_proba1)
train_report1 = classification_report(y_train_balanced, y_train_pred1)

print("Accuracy:", train_accuracy1)
print("Log Loss:", train_log_loss_score1)
print(train_report1)

Accuracy: 0.8073922328029279
Log Loss: 0.37452232079958475
              precision    recall  f1-score   support

           0       0.84      0.76      0.80    178417
           1       0.78      0.85      0.82    178417

    accuracy                           0.81    356834
   macro avg       0.81      0.81      0.81    356834
weighted avg       0.81      0.81      0.81    356834



Evaluation on the test set

In [None]:
# Score the fitted XGBoost model on the held-out test data.
y_test_pred1 = xgb.predict(X_test)              # hard class labels on the test set
y_test_pred_proba1 = xgb.predict_proba(X_test)  # per-class probabilities on the test set

# Compute every metric first, then report them together.
test_accuracy1 = accuracy_score(y_test, y_test_pred1)
test_log_loss_score1 = log_loss(y_test, y_test_pred_proba1)
test_report1 = classification_report(y_test, y_test_pred1)

print("Accuracy:", test_accuracy1)
print("Log Loss:", test_log_loss_score1)
print(test_report1)

Accuracy: 0.7481154017452122
Log Loss: 0.46420163770898704
              precision    recall  f1-score   support

           0       0.83      0.75      0.79     76468
           1       0.64      0.75      0.69     44778

    accuracy                           0.75    121246
   macro avg       0.73      0.75      0.74    121246
weighted avg       0.76      0.75      0.75    121246



### **LGBM Classifier**

In [None]:
# Build a LightGBM classifier with default hyper-parameters and train it on the
# scaled, balanced training data. fit() returns the estimator itself.
lgbm = LGBMClassifier().fit(X_train, y_train_balanced)

# Keep the fitted estimator as the cell's displayed value.
lgbm

Evaluation on the train set

In [None]:
# Score the fitted LightGBM model on the balanced training data.
y_train_pred2 = lgbm.predict(X_train)              # hard class labels
y_train_pred_proba2 = lgbm.predict_proba(X_train)  # per-class probabilities

# Compute every metric first, then report them together.
train_accuracy2 = accuracy_score(y_train_balanced, y_train_pred2)
train_log_loss_score2 = log_loss(y_train_balanced, y_train_pred_proba2)
train_report2 = classification_report(y_train_balanced, y_train_pred2)

print("Accuracy:", train_accuracy2)
print("Log Loss:", train_log_loss_score2)
print(train_report2)

Accuracy: 0.8259358693398051
Log Loss: 0.35444490921190647
              precision    recall  f1-score   support

           0       0.84      0.80      0.82    178417
           1       0.81      0.85      0.83    178417

    accuracy                           0.83    356834
   macro avg       0.83      0.83      0.83    356834
weighted avg       0.83      0.83      0.83    356834



Evaluation on the test set

In [None]:
# Score the fitted LightGBM model on the held-out test data.
y_test_pred2 = lgbm.predict(X_test)              # hard class labels on the test set
y_test_pred_proba2 = lgbm.predict_proba(X_test)  # per-class probabilities on the test set

# Compute every metric first, then report them together.
test_accuracy2 = accuracy_score(y_test, y_test_pred2)
test_log_loss_score2 = log_loss(y_test, y_test_pred_proba2)
test_report2 = classification_report(y_test, y_test_pred2)

print("Accuracy:", test_accuracy2)
print("Log Loss:", test_log_loss_score2)
print(test_report2)

Accuracy: 0.7823680781221648
Log Loss: 0.4252285580598777
              precision    recall  f1-score   support

           0       0.85      0.80      0.82     76468
           1       0.69      0.75      0.72     44778

    accuracy                           0.78    121246
   macro avg       0.77      0.78      0.77    121246
weighted avg       0.79      0.78      0.78    121246



### **Random Forest Classifier**

In [None]:
# Initialize and train the Random Forest model
#
# A random forest draws bootstrap samples and random feature subsets, so
# without a seed every run gives a slightly different model and metrics.
# random_state=42 (the same seed used for SMOTE) makes the results repeatable;
# n_jobs=-1 builds the trees on all available cores and does not change the
# fitted model for a fixed seed.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train_balanced)

Evaluation on the train set

In [None]:
# Score the fitted Random Forest model on the balanced training data.
y_train_pred3 = rf.predict(X_train)              # hard class labels
y_train_pred_proba3 = rf.predict_proba(X_train)  # per-class probabilities

# Compute every metric first, then report them together.
train_accuracy3 = accuracy_score(y_train_balanced, y_train_pred3)
train_log_loss_score3 = log_loss(y_train_balanced, y_train_pred_proba3)
train_report3 = classification_report(y_train_balanced, y_train_pred3)

print("Accuracy:", train_accuracy3)
print("Log Loss:", train_log_loss_score3)
print(train_report3)

Accuracy: 0.9999831854587847
Log Loss: 0.09951146064144453
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    178417
           1       1.00      1.00      1.00    178417

    accuracy                           1.00    356834
   macro avg       1.00      1.00      1.00    356834
weighted avg       1.00      1.00      1.00    356834



Evaluation on the test set

In [None]:
# Score the fitted Random Forest model on the held-out test data.
y_test_pred3 = rf.predict(X_test)              # hard class labels on the test set
y_test_pred_proba3 = rf.predict_proba(X_test)  # per-class probabilities on the test set

# Compute every metric first, then report them together.
test_accuracy3 = accuracy_score(y_test, y_test_pred3)
test_log_loss_score3 = log_loss(y_test, y_test_pred_proba3)
test_report3 = classification_report(y_test, y_test_pred3)

print("Accuracy:", test_accuracy3)
print("Log Loss:", test_log_loss_score3)
print(test_report3)

Accuracy: 0.7911353776619435
Log Loss: 0.4242468063529653
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     76468
           1       0.70      0.76      0.73     44778

    accuracy                           0.79    121246
   macro avg       0.78      0.78      0.78    121246
weighted avg       0.80      0.79      0.79    121246



### **Ensemble**

In [None]:
# Soft-voting ensemble of the LightGBM, XGBoost and Random Forest estimators.
# voting='soft' averages the members' predicted class probabilities and picks
# the class with the highest average; 'hard' would take a majority vote of the
# predicted labels instead.
ensemble_classifier = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgboost', xgb), ('rf', rf)],
    voting='soft',
)

# Train the ensemble. VotingClassifier fits clones of the listed estimators,
# so the three models are trained again here.
ensemble_classifier.fit(X_train, y_train_balanced)

# Class labels and class probabilities for the train and test sets.
ensemble_predictions_train = ensemble_classifier.predict(X_train)
ensemble_predictions_proba_train = ensemble_classifier.predict_proba(X_train)
ensemble_predictions_test = ensemble_classifier.predict(X_test)
ensemble_predictions_proba_test = ensemble_classifier.predict_proba(X_test)

# Metrics on the (balanced) training set.
accuracy_train = accuracy_score(y_train_balanced, ensemble_predictions_train)
log_loss_train = log_loss(y_train_balanced, ensemble_predictions_proba_train)
report_train = classification_report(y_train_balanced, ensemble_predictions_train)

# Metrics on the test set.
accuracy_test = accuracy_score(y_test, ensemble_predictions_test)
log_loss_test = log_loss(y_test, ensemble_predictions_proba_test)
report_test = classification_report(y_test, ensemble_predictions_test)

# Report: training results first, then test results.
print("Train Accuracy:", accuracy_train)
print("Train Log Loss:", log_loss_train)
print("Train Classification Report:")
print(report_train)

print("Test Accuracy:", accuracy_test)
print("Test Log Loss:", log_loss_test)
print("Test Classification Report:")
print(report_test)


Train Accuracy: 0.9256432963226598
Train Log Loss: 0.23548461224602282
Train Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92    178417
           1       0.92      0.94      0.93    178417

    accuracy                           0.93    356834
   macro avg       0.93      0.93      0.93    356834
weighted avg       0.93      0.93      0.93    356834

Test Accuracy: 0.7928838889530376
Test Log Loss: 0.41113686286841183
Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     76468
           1       0.70      0.77      0.73     44778

    accuracy                           0.79    121246
   macro avg       0.78      0.79      0.78    121246
weighted avg       0.80      0.79      0.79    121246

