In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, log_loss,f1_score,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
#import mlflow
#import mlflow.sklearn
from sklearn.ensemble import StackingClassifier
from nltk.tokenize import word_tokenize
import nltk
# Download the NLTK 'punkt' package; presumably needed by word_tokenize, which is used later — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#! pip install mlflow

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths used later in the notebook live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the input feature tables on the mounted Drive.
train_path = '/content/drive/MyDrive/quora/maintraindf.csv'
test_path = '/content/drive/MyDrive/quora/maintestdf.csv'

# Read both splits with pandas' default CSV settings (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Normalise the cleaned question text to plain strings before tokenising.
# Missing values are replaced with '' first: a bare astype(str) turns NaN into
# the literal text 'nan', which would then be tokenised and embedded as if it
# were a real word. An empty string tokenises to an empty list instead.
for frame in (train_df, test_df):
    for text_column in ('clean_text1', 'clean_text2'):
        frame[text_column] = frame[text_column].fillna('').astype(str)

In [None]:

# Tokenise both cleaned question columns of each split with word_tokenize and
# store the resulting token lists in new columns.
for frame in (train_df, test_df):
    frame['question1_token'] = frame['clean_text1'].apply(word_tokenize)
    frame['question2_token'] = frame['clean_text2'].apply(word_tokenize)

In [None]:
# Sanity check: dimensions of both splits and the training column names after the token columns were added.
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)

(323432, 36)
(80858, 36)
Index(['percentage_common_tokens', 'question1_length', 'question2_length',
       'length_difference', 'num_capital_letters1', 'num_capital_letters2',
       'num_question_marks1', 'num_question_marks2', 'starts_with_are',
       'starts_with_can', 'starts_with_how', 'clean_text1', 'clean_text2',
       'word_count1', 'word_count2', 'sentence_count1', 'sentence_count2',
       'avg_word_length1', 'avg_word_length2', 'unique_word_count',
       'similar_word_count', 'fuzzy_word_partial_ratio', 'token_set_ratio',
       'token_sort_ratio', 'word_overlap', 'jaccard_similarity',
       'levenshtein_distance', 'length_ratio', 'common_2grams',
       'common_3grams', 'average_word_frequency1', 'average_word_frequency2',
       'average_word_frequency_diff', 'is_duplicate', 'question1_token',
       'question2_token'],
      dtype='object')


In [None]:
# The cleaned-text columns have been tokenised above; remove them from both splits.
drop_columns = ['clean_text1', 'clean_text2']
train_df = train_df.drop(columns=drop_columns)
test_df = test_df.drop(columns=drop_columns)

In [None]:
# Display the remaining training columns after the text columns were dropped.
train_df.columns

Index(['percentage_common_tokens', 'question1_length', 'question2_length',
       'length_difference', 'num_capital_letters1', 'num_capital_letters2',
       'num_question_marks1', 'num_question_marks2', 'starts_with_are',
       'starts_with_can', 'starts_with_how', 'word_count1', 'word_count2',
       'sentence_count1', 'sentence_count2', 'avg_word_length1',
       'avg_word_length2', 'unique_word_count', 'similar_word_count',
       'fuzzy_word_partial_ratio', 'token_set_ratio', 'token_sort_ratio',
       'word_overlap', 'jaccard_similarity', 'levenshtein_distance',
       'length_ratio', 'common_2grams', 'common_3grams',
       'average_word_frequency1', 'average_word_frequency2',
       'average_word_frequency_diff', 'is_duplicate', 'question1_token',
       'question2_token'],
      dtype='object')

In [None]:
# Training corpus: every tokenised question, in the order
# train q1, train q2, test q1, test q2.
tokens = [
    sentence
    for frame in (train_df, test_df)
    for column in ('question1_token', 'question2_token')
    for sentence in frame[column]
]

# 200-dimensional Word2Vec with a 5-token context window, trained on 4 worker threads.
word2vec_model = Word2Vec(tokens, vector_size=200, window=5, min_count=1, workers=4)

def embedding(tokens, model=None):
    """Return the mean word vector of a tokenised sentence.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    model : optional
        Object exposing a ``wv`` attribute that supports ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``. When omitted, the notebook-level
        ``word2vec_model`` is used, so ``Series.apply(embedding)`` keeps
        working unchanged.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens known to the model, or
        a zero vector of the model's dimensionality when no token is known
        (for example, an empty token list).
    """
    vectors = (word2vec_model if model is None else model).wv
    known = [vectors[word] for word in tokens if word in vectors]
    if known:
        return np.mean(known, axis=0)
    # Size the fallback from the model rather than a hard-coded 200, so it
    # cannot drift out of sync with the vector_size the model was trained with.
    return np.zeros(vectors.vector_size)

# Attach the averaged sentence vector of each question to both splits.
for frame in (train_df, test_df):
    frame['embedding_question1'] = frame['question1_token'].apply(embedding)
    frame['embedding_question2'] = frame['question2_token'].apply(embedding)

In [None]:
def cos_similarity(row):
    """Cosine similarity between the two question embeddings stored in ``row``.

    ``row`` must provide 'embedding_question1' and 'embedding_question2'.
    Each vector is wrapped as a single-sample matrix for scikit-learn's
    ``cosine_similarity``; the only entry of the resulting 1x1 matrix is
    returned.
    """
    first, second = (row[key] for key in ('embedding_question1', 'embedding_question2'))
    return cosine_similarity([first], [second])[0][0]

# Row-wise cosine similarity of the two question embeddings, stored as a new feature column.
for frame in (train_df, test_df):
    frame['cos_similarity'] = frame.apply(cos_similarity, axis=1)

In [None]:
# Target labels for both splits.
y_train = train_df['is_duplicate']
y_test = test_df['is_duplicate']

# Embedding columns hold one vector per row; they are expanded separately below
# and kept out of the scaler.
columns_to_exclude = ['embedding_question1', 'embedding_question2']
# Feature frames: everything except the label and the raw token lists.
X_train = train_df.drop(['is_duplicate','question1_token','question2_token'], axis=1)
X_test = test_df.drop(['is_duplicate','question1_token','question2_token'],axis=1)
# Columns that go through the scaler: all feature columns except the embeddings.
columns_to_scale = [col for col in X_train.columns if col not in columns_to_exclude]

scaler = MinMaxScaler()


# Expand each embedding vector into 200 separate columns (q1_0..q1_199, q2_0..q2_199).
# The 200 here must match the vector_size the Word2Vec model was trained with.
# NOTE(review): these frames get a fresh default index; the concat below assumes
# train_df/test_df still carry the default index from read_csv — confirm.
train_quest1vec = pd.DataFrame(X_train['embedding_question1'].tolist(), columns=[f'q1_{i}' for i in range(200)])
train_quest2vec = pd.DataFrame(X_train['embedding_question2'].tolist(), columns=[f'q2_{i}' for i in range(200)])

test_quest1vec = pd.DataFrame(X_test['embedding_question1'].tolist(), columns=[f'q1_{i}' for i in range(200)])
test_quest2vec = pd.DataFrame(X_test['embedding_question2'].tolist(), columns=[f'q2_{i}' for i in range(200)])

# Fit the scaler on the training split only, then apply the same transform to the test split.
train_scaled_columns = scaler.fit_transform(X_train[columns_to_scale])
test_scaled_columns = scaler.transform(X_test[columns_to_scale])
# Scaled features are renamed positionally (scaled_0, scaled_1, ...), following the order of columns_to_scale.
train_scaled_data = pd.DataFrame(train_scaled_columns, columns=[f'scaled_{i}' for i in range(len(columns_to_scale))])
test_scaled_data = pd.DataFrame(test_scaled_columns, columns=[f'scaled_{i}' for i in range(len(columns_to_scale))])

# Final design matrices: q1 embedding columns, q2 embedding columns, then the scaled features.
X_train = pd.concat([train_quest1vec, train_quest2vec, train_scaled_data], axis=1)
X_test = pd.concat([test_quest1vec, test_quest2vec, test_scaled_data], axis=1)

In [None]:
# Give features and labels a fresh 0..n-1 index, then join them column-wise
# so each feature row sits next to its label.
trainx, trainy = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
train_df_2 = pd.concat([trainx, trainy], axis=1)

# Same treatment for the test split.
testx, testy = X_test.reset_index(drop=True), y_test.reset_index(drop=True)
test_df_2 = pd.concat([testx, testy], axis=1)

In [None]:
# Sanity check: the labelled tables should have one more column (the label) than X_train / X_test.
print(train_df_2.shape)
print(test_df_2.shape)

(323432, 433)
(80858, 433)


In [None]:
# Make the project folder the working directory; the relative 'pickle_files/...' paths
# and the relative sqlite MLflow URI used in later cells resolve against it.
os.chdir('/content/drive/MyDrive/quora') 

In [None]:
# Persist the labelled feature tables as CSV, without the index column.
# Note: train_path / test_path are re-bound here and no longer point at the input CSVs.
train_path = '/content/drive/MyDrive/quora/trainembeddings.csv'
test_path = '/content/drive/MyDrive/quora/testembeddings.csv'
for table, destination in ((train_df_2, train_path), (test_df_2, test_path)):
    table.to_csv(destination, index=False)

In [None]:
# Sanity check: dimensions of the design matrices and the generated column names (q1_*, q2_*, scaled_*).
print(X_train.shape)
print(X_test.shape)
print(X_train.columns)

(323432, 432)
(80858, 432)
Index(['q1_0', 'q1_1', 'q1_2', 'q1_3', 'q1_4', 'q1_5', 'q1_6', 'q1_7', 'q1_8',
       'q1_9',
       ...
       'scaled_22', 'scaled_23', 'scaled_24', 'scaled_25', 'scaled_26',
       'scaled_27', 'scaled_28', 'scaled_29', 'scaled_30', 'scaled_31'],
      dtype='object', length=432)


In [None]:
# Display the original feature names; position i in this list corresponds to column 'scaled_i' in X_train / X_test.
columns_to_scale

['percentage_common_tokens',
 'question1_length',
 'question2_length',
 'length_difference',
 'num_capital_letters1',
 'num_capital_letters2',
 'num_question_marks1',
 'num_question_marks2',
 'starts_with_are',
 'starts_with_can',
 'starts_with_how',
 'word_count1',
 'word_count2',
 'sentence_count1',
 'sentence_count2',
 'avg_word_length1',
 'avg_word_length2',
 'unique_word_count',
 'similar_word_count',
 'fuzzy_word_partial_ratio',
 'token_set_ratio',
 'token_sort_ratio',
 'word_overlap',
 'jaccard_similarity',
 'levenshtein_distance',
 'length_ratio',
 'common_2grams',
 'common_3grams',
 'average_word_frequency1',
 'average_word_frequency2',
 'average_word_frequency_diff',
 'cos_similarity']

In [None]:
# The `import mlflow` lines in the import cell are commented out, so calling
# mlflow unconditionally raises NameError on a fresh kernel. Configure tracking
# only when the module has actually been imported.
if 'mlflow' in globals():
    # Store runs in a local SQLite file, relative to the current working directory.
    mlflow.set_tracking_uri("sqlite:///mlflow.db")

    mlflow.set_experiment("Quora Question Pair Simmilarity")
else:
    print('mlflow is not imported; skipping experiment-tracking setup.')

In [None]:
from pickle import dump

# Persist the fitted scaler and the Word2Vec model under pickle_files/
# (relative to the current working directory).
os.makedirs('pickle_files', exist_ok=True)  # the target folder may not exist yet
for artifact, target in ((scaler, 'pickle_files/scaler.pkl'),
                         (word2vec_model, 'pickle_files/word2vec_model.pkl')):
    # The context manager closes the handle, so the pickle is fully written
    # to disk before any later cell reads or uploads it.
    with open(target, 'wb') as handle:
        dump(artifact, handle)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with library defaults, trained on the full training matrix.
lgm = LGBMClassifier()
lgm.fit(X_train, y_train)

# Class probabilities, then hard labels, for the test and training splits.
y_pred_test_proba1, y_pred_train_proba1 = lgm.predict_proba(X_test), lgm.predict_proba(X_train)
y_test_pred1, y_train_pred1 = lgm.predict(X_test), lgm.predict(X_train)

# Metrics on the test split.
log_loss_test_score1 = log_loss(y_test, y_pred_test_proba1)
f1score_test_1 = f1_score(y_test, y_test_pred1)
accuracy_test_1 = accuracy_score(y_test, y_test_pred1)
classificationtest = classification_report(y_test, y_test_pred1)

# The same metrics on the training split, for comparison with the test scores.
log_loss_train_score1 = log_loss(y_train, y_pred_train_proba1)
f1score_train_1 = f1_score(y_train, y_train_pred1)
accuracy_train_1 = accuracy_score(y_train, y_train_pred1)
classificationtrain = classification_report(y_train, y_train_pred1)

print('Log Loss Test:',log_loss_test_score1)
print('Log Loss Train:',log_loss_train_score1)

print('Classification Report Test\n', classificationtest)
print('******************************************************')
print('Classification Report Train\n', classificationtrain)

# MLflow logging for this model is currently disabled:
# mlflow.log_metric("Log Loss", log_loss_test_score1)
# mlflow.log_metric("F1 Score", f1score_test_1)
# mlflow.log_metric("Accuracy Score", accuracy_test_1)
# mlflow.sklearn.log_model(lgm, artifact_path="models")
# mlflow.log_artifact("pickle_files/scaler.pkl")
# mlflow.log_artifact("pickle_files/word2vec_model.pkl")

Log Loss Test: 0.40124464414832645
Log Loss Train: 0.3886427653618916
Classification Report Test
               precision    recall  f1-score   support

           0       0.84      0.84      0.84     50803
           1       0.73      0.73      0.73     30055

    accuracy                           0.80     80858
   macro avg       0.79      0.79      0.79     80858
weighted avg       0.80      0.80      0.80     80858

******************************************************
Classification Report Train
               precision    recall  f1-score   support

           0       0.85      0.85      0.85    204224
           1       0.75      0.74      0.74    119208

    accuracy                           0.81    323432
   macro avg       0.80      0.80      0.80    323432
weighted avg       0.81      0.81      0.81    323432



In [None]:
from sklearn.naive_bayes import GaussianNB

# MLflow run and tags for this model are currently disabled:
# with mlflow.start_run():
#   mlflow.set_tag("dev", "NIKAvengers")
#   mlflow.set_tag("algo", "Naive Bayes")

# Gaussian Naive Bayes with library defaults, trained on the full training matrix.
naivebayes = GaussianNB()
naivebayes.fit(X_train, y_train)

# Class probabilities, then hard labels, for the test and training splits.
y_pred_test_proba2, y_pred_train_proba2 = naivebayes.predict_proba(X_test), naivebayes.predict_proba(X_train)
y_test_pred2, y_train_pred2 = naivebayes.predict(X_test), naivebayes.predict(X_train)

# Metrics on the test split.
log_loss_test_score2 = log_loss(y_test, y_pred_test_proba2)
f1score_test_2 = f1_score(y_test, y_test_pred2)
accuracy_test_2 = accuracy_score(y_test, y_test_pred2)
classificationtest2 = classification_report(y_test, y_test_pred2)

# The same metrics on the training split, for comparison with the test scores.
log_loss_train_score2 = log_loss(y_train, y_pred_train_proba2)
f1score_train_2 = f1_score(y_train, y_train_pred2)
accuracy_train_2 = accuracy_score(y_train, y_train_pred2)
classificationtrain2 = classification_report(y_train, y_train_pred2)

print('Log Loss Test:',log_loss_test_score2)
print('Log Loss Train:',log_loss_train_score2)

print('Classification Report Test\n', classificationtest2)
print('******************************************************')
print('Classification Report Train\n', classificationtrain2)

# MLflow logging for this model is currently disabled:
# mlflow.log_metric("Log Loss", log_loss_test_score2)
# mlflow.log_metric("F1 Score", f1score_test_2)
# mlflow.log_metric("Accuracy Score", accuracy_test_2)
# mlflow.sklearn.log_model(naivebayes, artifact_path="models")
# mlflow.log_artifact("pickle_files/scaler.pkl")
# mlflow.log_artifact("pickle_files/word2vec_model.pkl")

Log Loss Test: 6.924440209193455
Log Loss Train: 6.973512179798788
Classification Report Test
               precision    recall  f1-score   support

           0       0.84      0.60      0.70     50803
           1       0.54      0.81      0.65     30055

    accuracy                           0.67     80858
   macro avg       0.69      0.70      0.67     80858
weighted avg       0.73      0.67      0.68     80858

******************************************************
Classification Report Train
               precision    recall  f1-score   support

           0       0.84      0.60      0.70    204224
           1       0.54      0.81      0.64    119208

    accuracy                           0.67    323432
   macro avg       0.69      0.70      0.67    323432
weighted avg       0.73      0.67      0.68    323432



In [None]:
import xgboost as xgb

# MLflow run and tags for this model are currently disabled:
# with mlflow.start_run():
#   mlflow.set_tag("dev", "NIKAvengers")
#   mlflow.set_tag("algo", "XGBoost")

# XGBoost classifier with library defaults, trained on the full training matrix.
xg_boost = xgb.XGBClassifier()
xg_boost.fit(X_train, y_train)

# Class probabilities, then hard labels, for the test and training splits.
y_pred_test_proba3, y_pred_train_proba3 = xg_boost.predict_proba(X_test), xg_boost.predict_proba(X_train)
y_test_pred3, y_train_pred3 = xg_boost.predict(X_test), xg_boost.predict(X_train)

# Metrics on the test split.
log_loss_test_score3 = log_loss(y_test, y_pred_test_proba3)
f1score_test_3 = f1_score(y_test, y_test_pred3)
accuracy_test_3 = accuracy_score(y_test, y_test_pred3)
classificationtest3 = classification_report(y_test, y_test_pred3)

# The same metrics on the training split, for comparison with the test scores.
log_loss_train_score3 = log_loss(y_train, y_pred_train_proba3)
f1score_train_3 = f1_score(y_train, y_train_pred3)
accuracy_train_3 = accuracy_score(y_train, y_train_pred3)
classificationtrain3 = classification_report(y_train, y_train_pred3)

print('Log Loss Test:',log_loss_test_score3)
print('Log Loss Train:',log_loss_train_score3)

print('Classification Report Test\n', classificationtest3)
print('******************************************************')
print('Classification Report Train\n', classificationtrain3)

# MLflow logging for this model is currently disabled:
# mlflow.log_metric("Log Loss", log_loss_test_score3)
# mlflow.log_metric("F1 Score", f1score_test_3)
# mlflow.log_metric("Accuracy Score", accuracy_test_3)
# mlflow.sklearn.log_model(xg_boost, artifact_path="models")
# mlflow.log_artifact("pickle_files/scaler.pkl")
# mlflow.log_artifact("pickle_files/word2vec_model.pkl")

Log Loss Test: 0.3787669353437042
Log Loss Train: 0.3301877660356633
Classification Report Test
               precision    recall  f1-score   support

           0       0.85      0.86      0.85     50803
           1       0.75      0.74      0.75     30055

    accuracy                           0.81     80858
   macro avg       0.80      0.80      0.80     80858
weighted avg       0.81      0.81      0.81     80858

******************************************************
Classification Report Train
               precision    recall  f1-score   support

           0       0.88      0.89      0.88    204224
           1       0.80      0.78      0.79    119208

    accuracy                           0.85    323432
   macro avg       0.84      0.84      0.84    323432
weighted avg       0.85      0.85      0.85    323432



In [None]:
# Show the hyper-parameters of the fitted LightGBM model (it was built with no arguments, so these are the defaults).
lgm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [None]:
# Show the hyper-parameters of the fitted XGBoost model (it was built with no arguments).
xg_boost.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters, except n_jobs=-1 so that fitting
# and prediction use all CPU cores. The recorded run of this cell with the
# single-threaded default ended in a KeyboardInterrupt on the full training matrix.
randomforest = RandomForestClassifier(n_jobs=-1)

# Fit the model on the training data
randomforest.fit(X_train, y_train)

# Class probabilities for the test and training splits
y_pred_test_proba4 = randomforest.predict_proba(X_test)
y_pred_train_proba4 = randomforest.predict_proba(X_train)

# Hard class labels for the test and training splits
y_test_pred4 = randomforest.predict(X_test)
y_train_pred4 = randomforest.predict(X_train)

# Log loss from the predicted probabilities
log_loss_test_score4 = log_loss(y_test, y_pred_test_proba4)
log_loss_train_score4 = log_loss(y_train, y_pred_train_proba4)

# F1 and accuracy from the hard labels
f1score_test_4 = f1_score(y_test, y_test_pred4)
f1score_train_4 = f1_score(y_train, y_train_pred4)

accuracy_test_4 = accuracy_score(y_test, y_test_pred4)
accuracy_train_4 = accuracy_score(y_train, y_train_pred4)

# Per-class precision / recall / F1 tables
classificationtest4= classification_report(y_test, y_test_pred4)
classificationtrain4= classification_report(y_train, y_train_pred4)

print('Log Loss Test:',log_loss_test_score4)
print('Log Loss Train:',log_loss_train_score4)

print('Classification Report Test\n', classificationtest4)
print('******************************************************')
print('Classification Report Train\n', classificationtrain4)

KeyboardInterrupt: ignored

In [None]:
from contextlib import nullcontext

from sklearn.ensemble import GradientBoostingClassifier

# The `import mlflow` lines in the import cell are commented out, so using
# mlflow unconditionally raises NameError on a fresh kernel. Pick the module up
# only if it has been imported; otherwise train and evaluate without tracking.
tracker = globals().get('mlflow')

with (tracker.start_run() if tracker is not None else nullcontext()):
    if tracker is not None:
        tracker.set_tag("dev", "NIKAvengers")
        tracker.set_tag("algo", "GradientBoostingClassifier")

    # Gradient boosting with library defaults, trained on the full training matrix.
    gradientboosting = GradientBoostingClassifier()
    gradientboosting.fit(X_train, y_train)

    # Class probabilities for the test and training splits
    y_pred_test_proba5 = gradientboosting.predict_proba(X_test)
    y_pred_train_proba5 = gradientboosting.predict_proba(X_train)

    # Hard class labels for the test and training splits
    y_test_pred5 = gradientboosting.predict(X_test)
    y_train_pred5 = gradientboosting.predict(X_train)

    # Log loss from the predicted probabilities
    log_loss_test_score5 = log_loss(y_test, y_pred_test_proba5)
    log_loss_train_score5 = log_loss(y_train, y_pred_train_proba5)

    # F1 and accuracy from the hard labels
    f1score_test_5 = f1_score(y_test, y_test_pred5)
    f1score_train_5 = f1_score(y_train, y_train_pred5)

    accuracy_test_5 = accuracy_score(y_test, y_test_pred5)
    accuracy_train_5 = accuracy_score(y_train, y_train_pred5)

    if tracker is not None:
        # Log the test-split metrics, the fitted model and the preprocessing artifacts.
        tracker.log_metric("Log Loss", log_loss_test_score5)
        tracker.log_metric("F1 Score", f1score_test_5)
        tracker.log_metric("Accuracy Score", accuracy_test_5)
        tracker.sklearn.log_model(gradientboosting, artifact_path="models")
        tracker.log_artifact("pickle_files/scaler.pkl")
        tracker.log_artifact("pickle_files/word2vec_model.pkl")

# Print the metrics so the results are visible even when tracking is off.
print('Log Loss Test:', log_loss_test_score5)
print('Log Loss Train:', log_loss_train_score5)
print('F1 Score Test:', f1score_test_5)
print('F1 Score Train:', f1score_train_5)
print('Accuracy Test:', accuracy_test_5)
print('Accuracy Train:', accuracy_train_5)

In [None]:
import xgboost as xgb

# Use GPU-enabled version of XGBoost
from xgboost import XGBClassifier
import lightgbm as lgb

# GPU-related parameters intended for LightGBM.
# NOTE(review): lgb_params is defined here but never passed to lgb.LGBMClassifier()
# below, so the LightGBM base model is built with no arguments.
lgb_params = {
    'device': 'gpu'
}

# Base models as (name, estimator) pairs: LightGBM with defaults,
# XGBoost with tree_method='gpu_hist', and a default random forest.
base_models = [
    ('lgm', lgb.LGBMClassifier()),
    ('xgb', XGBClassifier(tree_method='gpu_hist')),
    ('randomforest', RandomForestClassifier())
]

In [None]:


# Stack the base models defined in base_models; an XGBoost classifier
# (tree_method='gpu_hist') is the final estimator that combines their outputs.
# NOTE(review): 'gpu_hist' presumably requires a GPU runtime — confirm before running.
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=XGBClassifier(tree_method='gpu_hist'))
stacking_classifier.fit(X_train, y_train)

# Hard class labels for the test and training splits
y_pred = stacking_classifier.predict(X_test)
y_pred_train = stacking_classifier.predict(X_train)

# Class probabilities for the test and training splits
y_pred_proba = stacking_classifier.predict_proba(X_test)
y_pred_proba_train = stacking_classifier.predict_proba(X_train)

# Log loss from the predicted probabilities
log_loss_score = log_loss(y_test, y_pred_proba)
log_loss_score_train = log_loss(y_train, y_pred_proba_train)

# Per-class precision / recall / F1 tables for both splits
report_test = classification_report(y_test, y_pred)
report_train = classification_report(y_train, y_pred_train)

# Accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)

# F1 score on the test split
f1score = f1_score(y_test, y_pred)