In [None]:
from google.colab import drive

# Expose Google Drive inside the Colab runtime so the dataset files are reachable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the raw question-pair training data from the mounted Drive folder.
train_csv_path = '/content/drive/My Drive/minor2/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Show which columns the raw CSV provides.
print(df.columns)

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')


Preprocessing the dataset

In [None]:
# Lower-case both question columns so later word comparisons ignore case.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].str.lower()

In [None]:
import string

# Build the "delete every punctuation character" table once and reuse it
# for both question columns.
punctuation_table = str.maketrans('', '', string.punctuation)
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].str.translate(punctuation_table)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives O(1) membership checks for every token.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    Tokens are split on whitespace and re-joined with single spaces.
    Non-string values are returned unchanged: the raw CSV contains a few
    missing questions (float NaN), and rows with missing questions are only
    dropped in a later cell, so calling ``.split()`` on them here would raise
    ``AttributeError``.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word not in stop_words)


df['question1'] = df['question1'].apply(remove_stopwords)
df['question2'] = df['question2'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
question_cols = ['question1', 'question2']

# Missing-value counts before incomplete pairs are removed.
print(df[question_cols].isnull().sum())

# A pair is unusable if either question is missing.
df = df.dropna(subset=question_cols)

# Missing-value counts after the drop (both should now be zero).
print(df[question_cols].isnull().sum())

question1    1
question2    2
dtype: int64
question1    0
question2    0
dtype: int64


In [None]:
# Force both question columns to plain Python strings.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].astype(str)

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Trim leading/trailing whitespace left over from the cleaning steps.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].str.strip()

In [None]:
# Persist the cleaned frame (without the pandas index) next to the raw data.
cleaned_csv_path = '/content/drive/My Drive/minor2/quora_questions_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)

Feature Engineering

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
import string
import joblib

In [None]:
# Character length of each cleaned question.
df['q1_length'] = df['question1'].apply(len)
df['q2_length'] = df['question2'].apply(len)

# Number of whitespace-separated tokens in each question.
df['q1_word_count'] = df['question1'].apply(lambda x: len(x.split()))
df['q2_word_count'] = df['question2'].apply(lambda x: len(x.split()))

# Split every question into its set of distinct words once, then reuse the
# sets for both overlap features instead of re-splitting per row.
q1_word_sets = [set(text.split()) for text in df['question1']]
q2_word_sets = [set(text.split()) for text in df['question2']]

# Count of distinct words that appear in both questions.
df['common_words'] = [len(a & b) for a, b in zip(q1_word_sets, q2_word_sets)]

# Jaccard similarity: |intersection| / |union|.  When both questions have no
# tokens the union is empty, so the ratio is defined as 0.0 here instead of
# raising ZeroDivisionError.
df['word_overlap_ratio'] = [
    len(a & b) / len(a | b) if (a or b) else 0.0
    for a, b in zip(q1_word_sets, q2_word_sets)
]

print(df[['q1_length', 'q2_length', 'q1_word_count', 'q2_word_count', 'common_words', 'word_overlap_ratio']].head())

ZeroDivisionError: division by zero

In [None]:
# Rows whose question text is empty once surrounding whitespace is ignored.
q1_is_blank = df['question1'].str.strip() == ''
q2_is_blank = df['question2'].str.strip() == ''
empty_q1 = df[q1_is_blank]
empty_q2 = df[q2_is_blank]

print("Empty question1 rows:", len(empty_q1))
print("Empty question2 rows:", len(empty_q2))

Empty question1 rows: 0
Empty question2 rows: 0


In [None]:
# Substitute a placeholder token for questions that are exactly the empty string.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].replace('', 'unknown')

In [None]:
def calculate_word_overlap_ratio(row):
    """Return the Jaccard word overlap between ``row['question1']`` and ``row['question2']``.

    Each question is split on whitespace into a set of distinct words; the
    result is ``|shared words| / |all distinct words|``.  If neither question
    contains any word the function returns ``0.0`` rather than dividing by
    zero.
    """
    first_words = set(row['question1'].split())
    second_words = set(row['question2'].split())

    all_words = first_words | second_words
    if not all_words:
        return 0.0

    shared_words = first_words & second_words
    return len(shared_words) / len(all_words)


# Row-wise (axis=1) application of the guarded helper; overwrites any existing
# 'word_overlap_ratio' column.
df['word_overlap_ratio'] = df.apply(calculate_word_overlap_ratio, axis=1)

In [None]:
# Preview the first rows of the hand-crafted features.
feature_cols = ['q1_length', 'q2_length', 'q1_word_count', 'q2_word_count', 'common_words', 'word_overlap_ratio']
print(df[feature_cols].head())

   q1_length  q2_length  q1_word_count  q2_word_count  common_words  \
0         41         35              7              6             5   
1         31         67              4              9             2   
2         44         36              6              5             2   
3         21         40              3              5             0   
4         60         29             10              5             2   

   word_overlap_ratio  
0            0.833333  
1            0.222222  
2            0.222222  
3            0.000000  
4            0.153846  


In [None]:
# Rows whose question text is empty once surrounding whitespace is ignored.
empty_q1 = df[df['question1'].str.strip().eq('')]
empty_q2 = df[df['question2'].str.strip().eq('')]

print("Empty question1 rows:", len(empty_q1))
print("Empty question2 rows:", len(empty_q2))

question_pairs = (('q1', 'question1'), ('q2', 'question2'))

# Substitute a placeholder token for questions that are exactly the empty string.
for _, question_col in question_pairs:
    df[question_col] = df[question_col].replace('', 'unknown')

# Character length of each question.
for prefix, question_col in question_pairs:
    df[f'{prefix}_length'] = df[question_col].map(len)

# Number of whitespace-separated tokens in each question.
for prefix, question_col in question_pairs:
    df[f'{prefix}_word_count'] = df[question_col].map(lambda text: len(text.split()))


def count_common_words(row):
    """Number of distinct words that occur in both questions of ``row``."""
    return len(set(row['question1'].split()) & set(row['question2'].split()))


df['common_words'] = df.apply(count_common_words, axis=1)


def calculate_word_overlap_ratio(row):
    """Jaccard similarity of the word sets of ``row['question1']`` and ``row['question2']``.

    Returns ``0.0`` when neither question contains any word, so the
    denominator is never zero.
    """
    words_a, words_b = (set(row[col].split()) for col in ('question1', 'question2'))
    n_shared = len(words_a & words_b)
    # Inclusion-exclusion: size of the union of the two word sets.
    n_distinct = len(words_a) + len(words_b) - n_shared
    return n_shared / n_distinct if n_distinct != 0 else 0.0

# Row-wise application of the guarded overlap helper.
df['word_overlap_ratio'] = df.apply(calculate_word_overlap_ratio, axis=1)


# Preview the first rows of the hand-crafted features.
feature_cols = ['q1_length', 'q2_length', 'q1_word_count', 'q2_word_count', 'common_words', 'word_overlap_ratio']
print(df[feature_cols].head())

Empty question1 rows: 0
Empty question2 rows: 0
   q1_length  q2_length  q1_word_count  q2_word_count  common_words  \
0         66         57             14             12            10   
1         51         88              8             13             4   
2         73         59             14             10             4   
3         50         65             11              9             0   
4         76         39             13              7             2   

   word_overlap_ratio  
0            0.769231  
1            0.250000  
2            0.200000  
3            0.000000  
4            0.111111  


In [None]:
# Character length of each question.
df['q1_length'] = df['question1'].apply(len)
df['q2_length'] = df['question2'].apply(len)

# Number of whitespace-separated tokens in each question.
df['q1_word_count'] = df['question1'].apply(lambda x: len(x.split()))
df['q2_word_count'] = df['question2'].apply(lambda x: len(x.split()))


# Count of distinct words that appear in both questions.
df['common_words'] = df.apply(lambda row: len(set(row['question1'].split()).intersection(set(row['question2'].split()))), axis=1)

# Use the guarded helper defined in an earlier cell instead of an inline
# division: the inline form raises ZeroDivisionError whenever both questions
# have no tokens, and would overwrite the safely computed ratio.
df['word_overlap_ratio'] = df.apply(calculate_word_overlap_ratio, axis=1)

print(df[['q1_length', 'q2_length', 'q1_word_count', 'q2_word_count', 'common_words', 'word_overlap_ratio']].head())

   q1_length  q2_length  q1_word_count  q2_word_count  common_words  \
0         41         35              7              6             5   
1         31         67              4              9             2   
2         44         36              6              5             2   
3         21         40              3              5             0   
4         60         29             10              5             2   

   word_overlap_ratio  
0            0.833333  
1            0.222222  
2            0.222222  
3            0.000000  
4            0.153846  


TEXT VECTORIZATION USING TF-IDF

In [None]:
# One shared corpus: every question1 followed by every question2.
questions = pd.concat([df['question1'], df['question2']]).tolist()

# Learn a single vocabulary (capped at 5000 terms) over both columns.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split performed in a later cell, so held-out text influences the
# vocabulary and IDF weights - confirm this is acceptable.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(questions)

X_train_q1 = vectorizer.transform(df['question1'])
X_train_q2 = vectorizer.transform(df['question2'])

# Column-wise stack: TF-IDF of question1, TF-IDF of question2, then the
# hand-crafted numeric features.
handcrafted_cols = ['q1_length', 'q2_length', 'q1_word_count', 'q2_word_count', 'common_words', 'word_overlap_ratio']
X = hstack((X_train_q1, X_train_q2, df[handcrafted_cols]))

# Target label for each question pair.
y = df['is_duplicate']

Split the data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

XGBoost classifier


In [None]:
# Gradient-boosted tree classifier: 100 trees of depth 5, learning rate 0.1,
# with a fixed seed.
model = XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=5)

# The trailing semicolon stops Jupyter from rendering the returned estimator.
# With the installed xgboost / scikit-learn combination that rich display
# appears to be what raises "AttributeError: 'super' object has no attribute
# '__sklearn_tags__'" (the plain-text repr still prints), so training itself
# is unaffected.
model.fit(X_train, y_train);

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)

In [None]:
# Upgrade both libraries to their latest releases (no versions are pinned).
# NOTE(review): xgboost and scikit-learn were already imported by an earlier
# cell, so the running kernel presumably keeps the old modules until the
# runtime is restarted - confirm the versions printed below.
!pip install --upgrade xgboost scikit-learn

import xgboost
import sklearn

# Print the versions visible to this kernel.
print("XGBoost version:", xgboost.__version__)
print("Scikit-learn version:", sklearn.__version__)

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Same hyper-parameters as the earlier training cell.
model = XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=5)

model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Using BERT-style sentence embeddings for vectorization

In [None]:
# Install the sentence-transformers package (no version pinned) for the embedding cells below.
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import pandas as pd

# Start again from the raw CSV: drop pairs with a missing question and
# lower-case the text, with no other cleaning.
question_cols = ['question1', 'question2']
df = (
    pd.read_csv('/content/drive/My Drive/minor2/train.csv')
    .dropna(subset=question_cols)
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

**Text embeddings using a Sentence-BERT style model.**

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE: this rebinds the name `model`, which an earlier cell used for the
# XGBoost classifier.
model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_column(texts):
    """Encode a Series of sentences and return one embedding vector per row.

    The whole column is passed to ``model.encode`` in a single call so the
    library can batch the sentences, instead of running one forward pass per
    row via ``Series.apply``.  The result is re-attached to the input's index
    so it aligns with ``df`` even though the index has gaps after ``dropna``.
    Values may differ from per-row encoding at floating-point precision.
    """
    vectors = model.encode(texts.tolist(), show_progress_bar=True)
    return pd.Series(list(vectors), index=texts.index)


df['q1_embedding'] = embed_column(df['question1'])
df['q2_embedding'] = embed_column(df['question2'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'df' is not defined

In [None]:
# Inspect the question1 embedding column.
# NOTE(review): the saved output shows NaN in the trailing rows, so the column
# was presumably not fully populated when this cell last ran - confirm after a
# clean re-run.
print(df['q1_embedding'] )

0         [0.06814991, -0.039664146, -0.060967196, 0.007...
1         [-0.04679809, 0.15511496, -0.039200205, 0.0487...
2         [-0.028324885, 0.0372096, -0.00040038596, 0.01...
3         [0.06325339, -0.0563931, 0.045972113, 0.108220...
4         [-0.048768442, -0.02553886, -0.036212735, -0.0...
                                ...                        
404285                                                  NaN
404286                                                  NaN
404287                                                  NaN
404288                                                  NaN
404289                                                  NaN
Name: q1_embedding, Length: 404287, dtype: object


In [None]:
# Preview the first rows, including the new embedding column(s).
print(df.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  what is the step by step guide to invest in sh...   
1   1     3     4  what is the story of kohinoor (koh-i-noor) dia...   
2   2     5     6  how can i increase the speed of my internet co...   
3   3     7     8  why am i mentally very lonely? how can i solve...   
4   4     9    10  which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  \
0  what is the step by step guide to invest in sh...             0   
1  what would happen if the indian government sto...             0   
2  how can internet speed be increased by hacking...             0   
3  find the remainder when [math]23^{24}[/math] i...             0   
4            which fish would survive in salt water?             0   

                                        q1_embedding  
0  [0.06814991, -0.039664146, -0.060967196, 0.007...  
1  [-0.04679809, 0.15511496, -0.0392