<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [23]:
!pip install spacy
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy



In [24]:
# avoid decoding problems
df = pd.read_csv("train.csv")
new_df = df.sample(n=90000, random_state=1)
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [25]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [27]:
!pip install --upgrade numpy h5py spacy




In [28]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 5.6 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 5.3 MB/s eta 0:00:03
     --------- ------------------------------ 2.9/12.8 MB 5.1 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 4.6 MB/s eta 0:00:03
     ------------ --------------------------- 3.9/12.8 MB 4.0 MB/s eta 0:00:03
     -------------- ------------------------- 4.7/12.8 MB 3.9 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 MB 3.7 MB/s eta 0:00:03
     ------------------ --------------------- 6.0/12.8 MB 3.8 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 3.7 MB/s eta 0:00:02
     ---------------------- -------------

In [30]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)


100%|██████████| 404290/404290 [1:22:14<00:00, 81.93it/s]  


In [None]:
print(len(doc[0].vector))

In [31]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc1), len(doc2[0].vector)])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 404290/404290 [43:34<00:00, 154.63it/s] 


In [42]:
#preprocessed_data(Basic).csv (Simple Preprocessing Feartures)
#preprocessed_data.csv (NLP Features)
if os.path.isfile('preprocessed_data.csv'):
    dfnlp = pd.read_csv("preprocessed_data.csv",encoding='latin-1')
else:
    print("download preprocessed_data.csv from drive or run previous notebook")

if os.path.isfile('preprocessed_data(Basic).csv'):
    dfppro = pd.read_csv("preprocessed_data(Basic).csv",encoding='latin-1')
else:
    print("download preprocessed_data(Basic).csv from drive or run previous notebook")

In [43]:
dfnlp.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,398782,496695,532029,what is the best marketing automation tool for...,what is the best marketing automation tool for...,1,0.874989,0.874989,0.99998,0.99998,...,0.92307,1.0,1.0,0.0,13.0,0.855263,99,99,99,99
1,115086,187729,187730,i am poor but i want to invest what should i do,i am quite poor and i want to be very rich wh...,0,0.666644,0.499988,0.714276,0.624992,...,0.466664,1.0,1.0,3.0,13.5,0.22449,69,67,65,74
2,327711,454161,454162,i am from india and live abroad i met a guy f...,t i e t to thapar university to thapar univers...,0,0.0,0.0,0.428565,0.272725,...,0.115384,0.0,0.0,6.0,23.0,0.047619,26,29,34,43
3,367788,498109,491396,why do so many people in the u s hate the sou...,my boyfriend doesnt feel guilty when he hurts ...,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,17.0,21.5,0.050847,29,41,23,30
4,151235,237843,50930,consequences of bhopal gas tragedy,what was the reason behind the bhopal gas tragedy,0,0.749981,0.599988,0.0,0.0,...,0.33333,1.0,0.0,4.0,7.0,0.542857,55,70,48,69


In [44]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)


In [45]:
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [46]:
# dataframe of nlp features
df1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,398782,1,0.874989,0.874989,0.99998,0.99998,0.92307,0.92307,1.0,1.0,0.0,13.0,0.855263,99,99,99,99
1,115086,0,0.666644,0.499988,0.714276,0.624992,0.583328,0.466664,1.0,1.0,3.0,13.5,0.22449,69,67,65,74
2,327711,0,0.0,0.0,0.428565,0.272725,0.149999,0.115384,0.0,0.0,6.0,23.0,0.047619,26,29,34,43
3,367788,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,21.5,0.050847,29,41,23,30
4,151235,0,0.749981,0.599988,0.0,0.0,0.599988,0.33333,1.0,0.0,4.0,7.0,0.542857,55,70,48,69


In [47]:
# data before preprocessing 
df2.head()

Unnamed: 0,id,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
0,331535,26,44,5,8,2,13,0.15
1,45407,34,44,5,7,4,12,0.33
2,286200,21,38,3,6,2,9,0.22
3,157195,45,73,10,16,5,25,0.2
4,154346,59,64,11,12,8,23,0.35


In [48]:
# Questions 1 tfidf weighted word2vec
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-20.660633,-32.167675,2.436453,3.57874,-11.797638,-8.185505,63.807134,22.978929,2.836385,-3.285232,...,23.574626,-6.288058,-27.423591,21.784428,-29.38689,27.091443,28.375905,13.574432,-25.17661,40.798094
1,-29.945757,-55.347846,20.210478,43.394144,-3.518035,-22.16162,1.259152,23.305232,8.090744,-48.588328,...,4.270028,6.545918,-57.771808,6.523892,-2.134847,8.60494,20.239938,6.12938,-41.767028,38.066482
2,-26.433151,-7.26037,9.025158,13.887715,10.153314,-2.930555,47.487029,-9.28286,-7.071854,4.644652,...,23.804185,-2.787363,-22.982258,16.821755,-18.194621,0.158007,15.18603,7.984229,-16.467006,11.585925
3,24.912692,-21.611806,7.198035,-13.986901,-26.444346,-11.26017,-12.319716,-18.884277,-11.414454,14.632624,...,-24.02015,-9.783607,18.179139,44.030907,43.931766,-19.512516,26.911838,-7.78919,5.21008,12.931108
4,-23.205631,-69.044186,-9.514707,21.033791,-14.763524,-22.741932,49.175217,76.367653,4.505282,-25.713991,...,-0.240367,-21.056331,-67.267107,17.886653,-9.376376,28.389637,32.194398,45.022742,9.512706,66.023441


In [49]:
# Questions 2 tfidf weighted word2vec
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-18.132199,-26.414625,6.35725,-2.776116,-10.028855,-2.629893,55.509494,17.499786,-4.939512,-1.176749,...,24.551529,-7.684039,-24.883157,6.526309,-24.260105,24.287874,23.2071,21.139578,-18.450893,36.307079
1,-32.206099,-68.998,19.817885,29.907963,-28.345223,-30.291167,-5.844764,25.340969,12.290251,-36.099917,...,24.103431,-11.546865,-62.848033,15.441649,-9.592031,16.754833,26.424757,28.875602,-23.002501,16.389977
2,0.1317,-21.840135,22.282422,-0.836112,-11.782656,-5.972448,22.497116,-1.457768,-10.761684,-4.84484,...,39.676084,0.28984,-21.327892,31.24426,-15.868775,10.563363,-9.109159,19.774183,-11.0131,28.307957
3,-2.388362,-9.13774,2.312735,3.684137,-28.42486,-15.988031,4.731862,-10.06229,-12.901812,-12.599918,...,11.053748,-5.903925,0.67717,-7.418555,-5.314642,-12.190648,-3.195339,8.207225,6.415468,9.979167
4,-19.472629,-18.92632,10.867051,-5.542326,-19.215247,-11.929669,28.416747,6.577728,-0.498996,-1.649447,...,7.9198,-5.115303,-11.169792,-4.767877,-16.489245,13.219784,0.407731,17.40829,-16.857232,22.797969


In [50]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 8
Number of features in question1 w2v  dataframe : 96
Number of features in question2 w2v  dataframe : 96
Number of features in final dataframe  : 217


In [74]:

if not os.path.isfile('final_features.csv'):

    # 1. Ensure all dfs only keep ids that exist in df1
    valid_ids = df1['id'].unique()

    df2 = df2[df2['id'].isin(valid_ids)].drop_duplicates(subset=['id'])
    df3_q1 = df3_q1[df3_q1['id'].isin(valid_ids)].drop_duplicates(subset=['id'])
    df3_q2 = df3_q2[df3_q2['id'].isin(valid_ids)].drop_duplicates(subset=['id'])

    # 2. Debug shapes after cleaning
    print("Shapes after aligning IDs:")
    print("df1:", df1.shape)
    print("df2:", df2.shape)
    print("df3_q1:", df3_q1.shape)
    print("df3_q2:", df3_q2.shape)

    # 3. Merge df1 + df2 (basic + NLP features etc.)
    df1 = df1.merge(df2, on='id', how='left', suffixes=('_main', '_df2'))

    # 4. Merge df3_q1 + df3_q2 (question features)
    df_q = df3_q1.merge(df3_q2, on='id', how='left', suffixes=('_q1', '_q2'))

    # 5. Merge everything
    result = df1.merge(df_q, on='id', how='left')

    # 6. Print final shape & save
    print("✅ Final merged shape:", result.shape)
    result.to_csv('final_features.csv', index=False)
    print("✅ final_features.csv saved successfully!")

else:
    print("final_features.csv already exists.")


Shapes after aligning IDs:
df1: (90000, 806)
df2: (90000, 193)
df3_q1: (90000, 97)
df3_q2: (90000, 97)
✅ Final merged shape: (90000, 1190)
✅ final_features.csv saved successfully!


In [75]:

print(result.shape)

(90000, 1190)


In [72]:
result.head()


Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,...,86_q2_y,87_q2_y,88_q2_y,89_q2_y,90_q2_y,91_q2_y,92_q2_y,93_q2_y,94_q2_y,95_q2_y
0,398782,1,0.874989,0.874989,0.99998,0.99998,0.92307,0.92307,1.0,1.0,...,24.551529,-7.684039,-24.883158,6.526309,-24.260105,24.287874,23.2071,21.139578,-18.450893,36.307079
1,115086,0,0.666644,0.499988,0.714276,0.624992,0.583328,0.466664,1.0,1.0,...,24.103432,-11.546865,-62.848034,15.441649,-9.592031,16.754833,26.424757,28.875603,-23.002501,16.389977
2,327711,0,0.0,0.0,0.428565,0.272725,0.149999,0.115384,0.0,0.0,...,39.676083,0.28984,-21.327892,31.244261,-15.868775,10.563363,-9.109159,19.774183,-11.013101,28.307957
3,367788,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.053748,-5.903925,0.67717,-7.418555,-5.314642,-12.190648,-3.195339,8.207225,6.415468,9.979167
4,151235,0,0.749981,0.599988,0.0,0.0,0.599988,0.33333,1.0,0.0,...,7.9198,-5.115303,-11.169791,-4.767877,-16.489244,13.219784,0.407731,17.408291,-16.857231,22.79797


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix


In [None]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Initialize Random Forest
rf = RandomForestClassifier(
    n_estimators=200,   # number of trees
    max_depth=10,       # max depth of each tree
    random_state=42,
    n_jobs=-1
)

# Train
rf.fit(X_train, y_train)

# Predict
y_pred_rf = rf.predict(X_test)

# Metrics
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest F1 Score:", f1_score(y_test, y_pred_rf))
print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


In [None]:
# Initialize XGBoost
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

# Train
xgb.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb.predict(X_test)

# Metrics
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost F1 Score:", f1_score(y_test, y_pred_xgb))
print("XGBoost ROC-AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
