## ***Featurizing the Text Data with TF-IDF weighted Word Vectors.***

In [None]:
#importing all the relevant modules
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy
from prettytable import PrettyTable

In [None]:
#mounting Google Drive inside the Colab runtime at /content/gdrive so that files kept on the drive can be read
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
cd gdrive/My\ Drive/Colab Notebooks

In [None]:
#reading the training data from the current working directory and reporting how many rows it holds
df = pd.read_csv("Quora_Train_Dataset.csv")
n_data_points = df.shape[0]
print("Number of data points : ", n_data_points)

Number of data points :  404290


In [None]:
#forcing both question columns to plain strings (missing values become the text 'nan')
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(str)

In [None]:
#displaying the first 3 rows of the DataFrame
df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [None]:
#Pool every question (question1 followed by question2) into one corpus and fit TF-IDF on it
questions = df['question1'].tolist() + df['question2'].tolist()
#lowercase=False keeps the original casing of each term in the vocabulary
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

<808580x109679 sparse matrix of type '<class 'numpy.float64'>'
	with 8146555 stored elements in Compressed Sparse Row format>

In [None]:
#dict key:word and value:IDF weight learned by the fitted vectorizer
#get_feature_names() was removed in scikit-learn 1.2; use its replacement when
#available and fall back to the old name on older installations
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()
word2tfidf = dict(zip(vocabulary_terms, tfidf.idf_))

***Steps ahead are :***



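1.   __After fitting TF-IDF, each question is converted into a single vector by weighting every token's word vector with its IDF score and combining the results (the code below sums the weighted vectors; it does not divide by the total weight).__
2.   __The token vectors come from a pre-trained pipeline that ships free with spaCy. https://spacy.io/usage/vectors-similarity Note that the `en_core_web_sm` pipeline loaded below contains no static (GloVe-style) word vectors, so `token.vector` falls back to 96-dimensional context tensors; load `en_core_web_md` or `en_core_web_lg` to use true pre-trained word vectors.__
3.   __GloVe-style vectors are trained on large corpora such as Wikipedia and are therefore strong in terms of word semantics.__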

In [None]:
#using spacy library to turn every question 1 into one IDF-weighted vector
nlp   = spacy.load('en_core_web_sm')

# Probe the pipeline once for the token-vector width so that every question,
# including one that tokenizes to nothing, yields a vector of the same length.
vector_dim = len(nlp('word')[0].vector)

q1_texts = list(df['question1'])
vecs1    = []
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp() once per question.
for doc1 in tqdm(nlp.pipe(q1_texts), total=len(q1_texts)):
    # Running sum of (token vector * IDF weight). This is a weighted SUM, not
    # a normalised average; it matches the previously generated features up
    # to floating-point rounding.
    weighted_sum1 = np.zeros(vector_dim)
    for word1 in doc1:
        # tokens absent from the fitted TF-IDF vocabulary contribute nothing
        idf = word2tfidf.get(word1.text, 0)
        weighted_sum1 += word1.vector * idf
    vecs1.append(weighted_sum1)
df['q1_feats_m'] = list(vecs1)

100%|██████████| 404290/404290 [1:10:04<00:00, 96.16it/s]


In [None]:
#using spacy library to turn every question 2 into one IDF-weighted vector
# Probe the pipeline once for the token-vector width so that every question,
# including one that tokenizes to nothing, yields a vector of the same length.
vector_dim = len(nlp('word')[0].vector)

q2_texts = list(df['question2'])
vecs2    = []
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp() once per question.
for doc2 in tqdm(nlp.pipe(q2_texts), total=len(q2_texts)):
    # Running sum of (token vector * IDF weight). This is a weighted SUM, not
    # a normalised average; it matches the previously generated features up
    # to floating-point rounding.
    weighted_sum2 = np.zeros(vector_dim)
    for word2 in doc2:
        # tokens absent from the fitted TF-IDF vocabulary contribute nothing
        idf = word2tfidf.get(word2.text, 0)
        weighted_sum2 += word2.vector * idf
    vecs2.append(weighted_sum2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 404290/404290 [1:10:47<00:00, 95.17it/s]


***Loading the CSV files created by the feature engineering in the earlier notebooks***


In [None]:
#verifying that both feature files are present in the working directory before loading them
required_files = ['nlp_features_train.csv', 'df_fe_without_preprocessing_train.csv']
missing_files  = [name for name in required_files if not os.path.isfile(name)]
if missing_files:
    # Fail fast: merely printing a hint would leave dfnlp/dfppro undefined (or
    # stale from an earlier run) and the next cell would fail confusingly.
    raise FileNotFoundError(
        "Kindly download the file : " + ", ".join(missing_files)
        + " from the Github or run previous notebooks again"
    )

dfnlp  = pd.read_csv("nlp_features_train.csv",encoding='latin-1')
dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')

In [None]:
#dropping the question-id and raw-text columns (plus the label where df1 already carries it)
text_and_qid_cols = ['qid1','qid2','question1','question2']
df1 = dfnlp.drop(columns=text_and_qid_cols)
df2 = dfppro.drop(columns=text_and_qid_cols + ['is_duplicate'])
df3 = df.drop(columns=text_and_qid_cols + ['is_duplicate'])
#expanding each stored question vector into one column per dimension, keeping df3's row index
df3_q1 = pd.DataFrame(df3['q1_feats_m'].values.tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].values.tolist(), index=df3.index)

In [None]:
#Reporting the (rows, columns) shape of every intermediate table
shape_report = [
    ('Shape of df1    :', df1),
    ('Shape of df2    :', df2),
    ('Shape of df3    :', df3),
    ('Shape of df3_q1 :', df3_q1),
    ('Shape of df3_q2 :', df3_q2),
]
for shape_label, frame in shape_report:
    print(shape_label, frame.shape)

Shape of df1    : (404290, 28)
Shape of df2    : (404290, 12)
Shape of df3    : (404290, 3)
Shape of df3_q1 : (404290, 96)
Shape of df3_q2 : (404290, 96)


In [None]:
#first 3 rows of df1: the NLP feature table (dfnlp without the qid and question text columns)
df1.head(3)

Unnamed: 0,id,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,4,1,51,88,8,13,4.0,20.0,0.2,5,3,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,1,1,73,59,14,10,4.0,24.0,0.166667,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667


In [None]:
#first 3 rows of df2: the features read from df_fe_without_preprocessing_train.csv, minus the qid, question text and label columns
df2.head(3)

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0


In [None]:
#first 3 rows of the question 1 vectors (q1_feats_m) expanded to one column per dimension
df3_q1.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,-6.179507,37.450731,-67.929894,32.224274,143.348826,135.374574,17.865208,54.562352,81.618936,232.909839,27.167002,-6.18722,41.996069,-103.537911,-6.616976,-94.064813,-31.617267,-29.20589,-39.608413,30.163852,67.357976,56.346944,7.606743,20.720049,-56.899175,-41.419606,30.634747,-20.667785,93.049776,-108.506989,134.494691,-69.708587,57.89103,-94.501012,-34.043306,-88.04459,197.875458,-56.918224,-33.153865,-68.587951,...,114.638685,50.816876,-57.381595,106.554903,98.56738,-67.311119,88.728154,48.158799,-12.962363,-108.238607,77.74089,-76.930486,-55.488544,55.403322,-134.602412,15.887729,-14.8127,-43.172916,-71.213891,66.473449,138.86432,-134.555638,11.072384,11.770594,23.349589,-83.244351,-55.722126,-21.89321,8.263763,-14.915859,-71.834689,-60.222858,-22.026407,103.33672,-68.477445,-54.976584,-67.802663,116.269999,60.515897,-12.245916
1,9.236668,-80.371416,-45.785907,78.291656,183.568221,100.894077,74.344804,48.360802,127.297421,112.987302,73.449294,-47.164479,31.56061,-77.927155,-103.803116,-78.503396,11.997354,73.522302,-3.368269,-61.134586,1.137853,-47.250332,-14.74723,-111.214488,-92.936345,-86.908993,26.479862,-125.349442,53.630314,-118.244944,-19.311728,4.184269,37.147654,-32.207492,54.693156,-146.564649,190.351235,-50.185377,-102.660014,-62.070884,...,69.116646,123.042798,2.978409,-52.171842,-77.587376,-43.825259,-2.570918,25.001977,74.528037,55.402768,53.763126,-27.6213,-62.024373,67.296625,-127.6723,5.919581,51.888908,5.646263,-52.011487,-22.036677,150.269148,-19.587007,-46.445237,-12.489538,104.02092,-83.863392,-11.340908,26.33933,16.131961,137.782501,-32.130515,-98.080325,19.11379,-20.507508,-76.981011,82.665075,41.085582,129.377781,115.868467,4.383543
2,97.546829,22.972195,-39.558378,18.723416,56.92862,48.307643,8.719268,36.893737,106.899948,226.28308,-5.023025,-68.019793,-52.865684,-112.794187,49.411453,4.386397,-23.508985,3.867846,-19.287308,-52.332033,59.900114,102.718565,-6.070379,-47.957186,-51.316635,26.235306,-71.29717,-82.694041,61.166595,-59.66162,85.192997,95.725261,-9.78694,-44.572019,-26.737537,-8.913334,67.037574,-85.818559,-137.332815,-58.704829,...,126.969506,32.991508,11.751381,32.92163,127.876085,-117.452091,-53.697893,116.44984,-7.114503,-28.417017,22.236688,-95.3655,-13.684289,67.952175,-19.670386,0.049363,63.119457,21.524016,-36.342319,7.531785,101.659882,-90.213289,-11.944407,-20.185547,16.732628,9.608647,-77.976104,77.923826,59.257286,-78.643996,-66.835015,87.592131,4.032431,56.851709,-43.62541,-57.580963,-50.425829,78.591986,105.714348,-33.304161


In [None]:
#first 3 rows of the question 2 vectors (q2_feats_m) expanded to one column per dimension
df3_q2.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,-14.616981,59.755488,-53.263745,19.514497,113.916473,101.657056,8.561499,66.232769,32.888127,210.812733,-14.521656,5.732026,40.648458,-95.787979,-1.157125,-66.563451,-31.051373,-17.968147,-30.063745,32.889709,65.43054,54.742656,1.756501,27.786168,-46.373962,-25.822312,51.231668,-20.655695,98.152685,-96.04839,126.935466,-28.066691,82.523253,-91.751655,-24.696691,-67.690101,171.840663,-62.797236,-22.777055,-60.322768,...,120.843786,26.592192,-50.285062,114.594113,100.273874,-72.797047,85.467161,42.393031,-18.239859,-103.664753,68.317974,-46.44902,-60.184247,47.26953,-113.987847,10.32098,0.135103,-29.869163,-78.649177,77.262293,131.28716,-123.969027,20.822789,20.718112,38.093118,-85.086696,-61.891319,3.91541,-58.229361,-49.735812,-72.266625,-37.072086,-31.14273,94.064854,-45.053242,-34.155221,-76.548099,99.282776,50.791731,-17.566246
1,-3.565742,-16.844571,-130.911785,0.320254,79.350278,23.562028,79.124551,84.119839,128.684135,279.539877,51.329233,-47.977027,124.3227,-106.766716,-35.756386,-119.277429,21.692362,79.32057,-30.87448,-86.451965,-75.311001,-10.50403,-54.459205,-82.283126,-122.537507,-23.89715,38.062149,-120.668866,96.555255,-172.800629,71.174992,52.39928,-3.790546,-88.446559,-1.359644,-169.229485,242.162139,-106.348765,-118.454241,-81.356276,...,190.3483,114.510458,-79.321841,-65.28425,-69.147191,-111.427223,18.650104,15.329342,43.115542,31.039919,49.603525,-26.932295,-54.213027,98.597791,-72.84859,87.272756,2.56324,-12.326506,-50.862417,19.058227,180.08569,-79.230389,-49.612403,-2.875509,12.640813,-205.569995,-46.432224,86.927369,2.640828,-2.350201,6.193171,-65.084229,-15.654534,-3.475828,26.999802,170.172613,-57.038953,194.269546,128.207803,55.490061
2,156.83363,59.991896,-8.414311,29.251426,133.680218,112.457566,89.849781,21.613022,24.331766,171.11449,-104.683288,-27.20206,-45.827002,-96.844532,-12.986917,43.818585,20.79344,15.393396,0.015373,-28.024086,-12.633978,11.869804,-52.47816,-3.826736,8.331102,21.353006,-36.579113,-14.109747,43.868416,-107.13366,125.65421,-0.800086,39.060021,-50.904144,37.683092,-78.781466,122.383833,-67.52498,-107.706957,-19.469239,...,13.483461,-63.411094,40.414527,37.97131,110.4636,-76.722634,-13.193802,38.792827,-49.570232,-142.726638,12.245693,-54.343166,-60.057482,102.482628,3.65634,-21.436032,37.960952,-25.729971,23.332562,9.404027,153.432356,-76.191487,-20.813622,-27.2691,11.735237,-20.692981,-47.580536,42.119569,49.291951,-59.756461,-26.185226,-19.283218,75.602438,24.144027,-91.874398,-178.454113,-91.471482,19.922719,21.26669,49.574858


### ***Number of features in each of the DataFrames***

In [None]:
#summarising the column count of every feature table, followed by their plain sum
x = PrettyTable()
x.field_names = ["Name of Table", "Number of Features"]
feature_counts = [
    ('Nlp Dataframe', df1.shape[1]),
    ('Preprocessed Dataframe', df2.shape[1]),
    ('Question1 w2v  Dataframe', df3_q1.shape[1]),
    ('Question2 w2v  Dataframe', df3_q2.shape[1]),
]
for table_name, n_features in feature_counts:
    x.add_row([table_name, n_features])
#the final figure simply adds the four counts above
x.add_row(['Final Dataframe', sum(count for table_name, count in feature_counts)])
print(x)

+--------------------------+--------------------+
|      Name of Table       | Number of Features |
+--------------------------+--------------------+
|      Nlp Dataframe       |         28         |
|  Preprocessed Dataframe  |         12         |
| Question1 w2v  Dataframe |         96         |
| Question2 w2v  Dataframe |         96         |
|     Final Dataframe      |        232         |
+--------------------------+--------------------+


### ***Creating the Final CSV file with all the features extracted***

In [None]:
#storing the final features to csv file (skipped when final_features.csv already exists)
if not os.path.isfile('final_features.csv'):
    print('Generating the Final Features.csv')
    # Attach the row id to copies of the vector tables so they can be merged on it.
    # The inputs (df1, df2, df3_q1, df3_q2) are deliberately left untouched, so
    # this cell and the cells above give the same results if they are re-run.
    q1_vectors = df3_q1.assign(id=df1['id'])
    q2_vectors = df3_q2.assign(id=df1['id'])
    basic_features  = df1.merge(df2, on='id',how='left')
    # both vector tables use the same integer column labels, so pandas
    # disambiguates them with its default _x / _y suffixes
    vector_features = q1_vectors.merge(q2_vectors, on='id',how='left')
    result  = basic_features.merge(vector_features, on='id',how='left')
    result.to_csv('final_features.csv')

Generating the Final Features.csv


### ***-- End of Notebook 3 --***