<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# avoid decoding problems
df = pd.read_csv("train.csv")
 
# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [3]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [5]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)


100%|██████████| 404290/404290 [1:11:50<00:00, 93.78it/s] 


In [6]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 404290/404290 [1:11:39<00:00, 94.04it/s] 


In [5]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")

In [7]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
#df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
#df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [8]:
# dataframe of nlp features
df1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,28,24,9,14,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,35,56,0.175


In [9]:
# data before preprocessing 
df2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.0,2,0
4,4,3,1,76,39,13,7,2.0,20.0,0.1,4,2


In [10]:
# Questions 1 tfidf weighted word2vec
df3_q1.head()

NameError: name 'df3_q1' is not defined

In [11]:
# Questions 2 tfidf weighted word2vec
df3_q2.head()

NameError: name 'df3_q2' is not defined

In [14]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
#print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
#print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1])#+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in final dataframe  : 29


In [16]:
df1

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.999980,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667
3,3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2.0,12.0,28,24,9,14,0.039216
4,4,0,0.399992,0.199998,0.999950,0.666644,0.571420,0.307690,0.0,1.0,6.0,10.0,67,47,35,56,0.175000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,0,0.857131,0.857131,0.999980,0.833319,0.846147,0.785709,1.0,1.0,1.0,13.5,97,88,91,87,0.387500
404286,404286,1,0.666644,0.666644,0.599988,0.599988,0.624992,0.555549,1.0,0.0,1.0,8.5,79,69,72,76,0.642857
404287,404287,0,0.999900,0.499975,0.999950,0.666644,0.749981,0.749981,1.0,1.0,0.0,4.0,86,79,79,76,0.444444
404288,404288,0,0.000000,0.000000,0.124998,0.099999,0.058823,0.040000,0.0,0.0,8.0,21.0,37,35,30,34,0.052632


In [24]:
df2

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.200000,5,3
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.000000,2,0
4,4,3,1,76,39,13,7,2.0,20.0,0.100000,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,2,2,85,79,14,13,11.0,25.0,0.440000,4,0
404286,404286,12,1,41,42,8,9,5.0,16.0,0.312500,13,11
404287,404287,1,1,17,17,4,3,1.0,7.0,0.142857,2,0
404288,404288,1,1,94,127,17,25,1.0,40.0,0.025000,2,0


In [18]:
# storing the final features to csv file
if not os.path.isfile('ffinal_features.csv'):
    #df3_q1['id']=df1['id']
    #df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    #df2  = df3_q1.merge(df3_q2, on='id',how='left')
    #result  = df1.merge(df2, on='id',how='left')
    df1.to_csv('ffinal_features.csv')

In [15]:
'''# storing the final features to csv file
if not os.path.isfile('final_features.csv'):
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('final_features.csv')'''

In [19]:
ff=pd.read_csv('final_features.csv')

In [20]:
ff

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,0,0,0,0.999980,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,...,-72.266625,-37.072086,-31.142730,94.064854,-45.053242,-34.155221,-76.548099,99.282776,50.791731,-17.566246
1,1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,...,6.193171,-65.084229,-15.654534,-3.475828,26.999802,170.172613,-57.038953,194.269546,128.207803,55.490061
2,2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,...,-26.185226,-19.283218,75.602438,24.144027,-91.874398,-178.454113,-91.471482,19.922719,21.266690,49.574858
3,3,3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,-17.779019,30.152297,49.300137,27.783795,25.937188,-32.107076,-3.817634,-14.231000,4.772115,7.711628
4,4,4,0,0.399992,0.199998,0.999950,0.666644,0.571420,0.307690,0.0,...,36.089472,47.193216,-49.969586,44.796028,39.740803,-33.763309,-98.282341,22.118795,68.802072,21.025373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,404285,0,0.857131,0.857131,0.999980,0.833319,0.846147,0.785709,1.0,...,-1.453021,-60.300123,-74.572281,72.555646,17.551394,23.337234,88.163201,32.652129,139.499952,-13.953555
404286,404286,404286,1,0.666644,0.666644,0.599988,0.599988,0.624992,0.555549,1.0,...,-10.365770,2.831673,-35.364427,29.808441,-71.090092,-30.439194,-29.243593,41.660253,76.917245,-4.642743
404287,404287,404287,0,0.999900,0.499975,0.999950,0.666644,0.749981,0.749981,1.0,...,0.533134,13.967822,16.657529,-9.362394,-15.079316,-28.031886,18.059320,15.880986,54.864010,21.838973
404288,404288,404288,0,0.000000,0.000000,0.124998,0.099999,0.058823,0.040000,0.0,...,8.017589,-4.186200,-20.924122,130.719102,-138.362690,-129.146722,-184.949646,181.573884,83.035827,35.530677


In [21]:
fff=pd.read_csv('ffinal_features.csv')

In [23]:
fff

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,0,0,0.999980,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,...,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,...,1,51,88,8,13,4.0,20.0,0.200000,5,3
2,2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,...,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,1,50,65,11,9,0.0,19.0,0.000000,2,0
4,4,4,0,0.399992,0.199998,0.999950,0.666644,0.571420,0.307690,0.0,...,1,76,39,13,7,2.0,20.0,0.100000,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,404285,0,0.857131,0.857131,0.999980,0.833319,0.846147,0.785709,1.0,...,2,85,79,14,13,11.0,25.0,0.440000,4,0
404286,404286,404286,1,0.666644,0.666644,0.599988,0.599988,0.624992,0.555549,1.0,...,1,41,42,8,9,5.0,16.0,0.312500,13,11
404287,404287,404287,0,0.999900,0.499975,0.999950,0.666644,0.749981,0.749981,1.0,...,1,17,17,4,3,1.0,7.0,0.142857,2,0
404288,404288,404288,0,0.000000,0.000000,0.124998,0.099999,0.058823,0.040000,0.0,...,1,94,127,17,25,1.0,40.0,0.025000,2,0


In [26]:
ff.columns[0:17]

Index(['Unnamed: 0', 'id', 'is_duplicate', 'cwc_min', 'cwc_max', 'csc_min',
       'csc_max', 'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq',
       'abs_len_diff', 'mean_len', 'token_set_ratio', 'token_sort_ratio',
       'fuzz_ratio', 'fuzz_partial_ratio'],
      dtype='object')

In [33]:
ff.columns[17:29]

Index(['longest_substr_ratio', 'freq_qid1', 'freq_qid2', 'q1len', 'q2len',
       'q1_n_words', 'q2_n_words', 'word_Common', 'word_Total', 'word_share',
       'freq_q1+q2', 'freq_q1-q2'],
      dtype='object')

In [37]:
ff.columns[29:125]

Index(['0_x', '1_x', '2_x', '3_x', '4_x', '5_x', '6_x', '7_x', '8_x', '9_x',
       '10_x', '11_x', '12_x', '13_x', '14_x', '15_x', '16_x', '17_x', '18_x',
       '19_x', '20_x', '21_x', '22_x', '23_x', '24_x', '25_x', '26_x', '27_x',
       '28_x', '29_x', '30_x', '31_x', '32_x', '33_x', '34_x', '35_x', '36_x',
       '37_x', '38_x', '39_x', '40_x', '41_x', '42_x', '43_x', '44_x', '45_x',
       '46_x', '47_x', '48_x', '49_x', '50_x', '51_x', '52_x', '53_x', '54_x',
       '55_x', '56_x', '57_x', '58_x', '59_x', '60_x', '61_x', '62_x', '63_x',
       '64_x', '65_x', '66_x', '67_x', '68_x', '69_x', '70_x', '71_x', '72_x',
       '73_x', '74_x', '75_x', '76_x', '77_x', '78_x', '79_x', '80_x', '81_x',
       '82_x', '83_x', '84_x', '85_x', '86_x', '87_x', '88_x', '89_x', '90_x',
       '91_x', '92_x', '93_x', '94_x', '95_x'],
      dtype='object')

In [47]:
ff.columns[125:221]

Index(['0_y', '1_y', '2_y', '3_y', '4_y', '5_y', '6_y', '7_y', '8_y', '9_y',
       '10_y', '11_y', '12_y', '13_y', '14_y', '15_y', '16_y', '17_y', '18_y',
       '19_y', '20_y', '21_y', '22_y', '23_y', '24_y', '25_y', '26_y', '27_y',
       '28_y', '29_y', '30_y', '31_y', '32_y', '33_y', '34_y', '35_y', '36_y',
       '37_y', '38_y', '39_y', '40_y', '41_y', '42_y', '43_y', '44_y', '45_y',
       '46_y', '47_y', '48_y', '49_y', '50_y', '51_y', '52_y', '53_y', '54_y',
       '55_y', '56_y', '57_y', '58_y', '59_y', '60_y', '61_y', '62_y', '63_y',
       '64_y', '65_y', '66_y', '67_y', '68_y', '69_y', '70_y', '71_y', '72_y',
       '73_y', '74_y', '75_y', '76_y', '77_y', '78_y', '79_y', '80_y', '81_y',
       '82_y', '83_y', '84_y', '85_y', '86_y', '87_y', '88_y', '89_y', '90_y',
       '91_y', '92_y', '93_y', '94_y', '95_y'],
      dtype='object')

In [66]:
len(questions)

808580

In [67]:
tfidf1 = TfidfVectorizer(lowercase=False, )
qt1=tfidf1.fit(questions)

In [69]:
qt1=tfidf1.transform(list(df['question1']))

In [71]:
tfidf2 = TfidfVectorizer(lowercase=False, )
qt2=tfidf2.fit(questions)

In [72]:
qt2=tfidf2.transform(list(df['question2']))

In [None]:
qt1.todense()

In [78]:
df3t_q1 = pd.DataFrame(qt1.todense(), index= df3.index)
df3t_q2 = pd.DataFrame(qt2.todense(), index= df3.index)

In [79]:
df3_q1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,id
0,-6.179507,37.450731,-67.929894,32.224274,143.348826,135.374574,17.865208,54.562352,81.618936,232.909839,...,-60.222858,-22.026407,103.336720,-68.477445,-54.976584,-67.802663,116.269999,60.515897,-12.245916,0
1,9.236668,-80.371416,-45.785907,78.291656,183.568221,100.894077,74.344804,48.360802,127.297421,112.987302,...,-98.080325,19.113790,-20.507508,-76.981011,82.665075,41.085582,129.377781,115.868467,4.383543,1
2,97.546829,22.972195,-39.558378,18.723416,56.928620,48.307643,8.719268,36.893737,106.899948,226.283080,...,87.592131,4.032431,56.851709,-43.625410,-57.580963,-50.425829,78.591986,105.714348,-33.304161,2
3,57.586999,-22.017088,-4.599304,-88.939273,-4.732172,-54.209038,74.614942,106.533731,15.520623,39.009711,...,41.981221,-11.204984,16.833434,-36.372471,8.927573,-64.553194,95.054238,-34.157566,70.821932,3
4,83.185784,-40.506985,-83.403923,-52.648658,79.074884,-19.038248,53.728722,97.648612,160.555822,290.541356,...,109.604406,-91.160167,-25.739913,133.123058,-13.508816,-100.115211,208.424382,286.930889,68.027638,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,35.207216,-7.444454,-3.649094,92.158139,67.040696,46.545180,3.312020,-53.295562,33.575936,189.809921,...,0.890766,-39.951883,77.872958,17.322343,-13.564594,98.458753,67.328341,128.440334,48.533939,404285
404286,-16.848597,23.561238,20.006477,-60.293218,59.374161,40.213185,74.267440,47.112013,49.119540,110.631131,...,-28.796754,-36.737937,20.891737,-32.772024,-20.421748,-19.607643,40.217637,40.097715,-5.651059,404286
404287,8.987206,-14.396783,2.565405,3.296160,15.090293,13.079559,21.007719,14.718808,9.194591,19.563914,...,-3.478772,-16.427240,-24.100426,-26.393882,-41.810941,-9.529665,-10.434779,37.024693,13.106087,404287
404288,134.256928,-63.004870,-62.629736,108.006606,162.527272,151.070462,49.249909,-69.132973,51.351415,197.952878,...,10.851074,-63.608021,17.749746,-42.449423,-49.475966,37.571248,141.158775,160.326508,107.145866,404288


In [80]:
df3t_q1

Unnamed: 0,0
0,"(0, 103601)\t0.09954585521469732\n (0, 1030..."
1,"(0, 103025)\t0.07944326155338896\n (0, 1006..."
2,"(0, 107880)\t0.30756057668348236\n (0, 1061..."
3,"(0, 106679)\t0.36633877104491847\n (0, 9933..."
4,"(0, 107469)\t0.20590509446672156\n (0, 1012..."
...,...
404285,"(0, 106661)\t0.3403087952708199\n (0, 10311..."
404286,"(0, 108883)\t0.23028966139177778\n (0, 1031..."
404287,"(0, 87678)\t0.42463431043483857\n (0, 79172..."
404288,"(0, 107880)\t0.22679689555457916\n (0, 1030..."
