# 5. Featurizing text data with tfidf weighted word-vectors

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import time
from fuzzywuzzy import fuzz
# Import the Required lib packages for WORD-Cloud generation
# https://stackoverflow.com/questions/45625434/how-to-install-wordcloud-in-python3-6
from wordcloud import WordCloud, STOPWORDS
from os import path
from PIL import Image

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os 
from tqdm import tqdm

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy



In [2]:
# Load the raw question-pair training set.
data1 = pd.read_csv('E:\\ML\\IPYTHON\\Module 6\\Chapter 1 Case study 1- Quora ques\\41.2 Bussiness, Real world problem-problem def\\train.csv')

# Coerce both question columns to str so that non-string entries
# (e.g. missing values) can be passed to the text pipeline further down.
data1 = data1.assign(
    question1=data1['question1'].apply(str),
    question2=data1['question2'].apply(str),
)

In [3]:
# Preview the first rows to confirm the question columns and the label loaded as expected.
data1.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# merge texts
questions = list(data1['question1']) + list(data1['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

- After we find the TF-IDF scores, we convert each question to a weighted combination of its word vectors, using these scores as the weights.
- Here we use a pre-trained GloVe model which comes free with spaCy: https://spacy.io/usage/vectors-similarity. Note that the next cell loads `en_core_web_sm`, whose vectors are 96-dimensional in the outputs below.
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [5]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(data1['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
data1['q1_feats_m'] = list(vecs1)

100%|██████████| 404290/404290 [1:11:39<00:00, 94.03it/s] 


In [7]:
# Vector width of the loaded pipeline, probed once so that a question with no
# tokens still maps to an all-zero vector instead of raising IndexError.
vec_dim2 = len(nlp('probe')[0].vector)

vecs2 = []
for qu2 in tqdm(list(data1['question2'])):
    doc2 = nlp(qu2)
    # Accumulate the IDF-weighted SUM of the token vectors of question2.
    # The accumulator used to be sized with len(doc1), a leftover variable
    # from the question1 cell: that fails on a fresh kernel and produces NaN
    # if that leftover doc is empty. Sizing depends only on the pipeline now.
    mean_vec2 = np.zeros(vec_dim2)
    for word2 in doc2:
        # token vector
        vec2 = word2.vector
        # IDF weight; tokens missing from the TF-IDF vocabulary contribute nothing
        idf = word2tfidf.get(str(word2), 0)
        # add this token's weighted vector
        mean_vec2 += vec2 * idf
    vecs2.append(mean_vec2)
data1['q2_feats_m'] = list(vecs2)

100%|██████████| 404290/404290 [1:12:35<00:00, 92.82it/s] 


In [8]:
# Confirm that both per-question vector columns (q1_feats_m, q2_feats_m) were attached.
data1.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats_m,q2_feats_m
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[79.07837373018265, 15.782018959522247, 37.059...","[65.80132514238358, 15.163416892290115, 28.238..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[18.990903168916702, 48.39012713730335, 14.231...","[21.181431472301483, 44.1483453810215, -5.6844..."


In [9]:
# Row/column count after adding the two vector columns; useful as a reference for the merge further down.
data1.shape

(404290, 8)

In [11]:
# Load the previously engineered feature table (token/fuzzy features such as
# ctc_max, fuzz_ratio, ...), presumably written by an earlier notebook - confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
dfnlp = pd.read_pickle('adv_fe_data')

In [12]:
# Hand-crafted features: strip the question ids and the raw text.
df1 = dfnlp.drop(columns=['qid1', 'qid2', 'question1', 'question2'])

# Vector features: same, and the label is dropped too because df1 already has it.
df3 = data1.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])

# Expand each per-row vector into one column per dimension, keeping df3's index.
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [13]:
# question1 vectors, expanded to one numeric column per vector dimension.
df3_q1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,79.078374,15.782019,37.05994,-28.544872,4.867482,16.195759,-23.889907,19.217555,45.637698,-44.84445,...,-15.927717,-34.502122,-37.652304,-24.222525,-6.814602,1.565476,0.176568,-12.006138,-13.089401,-2.580176
1,18.990903,48.390127,14.231475,-12.000782,-2.324469,-20.050934,-16.054571,-15.817222,3.254204,-39.863552,...,-1.317567,-34.506608,-42.9343,-23.459032,1.949568,45.108892,43.110111,-36.80386,-8.712242,-22.469977


In [14]:
# question2 vectors, expanded to one numeric column per vector dimension.
df3_q2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,65.801325,15.163417,28.238274,-22.443842,-1.128907,14.04498,-19.552754,23.298152,33.598356,-34.684082,...,-13.24054,-35.406901,-33.057641,-22.903015,-6.279673,-3.470418,-12.132645,-8.762915,-16.139727,-8.468419
1,21.181431,44.148345,-5.684423,-28.517999,-30.621333,7.486887,-16.820571,3.151194,11.878593,-13.489614,...,-37.242324,-31.066498,-45.401839,-23.039135,-5.305946,19.393062,16.864331,-17.193637,19.457991,-29.883955


In [22]:
# Inspect the loaded feature table, which still contains the ids and question text.
dfnlp.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,1,1,66,57,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759


In [20]:
# Same table after dropping qid1/qid2/question1/question2; 'id' is kept as the join key.
df1.head(2)

Unnamed: 0,id,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,1,1,66,57,14,12,10.0,23.0,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,4,1,51,88,8,13,4.0,20.0,...,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


In [17]:
# Attach the join key from df3, i.e. from the same frame (data1) the vectors
# were built from, so every vector row is guaranteed to carry its own id.
# The key used to be copied from df1 (derived from dfnlp) and aligned by index;
# the recorded shapes show that table has a different row count than data1, so
# that alignment could leave rows with NaN or mismatched ids.
df3_q1['id'] = df3['id']
df3_q2['id'] = df3['id']

In [23]:
# Put the question1 and question2 vector columns side by side, keyed on 'id'.
# Clashing column names get pandas' default '_x' / '_y' suffixes.
df2 = pd.merge(df3_q1, df3_q2, how='left', on='id')

# Append the vector columns to the hand-crafted features; the left join keeps
# every row of df1.
result = pd.merge(df1, df2, how='left', on='id')

In [24]:
# Sanity-check the merged table. NOTE(review): the recorded run shows 404287 rows,
# three fewer than data1's 404290 - confirm that the missing rows are intentional.
result.shape

(404287, 220)

In [25]:
# Preview the final feature table; the '_y' columns shown come from the right-hand (question2) frame of the merge.
result.head(2)

Unnamed: 0,id,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,0,0,1,1,66,57,14,12,10.0,23.0,...,-13.24054,-35.406901,-33.057641,-22.903015,-6.279673,-3.470418,-12.132645,-8.762915,-16.139727,-8.468419
1,1,0,4,1,51,88,8,13,4.0,20.0,...,-37.242324,-31.066498,-45.401839,-23.039135,-5.305946,19.393062,16.864331,-17.193637,19.457991,-29.883955


In [28]:
# Persist the merged feature table as CSV (note: the file name has no extension).
# The DataFrame index is written as well, because index=False is not passed.
result.to_csv('ready_to_model')