In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("train.csv")
df.head(3)
df.shape

(404290, 6)

In [3]:
# Taking only 30000 Rows
new_df=df.sample(30000)
new_df.isnull().sum()
new_df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
253664,253664,368248,324013,What are the best government jobs in India?,What is the best government job in India?,1
62511,62511,108937,108938,Do you think that there will be a Third World ...,Should there be a Third World War?,0


In [4]:
# Creating new Dataframe using only 'question1' and 'question2' columns
ques_df=new_df[['question1','question2']]
# ques_df.head()

In [5]:
# Dropping missing values
ques_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df.dropna(inplace=True)


In [6]:
# Applying preprocessing

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from gensim.utils import simple_preprocess
from collections import Counter

In [7]:
# Download

nltk.download('stopwords')
nltk.download('wordnet') #large lexical database of English
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Tools 

stop_words=set(stopwords.words('english'))
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [9]:
def preprocess_pipeline(text):
    if pd.isnull(text):
        return []
    # Lowercase + Tokenize + remove punctuation
    tokens = simple_preprocess(text)
    #Remove stopwords
    tokens=[word for word in tokens if word not in stop_words]
    # Replace numbers with "NUM"
    tokens=[re.sub(r'\d+','NUM',word)for word in tokens]
    # Apply stemming
    tokens=[stemmer.stem(word) for word in tokens]
    # Apply lemmatization
    tokens =[lemmatizer.lemmatize(words) for words in tokens]

    return tokens


In [10]:
# Apply preprocessing on both columns 
ques_df['q1_clean']=ques_df['question1'].apply(preprocess_pipeline)
ques_df['q2_clean']=ques_df['question2'].apply(preprocess_pipeline)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q1_clean']=ques_df['question1'].apply(preprocess_pipeline)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q2_clean']=ques_df['question2'].apply(preprocess_pipeline)


In [11]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
253664,What are the best government jobs in India?,What is the best government job in India?,"[best, govern, job, india]","[best, govern, job, india]"
62511,Do you think that there will be a Third World ...,Should there be a Third World War?,"[think, third, world, war]","[third, world, war]"
69472,Is it safe to send mobile phones across the co...,Can we send mobile phones from USA to India?,"[safe, send, mobil, phone, across, countri, us...","[send, mobil, phone, usa, india]"


In [12]:
# Each word frequency
from collections import Counter
all_words=[word for tokens in ques_df['q1_clean']+ques_df['q2_clean']for word in tokens]
word_freq=Counter(all_words)

In [13]:
word_freq

Counter({'best': 5323,
         'get': 3292,
         'like': 2174,
         'india': 2090,
         'use': 2011,
         'peopl': 1960,
         'way': 1879,
         'good': 1878,
         'differ': 1822,
         'make': 1764,
         'would': 1745,
         'one': 1645,
         'learn': 1356,
         'quora': 1355,
         'time': 1247,
         'life': 1214,
         'work': 1145,
         'money': 1096,
         'know': 1096,
         'year': 1054,
         'think': 976,
         'question': 959,
         'someon': 955,
         'thing': 954,
         'engin': 935,
         'mean': 932,
         'new': 932,
         'go': 929,
         'much': 919,
         'indian': 898,
         'find': 891,
         'book': 880,
         'becom': 865,
         'start': 857,
         'job': 851,
         'movi': 838,
         'ever': 833,
         'mani': 818,
         'day': 813,
         'trump': 801,
         'want': 799,
         'world': 787,
         'happen': 775,
         'person':

In [14]:
# Remove Rare and frequent words

def remove_extreme_words(tokens):
    filtered = []
    for word in tokens:
        if word_freq[word] > 2 and word_freq[word] < 10000:
            filtered.append(word)
    return filtered


In [15]:
# Applying Removal of extreme words
ques_df['q1_clean']=ques_df['q1_clean'].apply(remove_extreme_words)
ques_df['q2_clean']=ques_df['q2_clean'].apply(remove_extreme_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q1_clean']=ques_df['q1_clean'].apply(remove_extreme_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q2_clean']=ques_df['q2_clean'].apply(remove_extreme_words)


In [16]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
253664,What are the best government jobs in India?,What is the best government job in India?,"[best, govern, job, india]","[best, govern, job, india]"
62511,Do you think that there will be a Third World ...,Should there be a Third World War?,"[think, third, world, war]","[third, world, war]"
69472,Is it safe to send mobile phones across the co...,Can we send mobile phones from USA to India?,"[safe, send, mobil, phone, across, countri, us...","[send, mobil, phone, usa, india]"


In [17]:
# Removing very short questions (less than 3 words)
q1_valid=ques_df['q1_clean'].apply(lambda x:len(x)>2)
q2_valid=ques_df['q2_clean'].apply(lambda x: len(x)>2)
ques_df=ques_df[q1_valid&q2_valid]
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
253664,What are the best government jobs in India?,What is the best government job in India?,"[best, govern, job, india]","[best, govern, job, india]"
62511,Do you think that there will be a Third World ...,Should there be a Third World War?,"[think, third, world, war]","[third, world, war]"
69472,Is it safe to send mobile phones across the co...,Can we send mobile phones from USA to India?,"[safe, send, mobil, phone, across, countri, us...","[send, mobil, phone, usa, india]"


In [18]:
ques_df.shape # number of rows reduced from 30000 to 24758

(24768, 4)

In [19]:
from gensim.models import Word2Vec

In [20]:
# Combine all cleaned questions as a single list
sentences=list(ques_df['q1_clean']+ques_df['q2_clean'])
sentences

[['best', 'govern', 'job', 'india', 'best', 'govern', 'job', 'india'],
 ['think', 'third', 'world', 'war', 'third', 'world', 'war'],
 ['safe',
  'send',
  'mobil',
  'phone',
  'across',
  'countri',
  'use',
  'indian',
  'speed',
  'post',
  'send',
  'mobil',
  'phone',
  'usa',
  'india'],
 ['india', 'nepal', 'end', 'differ', 'type', 'psycholog', 'major'],
 ['ethernet', 'cabl', 'length', 'limit', 'ft', 'hdmi', 'cabl', 'ethernet'],
 ['end',
  'stage',
  'patient',
  'stop',
  'eat',
  'eat',
  'earli',
  'stage',
  'pregnanc'],
 ['war',
  'break',
  'india',
  'pakistan',
  'would',
  'win',
  'win',
  'war',
  'india',
  'pakistan',
  'go',
  'round',
  'indigen'],
 ['non',
  'tech',
  'busi',
  'idea',
  'mind',
  'share',
  'idea',
  'compani',
  'like',
  'appl',
  'googl',
  'share',
  'publicli'],
 ['chang', 'yahoo', 'password', 'chang', 'password', 'yahoo', 'mail'],
 ['android',
  'app',
  'similar',
  'platform',
  'similar',
  'configur',
  'develop',
  'android',
  'app'],

In [21]:
# Train Word2Vec
model =Word2Vec(sentences,vector_size=100,window=5,min_count=3,workers=4,sg=1,epochs=10)

In [22]:
# Number of Vocabulary
len(model.wv)

7501

In [23]:
# Vector representation of each word
model.wv.get_normed_vectors()

array([[ 0.06447784,  0.17990561, -0.04946036, ..., -0.22562684,
         0.05567097, -0.04871101],
       [-0.10058314, -0.02995117,  0.08447422, ..., -0.03564782,
         0.04570922, -0.00053859],
       [-0.08364807,  0.00344727,  0.0014339 , ...,  0.02590498,
         0.03389815, -0.12273193],
       ...,
       [-0.0764507 ,  0.21295878, -0.06664322, ...,  0.08533933,
         0.08892771, -0.02719401],
       [-0.06477034,  0.13549906,  0.06561577, ..., -0.01193024,
         0.04522059,  0.16964988],
       [-0.05621161,  0.12002341, -0.05459228, ..., -0.01899295,
        -0.00881455,  0.11660021]], dtype=float32)

In [24]:
# Total unique words whose vector representation was created 
model.wv.get_normed_vectors().shape

(7501, 100)

In [25]:
y=model.wv.index_to_key
y

['best',
 'get',
 'india',
 'like',
 'use',
 'peopl',
 'good',
 'way',
 'would',
 'make',
 'differ',
 'one',
 'quora',
 'learn',
 'time',
 'money',
 'year',
 'life',
 'work',
 'know',
 'question',
 'new',
 'someon',
 'go',
 'engin',
 'thing',
 'indian',
 'much',
 'think',
 'book',
 'find',
 'job',
 'start',
 'ever',
 'day',
 'mani',
 'trump',
 'want',
 'movi',
 'without',
 'world',
 'onlin',
 'becom',
 'person',
 'happen',
 'first',
 'better',
 'mean',
 'take',
 'account',
 'feel',
 'note',
 'girl',
 'possibl',
 'compani',
 'need',
 'u',
 'love',
 'univers',
 'english',
 'phone',
 'ask',
 'could',
 'live',
 'woman',
 'program',
 'chang',
 'number',
 'compar',
 'student',
 'countri',
 'answer',
 'websit',
 'app',
 'long',
 'weight',
 'donald',
 'facebook',
 'buy',
 'busi',
 'improv',
 'prepar',
 'friend',
 'languag',
 'stop',
 'googl',
 'old',
 'realli',
 'lose',
 'place',
 'import',
 'effect',
 'look',
 'war',
 'colleg',
 'see',
 'develop',
 'state',
 'servic',
 'game',
 'help',
 'come

In [26]:
# Applying dimentionality Reduction using PCA 
# So that we van have a Visual representation in 3D
from sklearn.decomposition import PCA
pca =PCA(n_components=3)
x=pca.fit_transform(model.wv.get_normed_vectors())
x.shape # Dimentionality reduced from 100 to 3 

(7501, 3)

In [27]:
# 3D representation of Words
import plotly.express as px # 3D representation
fig=px.scatter_3d(x[:100],x=0,y=1,z=2,color=y[:100]) # ploting 100 words
fig.show()

In [28]:
# Getting vector Representaion of each Question
def question_vector(tokens,w2v_model):
    vectors=[]
    for word in tokens:
        if word in w2v_model.wv:
            vectors.append(w2v_model.wv[word])

    if len(vectors)==0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vectors,axis=0)

ques_df['q1_vec']=ques_df['q1_clean'].apply(lambda x: question_vector(x,model))
ques_df['q2_vec']=ques_df['q2_clean'].apply(lambda x: question_vector(x,model))

In [29]:
# Combine Question Vectors to Create Single Feature Vectors per row
# Using Concatenation - model has to learn differences or similarities implicitly from the two raw vectors.
# Applying absolute difference - provides the model with direct signals about how close or far the questions are in the embedding space
x_concat = np.hstack([np.vstack(ques_df['q1_vec'].values), np.vstack(ques_df['q2_vec'].values)])
x_diff = np.abs(np.vstack(ques_df['q1_vec'].values) - np.vstack(ques_df['q2_vec'].values))


In [30]:
# Combine both features to yield better results
x_features = np.hstack([x_concat, x_diff])

In [31]:
ques_df.head(2)

Unnamed: 0,question1,question2,q1_clean,q2_clean,q1_vec,q2_vec
253664,What are the best government jobs in India?,What is the best government job in India?,"[best, govern, job, india]","[best, govern, job, india]","[-0.08288953, 0.08016958, 0.085313685, 0.22732...","[-0.08288953, 0.08016958, 0.085313685, 0.22732..."
62511,Do you think that there will be a Third World ...,Should there be a Third World War?,"[think, third, world, war]","[third, world, war]","[-0.23391633, 0.43869343, -0.22121167, -0.1422...","[-0.3439621, 0.4635798, -0.14671461, -0.294193..."


In [32]:
# Add 'is_duplicate' column to ques_df
ques_df['is_duplicate'] = df.loc[ques_df.index, 'is_duplicate']

In [33]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean,q1_vec,q2_vec,is_duplicate
253664,What are the best government jobs in India?,What is the best government job in India?,"[best, govern, job, india]","[best, govern, job, india]","[-0.08288953, 0.08016958, 0.085313685, 0.22732...","[-0.08288953, 0.08016958, 0.085313685, 0.22732...",1
62511,Do you think that there will be a Third World ...,Should there be a Third World War?,"[think, third, world, war]","[third, world, war]","[-0.23391633, 0.43869343, -0.22121167, -0.1422...","[-0.3439621, 0.4635798, -0.14671461, -0.294193...",0
69472,Is it safe to send mobile phones across the co...,Can we send mobile phones from USA to India?,"[safe, send, mobil, phone, across, countri, us...","[send, mobil, phone, usa, india]","[-0.09957007, 0.25910026, -0.09231268, 0.00702...","[0.026430214, 0.0033171936, -0.19747156, 0.139...",0


In [34]:
y_labels=ques_df['is_duplicate'].values

In [35]:
# Train Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [36]:
x_train,x_test,y_train,y_test=train_test_split(x_features,y_labels,test_size=0.2,random_state=2)

In [37]:
rf=RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.7555510698425515