In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("train.csv")
df.head(3)
df.shape

(404290, 6)

In [3]:
# Taking only 30000 Rows
new_df=df.sample(30000)
new_df.isnull().sum()
new_df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
354924,354924,484107,484108,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,0
142581,142581,101599,226089,How do I find my first investment property?,How do I find investment properties?,1


In [4]:
# Creating new Dataframe using only 'question1' and 'question2' columns
ques_df=new_df[['question1','question2']]
# ques_df.head()

In [5]:
# Dropping missing values
ques_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df.dropna(inplace=True)


In [6]:
# Applying preprocessing

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from gensim.utils import simple_preprocess
from collections import Counter

In [7]:
# Download

nltk.download('stopwords')
nltk.download('wordnet') #large lexical database of English
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Tools 

stop_words=set(stopwords.words('english'))
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [9]:
def preprocess_pipeline(text):
    if pd.isnull(text):
        return []
    # Lowercase + Tokenize + remove punctuation
    tokens = simple_preprocess(text)
    #Remove stopwords
    tokens=[word for word in tokens if word not in stop_words]
    # Replace numbers with "NUM"
    tokens=[re.sub(r'\d+','NUM',word)for word in tokens]
    # Apply stemming
    tokens=[stemmer.stem(word) for word in tokens]
    # Apply lemmatization
    tokens =[lemmatizer.lemmatize(words) for words in tokens]

    return tokens


In [10]:
# Apply preprocessing on both columns 
ques_df['q1_clean']=ques_df['question1'].apply(preprocess_pipeline)
ques_df['q2_clean']=ques_df['question2'].apply(preprocess_pipeline)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q1_clean']=ques_df['question1'].apply(preprocess_pipeline)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q2_clean']=ques_df['question2'].apply(preprocess_pipeline)


In [11]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
354924,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,"[ipod, touch, call, ipod]","[ipod, ipod, touch, differ]"
142581,How do I find my first investment property?,How do I find investment properties?,"[find, first, invest, properti]","[find, invest, properti]"
314075,What are the major contributions of malinowski...,How did Malinowski change anthropology?,"[major, contribut, malinowski, toward, anthrop...","[malinowski, chang, anthropolog]"


In [12]:
# Each word frequency
from collections import Counter
all_words=[word for tokens in ques_df['q1_clean']+ques_df['q2_clean']for word in tokens]
word_freq=Counter(all_words)

In [13]:
# word_freq

In [14]:
# Remove Rare and frequent words

def remove_extreme_words(tokens):
    filtered = []
    for word in tokens:
        if word_freq[word] > 2 and word_freq[word] < 10000:
            filtered.append(word)
    return filtered


In [15]:
# Applying Removal of extreme words
ques_df['q1_clean']=ques_df['q1_clean'].apply(remove_extreme_words)
ques_df['q2_clean']=ques_df['q2_clean'].apply(remove_extreme_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q1_clean']=ques_df['q1_clean'].apply(remove_extreme_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ques_df['q2_clean']=ques_df['q2_clean'].apply(remove_extreme_words)


In [16]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
354924,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,"[ipod, touch, call, ipod]","[ipod, ipod, touch, differ]"
142581,How do I find my first investment property?,How do I find investment properties?,"[find, first, invest, properti]","[find, invest, properti]"
314075,What are the major contributions of malinowski...,How did Malinowski change anthropology?,"[major, contribut, toward, anthropolog]","[chang, anthropolog]"


In [17]:
# Removing very short questions (less than 3 words)
q1_valid=ques_df['q1_clean'].apply(lambda x:len(x)>2)
q2_valid=ques_df['q2_clean'].apply(lambda x: len(x)>2)
ques_df=ques_df[q1_valid&q2_valid]
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean
354924,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,"[ipod, touch, call, ipod]","[ipod, ipod, touch, differ]"
142581,How do I find my first investment property?,How do I find investment properties?,"[find, first, invest, properti]","[find, invest, properti]"
316449,How would Indians and prospective Indian stude...,Will Donald Trump allow Indian students to stu...,"[would, indian, prospect, indian, student, wan...","[donald, trump, allow, indian, student, studi, u]"


In [18]:
ques_df.shape # number of rows reduced from 30000 to 24758

(24765, 4)

In [19]:
from gensim.models import Word2Vec

In [20]:
# Combine all cleaned questions as a single list
sentences=list(ques_df['q1_clean']+ques_df['q2_clean'])
# sentences

In [21]:
# Train Word2Vec
model =Word2Vec(sentences,vector_size=100,window=5,min_count=3,workers=4,sg=1,epochs=10)

In [22]:
# Number of Vocabulary
len(model.wv)

7457

In [23]:
# Vector representation of each word
# model.wv.get_normed_vectors()

In [24]:
# Total unique words whose vector representation was created 
model.wv.get_normed_vectors().shape

(7457, 100)

In [25]:
y=model.wv.index_to_key
# y

In [26]:
# Applying dimentionality Reduction using PCA 
# So that we van have a Visual representation in 3D
from sklearn.decomposition import PCA
pca =PCA(n_components=3)
x=pca.fit_transform(model.wv.get_normed_vectors())
x.shape # Dimentionality reduced from 100 to 3 

(7457, 3)

In [27]:
# 3D representation of Words
import plotly.express as px # 3D representation
fig=px.scatter_3d(x[:100],x=0,y=1,z=2,color=y[:100]) # ploting 100 words
fig.show()

In [28]:
# Getting vector Representaion of each Question
def question_vector(tokens,w2v_model):
    vectors=[]
    for word in tokens:
        if word in w2v_model.wv:
            vectors.append(w2v_model.wv[word])

    if len(vectors)==0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vectors,axis=0)

ques_df['q1_vec']=ques_df['q1_clean'].apply(lambda x: question_vector(x,model))
ques_df['q2_vec']=ques_df['q2_clean'].apply(lambda x: question_vector(x,model))

In [29]:
# Combine Question Vectors to Create Single Feature Vectors per row
# Using Concatenation - model has to learn differences or similarities implicitly from the two raw vectors.
# Applying absolute difference - provides the model with direct signals about how close or far the questions are in the embedding space
x_concat = np.hstack([np.vstack(ques_df['q1_vec'].values), np.vstack(ques_df['q2_vec'].values)])
x_diff = np.abs(np.vstack(ques_df['q1_vec'].values) - np.vstack(ques_df['q2_vec'].values))


In [30]:
# Combine both features to yield better results
x_features = np.hstack([x_concat, x_diff])

In [31]:
ques_df.head(2)

Unnamed: 0,question1,question2,q1_clean,q2_clean,q1_vec,q2_vec
354924,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,"[ipod, touch, call, ipod]","[ipod, ipod, touch, differ]","[0.19758877, 0.31965208, -0.1629177, 0.0244661...","[0.100949064, 0.23610143, -0.11664283, -0.0357..."
142581,How do I find my first investment property?,How do I find investment properties?,"[find, first, invest, properti]","[find, invest, properti]","[-0.12879634, 0.70947516, 0.20635833, 0.050394...","[-0.12576957, 0.8783118, 0.20483856, -0.127199..."


In [32]:
# Add 'is_duplicate' column to ques_df
ques_df['is_duplicate'] = df.loc[ques_df.index, 'is_duplicate']

In [33]:
ques_df.head(3)

Unnamed: 0,question1,question2,q1_clean,q2_clean,q1_vec,q2_vec,is_duplicate
354924,Why is the iPod touch called an iPod?,How are iPod and iPod Touch different?,"[ipod, touch, call, ipod]","[ipod, ipod, touch, differ]","[0.19758877, 0.31965208, -0.1629177, 0.0244661...","[0.100949064, 0.23610143, -0.11664283, -0.0357...",0
142581,How do I find my first investment property?,How do I find investment properties?,"[find, first, invest, properti]","[find, invest, properti]","[-0.12879634, 0.70947516, 0.20635833, 0.050394...","[-0.12576957, 0.8783118, 0.20483856, -0.127199...",1
316449,How would Indians and prospective Indian stude...,Will Donald Trump allow Indian students to stu...,"[would, indian, prospect, indian, student, wan...","[donald, trump, allow, indian, student, studi, u]","[-0.07911031, 0.012128361, 0.046152297, 0.0174...","[-0.12753262, -0.018740859, 0.16702187, 0.0710...",0


In [34]:
y_labels=ques_df['is_duplicate'].values

In [35]:
# Train Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [36]:
x_train,x_test,y_train,y_test=train_test_split(x_features,y_labels,test_size=0.2,random_state=2)

In [37]:
rf=RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.7655966081162932