In [79]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [80]:
import warnings
warnings.filterwarnings("ignore")

In [81]:
os.chdir("C:/Users/Shardul Janaskar/Downloads")

In [82]:
df = pd.read_csv("quora_duplicate_questions.tsv", delimiter = '\t', encoding = 'utf-8')

In [83]:
df = df.dropna()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

questions = list(df["question1"]) + list(df["question2"])

tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [7]:
from collections import Counter

train_questions = pd.Series(df['question1'].tolist() + df['question2'].tolist())

def get_weight(count, eps = 5000, min_count = 2):
    if count< min_count:
        return 0
    else:
        R = 1.0/(count + eps)
        return R

words = (" ".join(train_questions)).lower().split()    
counts = Counter(words)
weights = {word : get_weight(count) for word, count in counts.items()}

    

In [8]:
def tfidf_word_share_norm(x):
    w1 = set(map(lambda word: word.lower().strip(), str(x['question1']).split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), str(x['question2']).split(" "))) 
    if len(w1)==0 or len(w2)==0:
        return 0
    common = w1 & w2
    share_weight = [weights.get(word, 0) for word in common]
    total_weight = [weights.get(word, 0) for word in w1]+[weights.get(word, 0) for word in w2]
    return np.sum(share_weight)/np.sum(total_weight)

In [9]:
df['TFIDF_share'] = df.apply(tfidf_word_share_norm, axis=1, raw=True)

In [10]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,TFIDF_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.400611
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.193056
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.149204
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.009949


In [84]:
def normalized_word_share(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" "))) 
    return len(w1 & w2)/(len(w1) + len(w2))
df['word_share'] = df.apply(normalized_word_share, axis=1)
df['len_1'] = df.question1.apply(lambda x : len(str(x)))
df['len_2'] = df.question2.apply(lambda x : len(str(x)))
df['len_word_1'] = df.question1.apply(lambda x : len(str(x).split()))
df['len_word_2'] = df.question2.apply(lambda x : len(str(x).split()))

In [85]:
df1 = df.copy()

In [86]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,len_1,len_2,len_word_1,len_word_2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.434783,66,57,14,12
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.2,51,88,8,13
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.166667,73,59,14,10
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,50,65,11,9
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.1,76,39,13,7


In [94]:
doc1 = df['question1'].tolist()
len(doc1)

404287

In [None]:
import re
for i in range(len(doc1)):
    df1['question1'][i] = re.sub(r'[^a-zA-Z\s]', '',str(doc1))

In [91]:
df1.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,len_1,len_2,len_word_1,len_word_2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.434783,66,57,14,12
1,1,3,4,What is the step by step guide to invest in sh...,What would happen if the Indian government sto...,0,0.2,51,88,8,13
2,2,5,6,What is the step by step guide to invest in sh...,How can Internet speed be increased by hacking...,0,0.166667,73,59,14,10
3,3,7,8,What is the step by step guide to invest in sh...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,50,65,11,9
4,4,9,10,What is the step by step guide to invest in sh...,Which fish would survive in salt water?,0,0.1,76,39,13,7


In [13]:
df1 = df1.drop(['qid1','qid2','question1','question2'],axis=1)

In [14]:
df_2 = pd.read_pickle('quora_preprocess.pkl')

In [15]:
df_2['TFIDF_share'] = df1['TFIDF_share']

In [16]:
df_2_copy = df_2.copy()

In [17]:
df_2 = df_2.drop(['id','qid1', 'question1','question2', 'is_duplicate'], 1)

In [18]:
def tokenize(content):
    return content.split()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, analyzer='word', max_df=1.0, min_df=1)
cosine_vals = []

for i in df.id:
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([df.loc[i]['question1'], df.loc[i]['question2']])
        cosine_vals.append(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1])
    except:
        cosine_vals.append(0)

In [19]:
df_cos = pd.DataFrame({'col':cosine_vals})

In [26]:
len(cosine_vals)

404287

In [29]:
df_2 = df_2[:404287]

In [30]:
df_2['similarity'] = cosine_vals

In [32]:
df_2['is_duplicate'] = df1['is_duplicate']

In [35]:
df['is_duplicate'].value_counts()

0    255024
1    149263
Name: is_duplicate, dtype: int64

In [37]:
df = df[:404287]

In [38]:
df['similarity'] = cosine_vals

In [40]:
df = df.drop(['TFIDF_share'],1)

In [41]:
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wnl= WordNetLemmatizer()
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', str(doc))
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    #Lemmetization
    lem_tokens = [wnl.lemmatize(word, pos="v") for word in filtered_tokens]
    # re-create document from filtered tokens
    doc = ' '.join(lem_tokens)
    return doc

In [42]:
text1 = df['question1']

In [45]:
text2 = df['question2']

In [43]:
norm_corp = np.vectorize(text1)

In [44]:
corp = normalize_document(norm_corp)

In [46]:
norm_corp2 = np.vectorize(text2)

In [47]:
corp2 = normalize_document(norm_corp2)

In [57]:
df['question1'] = norm_corp

In [58]:
df['question2'] = norm_corp2

In [59]:
X = df.drop(['id','qid1','qid2','is_duplicate'],1)

In [60]:
y = df['is_duplicate']

In [61]:
from sklearn.model_selection import train_test_split as tts

In [63]:
X_train, X_test, y_train, y_test = tts(X,y,test_size=0.3, random_state=42)

In [62]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,len_1,len_2,len_word_1,len_word_2,similarity
0,0,1,2,<numpy.lib.function_base.vectorize object at 0...,<numpy.lib.function_base.vectorize object at 0...,0,0.434783,66,57,14,12,0.809955
1,1,3,4,<numpy.lib.function_base.vectorize object at 0...,<numpy.lib.function_base.vectorize object at 0...,0,0.2,51,88,8,13,0.303515
2,2,5,6,<numpy.lib.function_base.vectorize object at 0...,<numpy.lib.function_base.vectorize object at 0...,0,0.166667,73,59,14,10,0.206137
3,3,7,8,<numpy.lib.function_base.vectorize object at 0...,<numpy.lib.function_base.vectorize object at 0...,0,0.0,50,65,11,9,0.0
4,4,9,10,<numpy.lib.function_base.vectorize object at 0...,<numpy.lib.function_base.vectorize object at 0...,0,0.1,76,39,13,7,0.119137


In [64]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [65]:
cv = CountVectorizer()

numpylibfunctionbasevectorize object xea


In [74]:
XTrain1 = cv.fit_transform(X_train)

In [75]:
XTest = cv.transform(X_test)

In [76]:
from sklearn.naive_bayes import MultinomialNB

In [77]:
nb = MultinomialNB()

In [78]:
nb.fit(XTrain1,y_train)

ValueError: Found input variables with inconsistent numbers of samples: [8, 283000]