In [None]:
from tqdm import tqdm
import re

import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt') 

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the dataset.
# Mount Google Drive so the Colab runtime can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Read the CSV written by the previous (advanced feature extraction) step.
data = pd.read_csv("/content/drive/MyDrive/collab_data/Quora/advance_feature_extraction_train(2).csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the first row to check which columns were loaded.
data.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,12,26,0.461538,2,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,1.0,1.0,2.0,13.0,0.965517


In [None]:
# Text pre-processing for the question pairs (no stemming or lemmatization is applied).

STOP_WORDS = stopwords.words("english")
# Frozen copy for constant-time membership checks when stop words are filtered.
_STOP_WORD_SET = frozenset(STOP_WORDS)

# Punctuation that is deleted outright. The prime character (′) is intentionally
# absent: it is converted to an apostrophe below so that contractions written
# with it (e.g. "what′s") can still be expanded.
_SPECIAL_CHARS_RE = re.compile(r"[?|.!*@#,)(/]")


def preprocess(sentence, remove_stopwords=False):
  """Normalize one question string.

  Steps, in order: lowercase, strip HTML markup, drop URLs, delete a fixed set
  of punctuation characters, expand common English contractions, spell out a
  few currency/percent symbols, and abbreviate trailing thousands/millions
  ("1000" -> "1k", "1000000" -> "1m").

  Args:
    sentence: The raw text. Any value is accepted; it is passed through str().
    remove_stopwords: When True, the cleaned text is tokenized with
      word_tokenize and NLTK English stop words are dropped before the tokens
      are re-joined with single spaces. Defaults to False, which returns the
      cleaned text with stop words kept.

  Returns:
    The cleaned string.
  """
  # lowercase
  cleantext = str(sentence).lower()
  # remove html tags
  cleantext = BeautifulSoup(cleantext, "lxml").text
  # remove urls
  cleantext = re.sub(r"http\S+", "", cleantext)
  # remove special characters (commas included, so "1,000" becomes "1000"
  # and is abbreviated by the digit patterns at the end)
  cleantext = _SPECIAL_CHARS_RE.sub("", cleantext)
  # unify apostrophe variants so the contraction rules below can match
  cleantext = cleantext.replace("′", "'").replace("’", "'")
  # expanding contractions (order matters: specific forms before generic suffixes)
  cleantext = cleantext.replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")
  cleantext = cleantext.replace("n't", " not").replace("what's", "what is").replace("it's", "it is")
  cleantext = cleantext.replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")
  cleantext = cleantext.replace("he's", "he is").replace("she's", "she is").replace("'s", " own")
  # spell out symbols
  cleantext = cleantext.replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")
  cleantext = cleantext.replace("€", " euro ").replace("'ll", " will").replace("&", "and")
  # abbreviate round numbers: millions first, then thousands
  cleantext = re.sub(r"([0-9]+)000000", r"\1m", cleantext)
  cleantext = re.sub(r"([0-9]+)000", r"\1k", cleantext)

  if remove_stopwords:
    # Tokenize only when filtering is requested; it is the slowest step.
    kept_words = [word for word in word_tokenize(cleantext) if word not in _STOP_WORD_SET]
    return " ".join(kept_words)

  return cleantext


In [None]:
#################### Preprocess TEST ####################
# Run the cleaner on one hand-written sample that mixes an HTML tag, a possessive,
# a round number and a currency symbol, and print the result for inspection.
sample_question = "hi <h3> hello world's most precious 1000 island? 23$ "
print(preprocess(sample_question))

hi  hello world own most precious 1k island 23 dollar  


In [None]:
# List the rows that contain at least one missing value.
# The axis is passed by keyword: pandas 2.x no longer accepts it positionally.
nan_rows = data[data.isnull().any(axis=1)]
print(nan_rows)
# Replace every missing value with an empty string.
data = data.fillna('')
# Re-check: this second listing should be empty.
nan_rows = data[data.isnull().any(axis=1)]
print(nan_rows)

            id    qid1    qid2  ... abs_len_diff mean_len  longest_substr_ratio
105780  105780  174363  174364  ...          0.0      0.0              0.965517
201841  201841  303951  174364  ...          0.0      0.0              0.965517
363362  363362  493340  493341  ...          0.0      0.0              0.965517

[3 rows x 28 columns]
Empty DataFrame
Columns: [id, qid1, qid2, question1, question2, is_duplicate, freq_qid1, freq_qid2, lenq1, lenq2, q1_n_words, q2_n_words, word_Common, word_Total, word_share, freq_q1+q2, freq_q1-q2, cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max, first_word_eq, last_word_eq, abs_len_diff, mean_len, longest_substr_ratio]
Index: []


In [None]:
################## Preprocess ##################

# Raw question text as plain Python lists.
Q1 = data['question1'].values.tolist()
Q2 = data['question2'].values.tolist()

clean_Q1, clean_Q2 = [], []

# Clean both questions of every pair in a single pass, with a progress bar.
for raw_q1, raw_q2 in tqdm(zip(Q1, Q2), total=len(Q1)):
  clean_Q1.append(preprocess(raw_q1))
  clean_Q2.append(preprocess(raw_q2))

# Store the cleaned text next to the original columns.
data['Clean_Q1'] = clean_Q1
data['Clean_Q2'] = clean_Q2



100%|██████████| 404290/404290 [06:48<00:00, 990.90it/s]


In [None]:
# Draw the same number of duplicate and non-duplicate pairs (fixed seed) and stack them.
duplicate_flag = data['is_duplicate']
positive_df = data[duplicate_flag == 1].sample(n=1200, random_state=0)
negative_df = data[duplicate_flag == 0].sample(n=1200, random_state=0)
final_df = pd.concat([positive_df, negative_df])



In [None]:
# Preview the first row of the balanced sample.
final_df.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
35115,35115,64212,64213,What minor would complement a Computer Science...,What is a good minor that will compliment a Co...,1,3,1,53,67,8,12,6,20,0.3,4,2,0.666656,0.666656,0.99995,0.399992,0.749991,0.499996,1.0,1.0,4.0,10.0,0.965517,what minor would complement a computer science...,what is a good minor that will compliment a co...


In [None]:
# Report the overall shape and the shape of each class within the balanced sample.
label_col = final_df['is_duplicate']
print("Total dataframe rows and coulmns: ", final_df.shape)
print("Total positive data: ", final_df[label_col == 1].shape)
print("Total negative data: ", final_df[label_col == 0].shape)

Total dataframe rows and coulmns:  (2400, 30)
Total positive data:  (1200, 30)
Total negative data:  (1200, 30)


In [None]:
# Remove the row whose label is the first entry of final_df's index, then show the new first row.
# NOTE(review): the notebook does not say why this row is discarded, and because the drop is
# in place, every re-run of this cell removes one more row — confirm this is intended.
final_df.drop(final_df.index[0], inplace=True)
final_df.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
327832,327832,454305,454306,How can I materialize an idea?,What is the best way to materialize on an idea?,1,1,1,30,47,6,10,3,16,0.1875,2,0,0.99995,0.499988,0.249994,0.166664,0.499992,0.299997,0.0,0.0,4.0,8.0,0.965517,how can i materialize an idea,what is the best way to materialize on an idea


In [None]:
# Keep the target as its own Series, then remove the raw question text
# (the cleaned Clean_Q1 / Clean_Q2 columns remain).
# NOTE(review): 'is_duplicate' itself is not dropped, so the label stays among the
# feature columns handed to train_test_split — confirm it is removed before modelling.
y_ = final_df['is_duplicate']
final_df.drop(columns=['question1', 'question2'], inplace=True)

In [None]:
# Confirm that the raw question columns are gone.
final_df.head(1)

Unnamed: 0,id,qid1,qid2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
327832,327832,454305,454306,1,1,1,30,47,6,10,3,16,0.1875,2,0,0.99995,0.499988,0.249994,0.166664,0.499992,0.299997,0.0,0.0,4.0,8.0,0.965517,how can i materialize an idea,what is the best way to materialize on an idea


In [None]:
# Show the first target value together with its index label.
y_.head(1)

327832    1
Name: is_duplicate, dtype: int64

In [None]:
# Train/test split (80:20); random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(final_df, y_, test_size=0.20, random_state=0)


In [None]:
# Preview the first training row.
X_train.head(1)

Unnamed: 0,id,qid1,qid2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
152314,152314,239315,239316,0,1,1,45,34,7,6,5,13,0.384615,2,0,0.99995,0.666644,0.749981,0.749981,0.833319,0.714276,1.0,1.0,1.0,6.5,0.965517,what do foreigners not know about bangladesh,what do you know about bangladesh


In [None]:
# Preview the first test row.
X_test.head(1)

Unnamed: 0,id,qid1,qid2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
361950,361950,491826,491827,0,1,1,30,29,6,6,4,12,0.333333,2,0,0.999967,0.999967,0.666644,0.666644,0.833319,0.833319,1.0,1.0,0.0,6.0,0.965517,how can i stop using whatsapp,how do i stop using whatsapp


In [None]:
# Cleaned question text of each split as plain Python lists.
X_train_Q1 = X_train['Clean_Q1'].tolist()
X_train_Q2 = X_train['Clean_Q2'].tolist()

X_test_Q1 = X_test['Clean_Q1'].tolist()
X_test_Q2 = X_test['Clean_Q2'].tolist()


'\n# X_train_Q1 tfidf \ntfidf = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))\ntfidf.fit_transform(X_train_Q1)\n# dict key:word and value:tf-idf score\nX_train_Q1_word_tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n\n\n# X_train_Q2 tfidf \ntfidf = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))\ntfidf.fit_transform(X_train_Q2)\n# dict key:word and value:tf-idf score\nX_train_Q2_word_tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n\n\n# X_test_Q1 tfidf \ntfidf = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))\ntfidf.fit_transform(X_test_Q1)\n# dict key:word and value:tf-idf score\nX_test_Q1_word_tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n\n\n# X_test_Q2 tfidf \ntfidf = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))\ntfidf.fit_transform(X_test_Q2)\n# dict key:word and value:tf-idf score\nX_test_Q2_word_tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n\n\n'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


def _fit_word_idf(vectorizer, corpus):
  """Fit `vectorizer` on `corpus` and return a {term: IDF weight} dict.

  Args:
    vectorizer: A TfidfVectorizer; it is (re)fitted in place.
    corpus: List of text documents to learn the vocabulary and IDF from.

  Returns:
    dict mapping each vocabulary term to its entry in `vectorizer.idf_`.
  """
  # Only fitting is needed here; the transformed matrix was never used.
  vectorizer.fit(corpus)
  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when present and fall back for older installations.
  if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
  else:
    terms = vectorizer.get_feature_names()
  return dict(zip(terms, vectorizer.idf_))


# merge texts: both questions of a split form one corpus
questions_X_train = X_train_Q1 + X_train_Q2
questions_X_test = X_test_Q1 + X_test_Q2

# Text is already lowercased by the cleaning step, so lowercasing is switched off.
tfidf = TfidfVectorizer(lowercase=False)

# dict key: word, value: IDF weight learned from the training questions
word2tfidf_X_train = _fit_word_idf(tfidf, questions_X_train)

# dict key: word, value: IDF weight learned from the test questions
# NOTE(review): the test weights are fitted on the test questions themselves rather than
# reused from the training fit — confirm this is intended.
word2tfidf_X_test = _fit_word_idf(tfidf, questions_X_test)

In [None]:
# Install and import spaCy; its token vectors are used for the sentence embeddings below.
!pip install spacy
import spacy




In [None]:
# NOTE(review): vectors are taken from the 'en_core_web_sm' pipeline loaded below, not from
# en_vectors_web_lg as an earlier comment here stated — confirm this is the intended model.

_SPACY_MODEL_NAME = 'en_core_web_sm'
# Holds the loaded pipeline so it is read from disk once instead of once per sentence.
_spacy_cache = {}


def _get_spacy_pipeline():
  """Return the spaCy pipeline, loading it only on first use."""
  if 'nlp' not in _spacy_cache:
    _spacy_cache['nlp'] = spacy.load(_SPACY_MODEL_NAME)
  return _spacy_cache['nlp']


def tfidf_glove(sentence, dataset_type):
  """Embed a sentence as the weighted sum of its token vectors.

  Each token vector is multiplied by the weight stored for that token's text in
  word2tfidf_X_train or word2tfidf_X_test; tokens without an entry get weight 0.
  The weighted vectors are added together (the result is a sum, not divided by
  the token count or by the total weight).

  Args:
    sentence: Text to embed.
    dataset_type: 'X_train' or 'X_test'; selects which weight table is used.

  Returns:
    1-D numpy array with one value per vector component. An empty sentence
    yields an all-zero array.

  Raises:
    ValueError: If dataset_type is neither 'X_train' nor 'X_test'.
  """
  # Pick the weight table up front so a bad dataset_type fails loudly instead of
  # being silently turned into zero weights.
  if dataset_type == 'X_train':
    word_weights = word2tfidf_X_train
  elif dataset_type == 'X_test':
    word_weights = word2tfidf_X_test
  else:
    raise ValueError("dataset_type must be 'X_train' or 'X_test', got %r" % (dataset_type,))

  nlp = _get_spacy_pipeline()
  # word document spacy
  doc = nlp(sentence)

  if len(doc) == 0:
    # No tokens to read the width from: probe a one-token document instead.
    return np.zeros(len(nlp('a')[0].vector))

  # dimensions of vectors
  weighted_sum = np.zeros(len(doc[0].vector))
  for word in doc:
    # Words missing from the table contribute nothing.
    weight = word_weights.get(str(word), 0)
    weighted_sum = weighted_sum + word.vector * weight

  return weighted_sum


In [None]:
# Cleaned question text of each split as plain Python lists, ready for embedding.
X_train_Q1 = list(X_train['Clean_Q1'])
X_train_Q2 = list(X_train['Clean_Q2'])

X_test_Q1 = list(X_test['Clean_Q1'])
# Second question of the test pairs comes from Clean_Q2 (it previously re-read Clean_Q1,
# which made the test Q2 vectors a copy of the Q1 vectors).
X_test_Q2 = list(X_test['Clean_Q2'])

In [None]:
# Embed every question with tfidf_glove and collect the vectors in one list per
# split/question, showing a progress bar for each pass.
X_train_Q1_GV = [tfidf_glove(text, dataset_type='X_train') for text in tqdm(X_train_Q1)]
X_train_Q2_GV = [tfidf_glove(text, dataset_type='X_train') for text in tqdm(X_train_Q2)]

X_test_Q1_GV = [tfidf_glove(text, dataset_type='X_test') for text in tqdm(X_test_Q1)]
X_test_Q2_GV = [tfidf_glove(text, dataset_type='X_test') for text in tqdm(X_test_Q2)]



100%|██████████| 1919/1919 [21:17<00:00,  1.50it/s]
100%|██████████| 1919/1919 [21:17<00:00,  1.50it/s]
100%|██████████| 480/480 [05:20<00:00,  1.50it/s]
100%|██████████| 480/480 [05:22<00:00,  1.49it/s]


In [None]:
# Re-check the first training row before the vectors are attached.
X_train.head(1)

Unnamed: 0,id,qid1,qid2,is_duplicate,freq_qid1,freq_qid2,lenq1,lenq2,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,first_word_eq,last_word_eq,abs_len_diff,mean_len,longest_substr_ratio,Clean_Q1,Clean_Q2
152314,152314,239315,239316,0,1,1,45,34,7,6,5,13,0.384615,2,0,0.99995,0.666644,0.749981,0.749981,0.833319,0.714276,1.0,1.0,1.0,6.5,0.965517,what do foreigners not know about bangladesh,what do you know about bangladesh


In [None]:
# Empty placeholder frames, one per split/question, for the embedding vectors.
df_train_q1, df_train_q2 = pd.DataFrame(), pd.DataFrame()
df_test_q1, df_test_q2 = pd.DataFrame(), pd.DataFrame()

In [None]:
# Turn each list of question vectors into a DataFrame: one row per question, one column
# per vector component, with a fresh 0..n-1 row index.
# The frames are built with the constructor because DataFrame.append was removed in
# pandas 2.0; this also keeps the cell safe to re-run (append stacked the rows again
# on top of the previous result).
df_train_q1 = pd.DataFrame(list(X_train_Q1_GV))
df_train_q2 = pd.DataFrame(list(X_train_Q2_GV))

df_test_q1 = pd.DataFrame(list(X_test_Q1_GV))
df_test_q2 = pd.DataFrame(list(X_test_Q2_GV))


In [None]:
# Preview the first test Q1 vector.
df_test_q1.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,-2.918367,-3.684602,-1.603498,-42.853154,61.047675,-13.991431,43.737719,-11.197322,54.372677,69.097736,-61.293022,-15.45448,3.893846,32.784286,17.632483,-3.690841,-1.694163,14.201094,31.867109,-18.145949,9.35054,15.506617,-30.891881,-52.341356,20.815464,41.650556,-39.66979,-17.961132,31.496831,-11.394703,4.587732,-31.000003,-30.923754,-40.341683,31.402136,-13.133847,62.633632,-33.888489,-29.303905,18.362817,...,35.144471,17.720599,2.281302,40.045933,74.955003,27.386451,-28.739128,-47.428684,20.267393,-31.949732,42.790329,-17.335811,-15.36446,30.703888,-43.540112,52.695816,94.706953,29.049655,-24.867469,-9.65039,59.153114,-17.652552,2.867879,8.234669,-37.875611,18.299291,-28.694118,18.979125,1.663007,-16.76237,-4.200012,19.182655,-32.855464,-12.060145,-56.166467,-36.084533,-23.290735,68.357243,-14.138919,14.47285


In [None]:
# Preview the first training Q1 vector.
df_train_q1.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,-2.918367,-3.684602,-1.603498,-42.853154,61.047675,-13.991431,43.737719,-11.197322,54.372677,69.097736,-61.293022,-15.45448,3.893846,32.784286,17.632483,-3.690841,-1.694163,14.201094,31.867109,-18.145949,9.35054,15.506617,-30.891881,-52.341356,20.815464,41.650556,-39.66979,-17.961132,31.496831,-11.394703,4.587732,-31.000003,-30.923754,-40.341683,31.402136,-13.133847,62.633632,-33.888489,-29.303905,18.362817,...,35.144471,17.720599,2.281302,40.045933,74.955003,27.386451,-28.739128,-47.428684,20.267393,-31.949732,42.790329,-17.335811,-15.36446,30.703888,-43.540112,52.695816,94.706953,29.049655,-24.867469,-9.65039,59.153114,-17.652552,2.867879,8.234669,-37.875611,18.299291,-28.694118,18.979125,1.663007,-16.76237,-4.200012,19.182655,-32.855464,-12.060145,-56.166467,-36.084533,-23.290735,68.357243,-14.138919,14.47285


In [None]:
# Plan for this cell:
#   1. take the 'id' column from X_train and X_test;
#   2. add it to the four vector frames (df_train_q1, df_train_q2, df_test_q1, df_test_q2),
#      relying on the vector rows being in the same order as the rows of X_train / X_test;
#   3. join the Q1 and Q2 vector frames of each split on 'id' (train_, test_);
#   4. join those onto X_train / X_test on 'id'.
# Together with y_train / y_test this gives the four objects used for modelling.
# NOTE(review): X_train and X_test are overwritten by the last two lines, so running this
# cell a second time merges the vector columns in again — restart from the split if re-running.

X_train_id = X_train['id'].tolist()
X_test_id  = X_test['id'].tolist()

# Positional assignment: row i of each vector frame receives the id of row i of the split.
df_train_q1['id'] = X_train_id
df_train_q2['id'] = X_train_id

df_test_q1['id'] = X_test_id
df_test_q2['id'] = X_test_id

# Q1 and Q2 vector frames share their component column labels; presumably merge tells them
# apart with its default suffixes — TODO confirm the resulting column names.
train_ = df_train_q1.merge(df_train_q2, on='id', how='left')
test_ = df_test_q1.merge(df_test_q2, on='id', how='left')

# Write the ids again before the second merge with X_train and X_test.
train_['id'] = X_train_id
test_['id'] = X_test_id


# Merge the vector columns onto the main feature frames.
X_train = X_train.merge(train_, on='id', how='left')
X_test = X_test.merge(test_,on='id', how='left')

In [None]:
# Save the four splits as CSV files on Drive; index=False leaves the row index out of the files.
X_train.to_csv("/content/drive/MyDrive/collab_data/Quora/X_train.csv", index=False)
X_test.to_csv("/content/drive/MyDrive/collab_data/Quora/X_test.csv", index=False)

y_train.to_csv("/content/drive/MyDrive/collab_data/Quora/y_train.csv", index=False)
y_test.to_csv("/content/drive/MyDrive/collab_data/Quora/y_test.csv", index=False)

# Completion message once all four files have been written.
print("X_train, X_test, y_train, y_test dataframe avilable for ml moldel !")

In [None]:
####################### test ####################
# Scratch check: build two small frames that share an 'id' column, to see how
# concat and merge combine them in the cells that follow.

l1 = [[1,12,23],[90,99,98]]
l2= [[10,10,20],[90,90,90]]
idd = [1,2]

# Built with the constructor: DataFrame.append was removed in pandas 2.0.
d1 = pd.DataFrame(l1)
d2 = pd.DataFrame(l2)

d1['id'] = idd
d2['id'] = idd

In [None]:
# Show both rows of the first scratch frame.
d1.head(2)

In [None]:
# Show both rows of the second scratch frame.
d2.head(2)

In [None]:
# Stack the two scratch frames row-wise with a renumbered index.
d3 = pd.concat([d1,d2], ignore_index=True)


In [None]:
# Join the two scratch frames side by side on 'id' (left join keeps every row of d1).
d4 = d1.merge(d2, on='id', how='left')

In [None]:
# Inspect the merged scratch frame.
d4.head()