In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('questions.csv')

In [3]:
new_df = df.sample(2000,random_state=2)

In [4]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
339499,339499,665522,665523,Why was Cyrus Mistry removed as the Chairman o...,Why did the Tata Sons sacked Cyrus Mistry?,1
289521,289521,568878,568879,By what age would you think a man should be ma...,When my wrist is extended I feel a shock and b...,0
4665,4665,9325,9326,How would an arbitrageur seek to capitalize gi...,How would an arbitrageur seek to capitalize gi...,0
54203,54203,107861,107862,Why did Quora mark my question as incomplete?,Why does Quora detect my question as an incomp...,1
132566,132566,262554,91499,What is it like working with Pivotal Labs as a...,What's it like to work at Pivotal Labs?,0


In [5]:
import re
from bs4 import BeautifulSoup

In [6]:
def preprocess(q):
    q = str(q).lower().strip()

    #Replace special characters with strings
    q = q.replace('%',' percent ')
    q = q.replace('@',' at ')
    q = q.replace('$',' dollar ')
    q = q.replace('₹',' rupee ')

    #the pattern '[math]' appears very freq in whole dataset
    q = q.replace('[math]','')

    # replacing some numbers with their string equivalent
    q = q.replace(',000,000,000 ', 'B ')
    q = q.replace(',000,000 ', 'M ')
    q = q.replace(',000 ', 'K ')
    q = re.sub(r'([0-9]+)000000000', r'\1B', q)
    q = re.sub(r'([0-9]+)000000', r'\1M', q)
    q = re.sub(r'([0-9]+)000', r'\1K', q)

    # decontracting words 
    # reference : wikipedia,stack overflow
    expand = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "can not",
        "can't've": "can not have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so as",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have",
        "'ve": " have",
        "n't": " not",
        "'re": " are",
        "'ll": " will"
    }
    q_expand = []
    for word in q.split():
        if word in expand:
            word = expand[word]
        q_expand.append(word)
    q = ' '.join(q_expand)

    #Removing html tags
    q = BeautifulSoup(q)
    q = q.get_text()

    #removing punctuations
    p = re.compile('\W')
    q = re.sub(p,' ',q).strip()

    return q
    

In [7]:
preprocess("Hi!I can't buy @    ₹ 1000000<b> </b>")

'hi i can not buy at rupee 1M'

In [8]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)

In [9]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
339499,339499,665522,665523,why was cyrus mistry removed as the chairman o...,why did the tata sons sacked cyrus mistry,1
289521,289521,568878,568879,by what age would you think a man should be ma...,when my wrist is extended i feel a shock and b...,0
4665,4665,9325,9326,how would an arbitrageur seek to capitalize gi...,how would an arbitrageur seek to capitalize gi...,0
54203,54203,107861,107862,why did quora mark my question as incomplete,why does quora detect my question as an incomp...,1
132566,132566,262554,91499,what is it like working with pivotal labs as a...,what is it like to work at pivotal labs,0


In [10]:
new_df['q1_len'] = new_df['question1'].str.len()
new_df['q2_len'] = new_df['question2'].str.len()

In [11]:
new_df['q1_words'] = new_df['question1'].apply(lambda r : len(r.split(" ")))
new_df['q2_words'] = new_df['question2'].apply(lambda r : len(r.split(" ")))

In [12]:
def getCommonWords(row):
    w1 = set(map(lambda word : word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word : word.lower().strip(), row['question2'].split(" ")))
    w = (w1 & w2)
    return len(w)

In [13]:
new_df['common_words'] = new_df.apply(getCommonWords, axis=1)

In [14]:
def get_totalWords(row):
    w1 = set(map(lambda word : word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word : word.lower().strip(), row['question2'].split(" ")))
    l1 = len(w1)
    l2 = len(w2)
    return (l1+l2)

In [15]:
new_df['words_count'] = new_df.apply(get_totalWords, axis=1)


In [16]:
new_df['word_share'] = round(new_df['common_words']/new_df['words_count'],2)


In [17]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,words_count,word_share
339499,339499,665522,665523,why was cyrus mistry removed as the chairman o...,why did the tata sons sacked cyrus mistry,1,57,41,11,8,6,19,0.32
289521,289521,568878,568879,by what age would you think a man should be ma...,when my wrist is extended i feel a shock and b...,0,51,104,11,23,3,33,0.09
4665,4665,9325,9326,how would an arbitrageur seek to capitalize gi...,how would an arbitrageur seek to capitalize gi...,0,123,122,42,42,18,38,0.47
54203,54203,107861,107862,why did quora mark my question as incomplete,why does quora detect my question as an incomp...,1,44,59,8,10,6,18,0.33
132566,132566,262554,91499,what is it like working with pivotal labs as a...,what is it like to work at pivotal labs,0,53,39,11,9,6,20,0.3


In [18]:
# Advanced Features
    # 1. Token Features
    # 2. Length Features
    # 3. Fuzzy Features

In [19]:
# 1. token features

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords

def get_tokenFeatures(row):
    q1 = row['question1']
    q2 = row['question2']
    
    safe_div = 0.0001 #to avoid overflow
    stopWords = stopwords.words("english")
    tokenFeatures = [0.0]*8

    # Converting the sentences to tokens
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if(len(q1_tokens) == 0 or len(q2_tokens) == 0):
        return tokenFeatures

    # Stopwords
    q1_stops = set([word for word in q1_tokens if word in stopWords])
    q2_stops = set([word for word in q2_tokens if word in stopWords])
    # Non-stopwords
    q1_words = set([word for word in q1_tokens if word not in stopWords])
    q2_words = set([word for word in q2_tokens if word not in stopWords])

    # count of common tokens in ques pair
    common_tokens_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    # count of common non-stopwords in ques pair
    common_word_count = len(q1_words.intersection(q2_words))
    # count of common stopwords in ques pair
    common_stopwords_count = len(q1_stops.intersection(q2_stops))

    # 1. the ratio of the number of common words to the length of the smaller question
    tokenFeatures[0] = common_word_count / (min(len(q1_words),len(q2_words)) + safe_div)
    # 2. the ratio of the number of common words to the length of the larger question
    tokenFeatures[1] = common_word_count / (max(len(q1_words),len(q2_words)) + safe_div)
    # 3. the ratio of the number of common stop words to the smaller stop word count among the two questions
    tokenFeatures[2] = common_stopwords_count / (min(len(q1_stops),len(q2_stops)) + safe_div)
    # 4. the ratio of the number of common stop words to the larger stop word count among the two questions
    tokenFeatures[3] =  common_stopwords_count / (max(len(q1_stops),len(q2_stops)) + safe_div)
    # 5. the ratio of the number of common tokens to the smaller token count among the two questions
    tokenFeatures[4] = common_tokens_count / (min(len(q1_tokens),len(q2_tokens)) + safe_div)
    # 6. the ratio of the number of common tokens to the larger token count among the two questions
    tokenFeatures[5] = common_tokens_count / (max(len(q1_tokens),len(q2_tokens)) + safe_div)
    # 7. last word of both questions is same or not
    tokenFeatures[6] = int(q1_tokens[-1] == q2_tokens[-1])
    # 8. First word of both questions is same or not
    tokenFeatures[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return tokenFeatures


In [22]:
tokenFeatures = new_df.apply(get_tokenFeatures, axis=1)

new_df["cwc_min"] = list(map(lambda x: x[0], tokenFeatures))
new_df["cwc_max"] = list(map(lambda x: x[1], tokenFeatures))
new_df["csc_min"] = list(map(lambda x: x[2], tokenFeatures))
new_df["csc_max"] = list(map(lambda x: x[3], tokenFeatures))
new_df["ctc_min"] = list(map(lambda x: x[4], tokenFeatures))
new_df["ctc_max"] = list(map(lambda x: x[5], tokenFeatures))
new_df["last_word_eq"] = list(map(lambda x: x[6], tokenFeatures))
new_df["first_word_eq"] = list(map(lambda x: x[7], tokenFeatures))


In [23]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_words,q2_words,...,words_count,word_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
339499,339499,665522,665523,why was cyrus mistry removed as the chairman o...,why did the tata sons sacked cyrus mistry,1,57,41,11,8,...,19,0.32,0.799984,0.666656,0.666644,0.399992,0.749991,0.54545,0,1
289521,289521,568878,568879,by what age would you think a man should be ma...,when my wrist is extended i feel a shock and b...,0,51,104,11,23,...,33,0.09,0.0,0.0,0.499992,0.272725,0.272725,0.136363,0,0
4665,4665,9325,9326,how would an arbitrageur seek to capitalize gi...,how would an arbitrageur seek to capitalize gi...,0,123,122,42,42,...,38,0.47,0.92307,0.92307,0.99998,0.99998,0.70833,0.70833,1,1
54203,54203,107861,107862,why did quora mark my question as incomplete,why does quora detect my question as an incomp...,1,44,59,8,10,...,18,0.33,0.749981,0.599988,0.749981,0.599988,0.749991,0.599994,0,1
132566,132566,262554,91499,what is it like working with pivotal labs as a...,what is it like to work at pivotal labs,0,53,39,11,9,...,20,0.3,0.749981,0.599988,0.599988,0.499992,0.666659,0.54545,0,1


In [24]:
# 2. length features

In [25]:
pip install distance

Note: you may need to restart the kernel to use updated packages.


In [26]:
import distance

def get_lengthFeatures(row):
    q1 = row['question1']
    q2 = row['question2']
    lengthFeatures = [0.0]*3
    # converting to tokens
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    if len(q1_tokens)==0 or len(q2_tokens)==0 :
        return lengthFeatures
    # 1. Mean length : Mean of the length of the two questions
    lengthFeatures[0] = (len(q1_tokens)+len(q2_tokens))/2
    # 2. abs_len_diff: Absolute difference between the length of the two questions 
    lengthFeatures[1] = abs(len(q1_tokens) - len(q2_tokens))
    # 3. longest_substr_ratio: Ratio of the length of the longest substring among the two questions to the length of the smaller question
    strs = list(distance.lcsubstrings(q1,q2))
    lengthFeatures[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)

    return lengthFeatures

In [27]:
lengthFeatures = new_df.apply(get_lengthFeatures, axis=1)

new_df['mean_len'] = list(map(lambda x : x[0], lengthFeatures))
new_df['abs_len_diff'] = list(map(lambda x : x[1], lengthFeatures))
new_df['longest_substr_ratio'] = list(map(lambda x : x[2], lengthFeatures))

In [28]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_words,q2_words,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,mean_len,abs_len_diff,longest_substr_ratio
339499,339499,665522,665523,why was cyrus mistry removed as the chairman o...,why did the tata sons sacked cyrus mistry,1,57,41,11,8,...,0.666656,0.666644,0.399992,0.749991,0.54545,0,1,9.5,3,0.309524
289521,289521,568878,568879,by what age would you think a man should be ma...,when my wrist is extended i feel a shock and b...,0,51,104,11,23,...,0.0,0.499992,0.272725,0.272725,0.136363,0,0,16.5,11,0.115385
4665,4665,9325,9326,how would an arbitrageur seek to capitalize gi...,how would an arbitrageur seek to capitalize gi...,0,123,122,42,42,...,0.92307,0.99998,0.99998,0.70833,0.70833,1,1,24.0,0,0.593496
54203,54203,107861,107862,why did quora mark my question as incomplete,why does quora detect my question as an incomp...,1,44,59,8,10,...,0.599988,0.749981,0.599988,0.749991,0.599994,0,1,9.0,2,0.355556
132566,132566,262554,91499,what is it like working with pivotal labs as a...,what is it like to work at pivotal labs,0,53,39,11,9,...,0.599988,0.599988,0.499992,0.666659,0.54545,0,1,10.0,2,0.4


In [29]:
# 3. Fuzzy features

In [30]:
from fuzzywuzzy import fuzz

def get_fuzzyFeatures(row):
    q1 = row['question1']
    q2 = row['question2']
    fuzzyFeatures = [0.0]*4

    # 1. fuzz_ratio: fuzz_ratio score from fuzzywuzzy
    fuzzyFeatures[0] = fuzz.QRatio(q1,q2)
    # 2. fuzz_partial_ratio: fuzz_partial_ratio from fuzzywuzzy
    fuzzyFeatures[1] = fuzz.partial_ratio(q1,q2)
    # 3. token_sort_ratio: token_sort_ratio from fuzzywuzzy
    fuzzyFeatures[2] = fuzz.token_sort_ratio(q1,q2)
    # 4. token_set_ratio: token_set_ratio from fuzzywuzzy
    fuzzyFeatures[3] = fuzz.token_set_ratio(q1,q2)

    return fuzzyFeatures

In [31]:
fuzzyFeatures = new_df.apply(get_fuzzyFeatures,axis=1)
new_df['fuzz_ratio'] = list(map(lambda x : x[0], fuzzyFeatures))
new_df['fuzz_partial_ratio'] = list(map(lambda x : x[1], fuzzyFeatures))
new_df['fuzz_sort_ratio'] = list(map(lambda x : x[2], fuzzyFeatures))
new_df['token_set_ratio'] = list(map(lambda x : x[3], fuzzyFeatures))

In [32]:
new_df.shape

(2000, 28)

In [33]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_words,q2_words,...,ctc_max,last_word_eq,first_word_eq,mean_len,abs_len_diff,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,fuzz_sort_ratio,token_set_ratio
339499,339499,665522,665523,why was cyrus mistry removed as the chairman o...,why did the tata sons sacked cyrus mistry,1,57,41,11,8,...,0.54545,0,1,9.5,3,0.309524,39,46,67,85
289521,289521,568878,568879,by what age would you think a man should be ma...,when my wrist is extended i feel a shock and b...,0,51,104,11,23,...,0.136363,0,0,16.5,11,0.115385,26,50,35,37
4665,4665,9325,9326,how would an arbitrageur seek to capitalize gi...,how would an arbitrageur seek to capitalize gi...,0,123,122,42,42,...,0.70833,1,1,24.0,0,0.593496,100,99,100,99
54203,54203,107861,107862,why did quora mark my question as incomplete,why does quora detect my question as an incomp...,1,44,59,8,10,...,0.599994,0,1,9.0,2,0.355556,74,75,74,89
132566,132566,262554,91499,what is it like working with pivotal labs as a...,what is it like to work at pivotal labs,0,53,39,11,9,...,0.54545,0,1,10.0,2,0.4,76,85,78,84


In [34]:
final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2','is_duplicate'])
final_df.shape

(2000, 22)

In [35]:
final_df.head()

Unnamed: 0,q1_len,q2_len,q1_words,q2_words,common_words,words_count,word_share,cwc_min,cwc_max,csc_min,...,ctc_max,last_word_eq,first_word_eq,mean_len,abs_len_diff,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,fuzz_sort_ratio,token_set_ratio
339499,57,41,11,8,6,19,0.32,0.799984,0.666656,0.666644,...,0.54545,0,1,9.5,3,0.309524,39,46,67,85
289521,51,104,11,23,3,33,0.09,0.0,0.0,0.499992,...,0.136363,0,0,16.5,11,0.115385,26,50,35,37
4665,123,122,42,42,18,38,0.47,0.92307,0.92307,0.99998,...,0.70833,1,1,24.0,0,0.593496,100,99,100,99
54203,44,59,8,10,6,18,0.33,0.749981,0.599988,0.749981,...,0.599994,0,1,9.0,2,0.355556,74,75,74,89
132566,53,39,11,9,6,20,0.3,0.749981,0.599988,0.599988,...,0.54545,0,1,10.0,2,0.4,76,85,78,84


In [36]:
ques_df = new_df[['question1','question2']]

In [37]:
# from sklearn.feature_extraction.text import CountVectorizer
# questions = list(ques_df['question1']) + list(ques_df['question2'])
# cv = CountVectorizer(max_features = 3000)
# q1_arr,q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [38]:
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec

# # GloVe Wikipedia Embeddings 100
# glove_model = api.load("glove-wiki-gigaword-100")

# # GloVe Twitter Embeddings 100
# glove_model = api.load("glove-twitter-100")

# # Glove Wikipedia embeddings 200
# glove_model = api.load("glove-wiki-gigaword-200")

# GloVe Twitter Embeddings 200
glove_model = api.load("glove-twitter-200")

# print(api.info())

In [39]:
questions = list(ques_df['question1']) + list(ques_df['question2'])
def get_sentence_embedding(sentence):
    embedding = []
    for word in sentence.split():
        if word in glove_model:
            embedding.append(glove_model[word])
    if embedding:
        return np.mean(embedding, axis=0)
    else:
        return np.zeros_like(glove_model.vectors[0])

In [40]:
q1_arr,q2_arr = np.vsplit(np.array([get_sentence_embedding(question) for question in questions]),2)
temp_df1 = pd.DataFrame(q1_arr, index = ques_df.index) # data frame having embeddings of questions from question1
temp_df2 = pd.DataFrame(q2_arr, index = ques_df.index) # data frame having embeddings of questions from question2
temp_df = pd.concat([temp_df1,temp_df2], axis = 1)

In [41]:
temp_df.shape

(2000, 400)

In [42]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
339499,0.073014,0.083315,-0.006533,0.305487,-0.075426,-0.113423,0.189117,0.031728,-0.041542,-0.100417,...,-0.177078,0.022114,0.222336,-0.09725,-0.330516,-0.270414,0.187005,0.185632,-0.364013,-0.316517
289521,-0.106246,0.198895,-0.080023,0.227546,-0.292415,-0.013808,0.649326,0.076051,-0.038561,0.003441,...,-0.135197,0.046088,0.050507,-0.106823,0.094523,-0.151798,0.111102,0.096314,-0.088574,-0.010825
4665,0.157114,0.239691,0.099414,0.092251,-0.116887,0.10081,0.0946,0.028049,0.077479,0.144074,...,-0.149707,0.085078,-0.086425,0.046597,0.050264,-0.094777,-0.058099,0.018301,-0.036042,-0.041587
54203,0.329288,-0.01847,0.079673,0.01435,-0.142538,0.252554,0.463266,0.104258,0.03032,-0.189261,...,0.031404,-0.0029,0.067951,-0.058053,-0.009264,-0.229822,0.128677,0.079402,-0.064304,-0.313593
132566,0.260448,0.123872,-0.02497,0.227371,-0.175497,0.105724,0.320817,-0.232515,-0.200863,-0.284202,...,0.010289,0.033269,0.249364,-0.166511,-0.185929,-0.046684,0.339473,0.206904,-0.177414,0.34857


In [43]:
#ONLY EMBEDDINGS

# temp_df.to_csv('wiki_100.csv')
# temp_df.to_csv('twitter_100.csv')
# temp_df.to_csv('wiki_300.csv')
# temp_df.to_csv('twitter_200.csv')

In [44]:
# # new part --> EMBEDDINGS + isDuplicate
# newtemp_df = temp_df.copy()
# newtemp_df['is_duplicate'] = final_df['is_duplicate']

# newtemp_df.to_csv('new_wiki100.csv',index=False) 
# # newtemp_df.to_csv('new_twitter100.csv',index=False) 
# # newtemp_df.to_csv('new_wiki200.csv',index=False) 
# # newtemp_df.to_csv('new_twitter200.csv',index=False) 

In [45]:
# FEATURE SELECTION

In [46]:
# Ranksums test approach

In [47]:
from scipy.stats import ranksums

def fs_ranksums(fname,new_fname):
    data = np.genfromtxt(fname,delimiter=',')
    #initialise an array to store p-value
    p = np.zeros(np.shape(data)[1])

    in1 = np.where(data[:,-1] == 0)
    in2 = np.where(data[:,-1] == 1)

    for j in range(np.shape(data)[1]-1):
        p1 = ranksums(data[in1[0],j], data[in2[0],j])
        p[j] = p1.pvalue

    in1 = np.where(p<=0.05)
    md = (np.shape(data)[1]-1)/2
    md=int(md)
    in2=np.where(p[0:md]<=0.05)
    pos=len(in2[0])
    datan = data[:, in1[0]]
    np.savetxt(new_fname,datan,delimiter=',')
    return pos

In [48]:
# p1 = fs_ranksums('new_wiki100.csv','new_ranksums_wiki100.csv')
# # fs_ranksums('new_twitter100.csv','new_ranksums_twitter100.csv')
# # fs_ranksums('new_wiki200.csv','new_ranksums_wiki200.csv')
# # fs_ranksums('new_twitter200.csv','new_ranksums_twitter200.csv')


In [49]:
# print(p1)

In [50]:
# fs_ranksums('new_tfidf.csv','ranksums_tfidf.csv')

In [51]:
final_df = pd.concat([final_df,temp_df],axis=1)

In [52]:
final_df.shape

(2000, 422)

In [53]:
final_df['is_duplicate'] = new_df['is_duplicate']

In [54]:
final_df.head()

Unnamed: 0,q1_len,q2_len,q1_words,q2_words,common_words,words_count,word_share,cwc_min,cwc_max,csc_min,...,191,192,193,194,195,196,197,198,199,is_duplicate
339499,57,41,11,8,6,19,0.32,0.799984,0.666656,0.666644,...,0.022114,0.222336,-0.09725,-0.330516,-0.270414,0.187005,0.185632,-0.364013,-0.316517,1
289521,51,104,11,23,3,33,0.09,0.0,0.0,0.499992,...,0.046088,0.050507,-0.106823,0.094523,-0.151798,0.111102,0.096314,-0.088574,-0.010825,0
4665,123,122,42,42,18,38,0.47,0.92307,0.92307,0.99998,...,0.085078,-0.086425,0.046597,0.050264,-0.094777,-0.058099,0.018301,-0.036042,-0.041587,0
54203,44,59,8,10,6,18,0.33,0.749981,0.599988,0.749981,...,-0.0029,0.067951,-0.058053,-0.009264,-0.229822,0.128677,0.079402,-0.064304,-0.313593,1
132566,53,39,11,9,6,20,0.3,0.749981,0.599988,0.599988,...,0.033269,0.249364,-0.166511,-0.185929,-0.046684,0.339473,0.206904,-0.177414,0.34857,0


In [55]:
# NEW (taking features+glove embeddings and using ranksums) - 23/5/24

In [56]:
# final_df.to_csv('02_new_wiki100.csv',index=False)
# final_df.to_csv('02_new_wiki200.csv',index=False)
# final_df.to_csv('02_new_twitter100.csv',index=False)
final_df.to_csv('02_new_twitter200.csv',index=False)

In [58]:
# p11 = fs_ranksums('02_new_wiki100.csv','02_new_ranksums_wiki100.csv')
# print(p11);

# p12 = fs_ranksums('02_new_wiki200.csv','02_new_ranksums_wiki200.csv')
# print(p12);

# p13 = fs_ranksums('02_new_twitter100.csv','02_new_ranksums_twitter100.csv')
# print(p13);

# p14 = fs_ranksums('02_new_twitter200.csv','02_new_ranksums_twitter200.csv')
# print(p14);

p15 = fs_ranksums('02_new_tfidf.csv','02_new_ranksums_tfidf.csv')
print(p15);

24


In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(final_df.iloc[:,1:].values,final_df.iloc[:,0].values,test_size=0.2,random_state=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test,y_pred)