In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams

# Built once as a set so the `word not in eng_stopwords` filter used during
# tokenisation is a hash lookup instead of a scan over a list.
eng_stopwords = set(stopwords.words('english'))

### Import Data

In [11]:
# Read the raw train and test question-pair files, in that order.
df_train, df_test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

### Drop unneeded variables to set up stacking the data

In [12]:
# Strip the identifier/label columns in place so both frames are left with the
# same two question columns and can be stacked.
df_train.drop(['id', 'qid1', 'qid2', 'is_duplicate'], axis='columns', inplace=True)
df_test.drop(['test_id'], axis='columns', inplace=True)

### Stack train and test so that every feature is computed on both at the same time

In [14]:
# The keys become the outer index level, so each half can be pulled back out
# later by its 'train' / 'test' label.
all_data = pd.concat(
    objs=[df_train, df_test],
    keys=['train', 'test'],
)

In [16]:
# Peek at the first two stacked rows.
all_data.iloc[:2]

Unnamed: 0,Unnamed: 1,question1,question2
train,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
train,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...


In [17]:
# Peek at the last two stacked rows.
all_data.iloc[-2:]

Unnamed: 0,Unnamed: 1,question1,question2
test,2345794,What were the best and worst things about publ...,What are the best and worst things examination...
test,2345795,What is the best medication equation erectile ...,How do I out get rid of Erectile Dysfunction?


### Fill missing values with empty strings

In [26]:
# Replace every missing cell with an empty string, modifying the frame in place.
all_data.fillna(value="", inplace=True)

In [43]:
# The stacked frame is all that is needed from here on; unbind the originals.
del df_train
del df_test

### These functions extract unigrams so we can count how many words question 1 and question 2 have in common

Methods for the n-gram features are adapted from the following Kaggle kernel: https://www.kaggle.com/sudalairajkumar/quora-question-pairs/simple-exploration-notebook-quora-ques-pair

In [23]:
def get_unigrams(que):
    """Tokenise a question and return its non-stop-word tokens.

    The text is lower-cased before being passed to ``word_tokenize``; any
    token found in ``eng_stopwords`` is skipped. Order and duplicates of the
    remaining tokens are preserved.
    """
    kept = []
    for token in word_tokenize(que.lower()):
        if token in eng_stopwords:
            continue
        kept.append(token)
    return kept

def get_common_unigrams(row):
    """Count the distinct unigrams that occur in both questions of ``row``.

    ``row`` is indexed by ``"unigrams_ques1"`` and ``"unigrams_ques2"``.
    Both collections are converted to sets first, so a token repeated inside
    one question is counted once.
    """
    first = set(row["unigrams_ques1"])
    second = set(row["unigrams_ques2"])
    return len(first & second)

def get_common_unigram_ratio(row):
    """Return the shared-unigram count divided by the number of distinct unigrams.

    Reads ``"unigrams_common_count"`` (the numerator) plus the two unigram
    collections from ``row``. The result is always a float.
    """
    shared = float(row["unigrams_common_count"])
    distinct = set(row["unigrams_ques1"]) | set(row["unigrams_ques2"])
    # Clamp to 1 so a row with no tokens at all divides by one, not zero.
    denominator = max(len(distinct), 1)
    return shared / denominator

In [28]:
# Tokenise each question column into its list of non-stop-word unigrams.
all_data['unigrams_ques1'] = all_data['question1'].apply(get_unigrams)
all_data['unigrams_ques2'] = all_data['question2'].apply(get_unigrams)
# Row-wise overlap features. The helpers already take a row, so they are passed
# to apply directly; the ratio reads the count column created just before it.
all_data['unigrams_common_count'] = all_data.apply(get_common_unigrams, axis=1)
all_data["unigrams_common_ratio"] = all_data.apply(get_common_unigram_ratio, axis=1)

In [34]:
def get_bigrams(que):
    """Return the list of consecutive 2-item tuples that ``ngrams`` yields for ``que``.

    ``que`` is iterated as a sequence: a list of tokens gives word bigrams,
    whereas a plain string is walked character by character.
    """
    return list(ngrams(que, 2))

def get_common_bigrams(row):
    """Count the distinct bigrams that occur in both questions of ``row``.

    ``row`` is indexed by ``"bigrams_ques1"`` and ``"bigrams_ques2"``; each
    side is de-duplicated through a set before the overlap is measured.
    """
    first = set(row["bigrams_ques1"])
    second = set(row["bigrams_ques2"])
    return len(first & second)

def get_common_bigram_ratio(row):
    """Return the shared-bigram count divided by the number of distinct bigrams.

    Reads ``"bigrams_common_count"`` (the numerator) plus the two bigram
    collections from ``row``. The result is always a float.
    """
    shared = float(row["bigrams_common_count"])
    distinct = set(row["bigrams_ques1"]) | set(row["bigrams_ques2"])
    # Clamp to 1 so a row with no bigrams divides by one, not zero.
    denominator = max(len(distinct), 1)
    return shared / denominator

In [35]:
# Build the bigrams from the stop-word-filtered token lists, not from the raw
# question text: nltk's ngrams() walks whatever sequence it is handed, so a
# plain string would produce pairs of *characters* rather than pairs of words
# (and far larger intermediate lists).
all_data['bigrams_ques1'] = all_data['unigrams_ques1'].apply(get_bigrams)
all_data['bigrams_ques2'] = all_data['unigrams_ques2'].apply(get_bigrams)
# Row-wise overlap features. The helpers already take a row, so they are passed
# to apply directly; the ratio reads the count column created just before it.
all_data['bigrams_common_count'] = all_data.apply(get_common_bigrams, axis=1)
all_data["bigrams_common_ratio"] = all_data.apply(get_common_bigram_ratio, axis=1)

  


In [None]:
# Discard the per-row token / n-gram list columns in place; only the question
# text and the derived overlap features are kept.
all_data.drop(
    ['unigrams_ques1', 'unigrams_ques2', 'bigrams_ques1', 'bigrams_ques2'],
    axis='columns',
    inplace=True,
)

In [41]:
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; .loc does the
# same label-based selection of the outer 'train' / 'test' index level.
# index=False: the row index is not written to the CSV files.
all_data.loc['train'].to_csv('../processing/train_grams.csv', index=False)
all_data.loc['test'].to_csv('../processing/test_grams.csv', index=False)

#### Save the intermediate features and restart the kernel here: the session ran out of memory

In [2]:
# Reload the intermediate feature files written before the restart.
train, test = map(pd.read_csv, ('../processing/train_grams.csv', '../processing/test_grams.csv'))

In [3]:
# Re-stack the reloaded halves under 'train' / 'test' outer index labels.
all_data = pd.concat(
    objs=[train, test],
    keys=['train', 'test'],
)

In [4]:
# Only the stacked frame is used from here on; unbind the two halves.
del train
del test

In [5]:
def get_trigrams(que):
    """Return the list of consecutive 3-item tuples that ``ngrams`` yields for ``que``.

    ``que`` is iterated as a sequence: a list of tokens gives word trigrams,
    whereas a plain string is walked character by character.
    """
    return list(ngrams(que, 3))

def get_common_trigrams(row):
    """Count the distinct trigrams that occur in both questions of ``row``.

    ``row`` is indexed by ``"trigrams_ques1"`` and ``"trigrams_ques2"``; each
    side is de-duplicated through a set before the overlap is measured.
    """
    first = set(row["trigrams_ques1"])
    second = set(row["trigrams_ques2"])
    return len(first & second)

def get_common_trigram_ratio(row):
    """Return the shared-trigram count divided by the number of distinct trigrams.

    Reads ``"trigrams_common_count"`` (the numerator) plus the two trigram
    collections from ``row``. The result is always a float.
    """
    shared = float(row["trigrams_common_count"])
    distinct = set(row["trigrams_ques1"]) | set(row["trigrams_ques2"])
    # Clamp to 1 so a row with no trigrams divides by one, not zero.
    denominator = max(len(distinct), 1)
    return shared / denominator

In [7]:
def _word_trigrams(text):
    """Tokenise one question (lower-cased, English stop words removed) and return its word trigrams."""
    tokens = [tok for tok in word_tokenize(text.lower()) if tok not in eng_stopwords]
    return get_trigrams(tokens)

# Two fixes relative to calling get_trigrams(str(x)) on the raw column:
#  * Blank questions come back from the CSV as NaN, and str(NaN) is the literal
#    text "nan", so two blank questions would appear to share a trigram.
#    Blank them out again before tokenising instead.
#  * ngrams() walks whatever sequence it is handed, so a raw string yields
#    character triples. Tokenise first so the features are word trigrams.
all_data['trigrams_ques1'] = all_data['question1'].fillna("").astype(str).apply(_word_trigrams)
all_data['trigrams_ques2'] = all_data['question2'].fillna("").astype(str).apply(_word_trigrams)
# Row-wise overlap features; the ratio reads the count column created just before it.
all_data['trigrams_common_count'] = all_data.apply(get_common_trigrams, axis=1)
all_data["trigrams_common_ratio"] = all_data.apply(get_common_trigram_ratio, axis=1)

  


In [8]:
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; .loc does the
# same label-based selection of the outer 'train' / 'test' index level.
# index=False: the row index is not written to the CSV files.
all_data.loc['train'].to_csv('../processing/train_grams.csv', index=False)
all_data.loc['test'].to_csv('../processing/test_grams.csv', index=False)