In [1]:
import sklearn
from sklearn import svm, preprocessing, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel 
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.learning_curve import learning_curve
import random
import numpy as np
import gensim
import csv
import numpy as np
import pandas as pd
import timeit
import nltk
import itertools
import glove
from collections import defaultdict
import pandas as pd
from nltk.stem import *
from nltk import word_tokenize, ngrams
import re
from gensim import corpora
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#### Loading Stopwords

In [2]:
# Fetch the NLTK stopword corpus (no-op when already present) and keep the English
# stopwords as a set so the membership tests in the feature code are O(1).
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pascalsitbon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Loading Training Set

In [3]:
# Read the training pairs and drop every row that has a missing value in any column.
Training_Set = pd.read_csv('train.csv').dropna()

In [4]:
# Sanity check: (rows, columns) of the training frame after dropna().
print(Training_Set.shape)

(404288, 6)


In [5]:
# Show the first question of the row at position 10 as an example of the raw text.
print(Training_Set.iloc[10,:]['question1'])

Method to find separation of slits using fresnel biprism?


In [6]:
# Keep only the two question columns and the label, as a plain NumPy array
# (column 0: question1, column 1: question2, column 2: is_duplicate).
clean_data = Training_Set[['question1','question2','is_duplicate']].values

In [7]:
# Split the array into the question pairs (two columns) and the duplicate labels.
sentences_train = clean_data[:,:2]
labels = clean_data[:,2]

In [8]:
# Eyeball the first ten question pairs together with their labels.
print('sample of data','\n',sentences_train[:10],labels[:10])

sample of data 
 [['What is the step by step guide to invest in share market in india?'
  'What is the step by step guide to invest in share market?']
 ['What is the story of Kohinoor (Koh-i-Noor) Diamond?'
  'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']
 [ 'How can I increase the speed of my internet connection while using a VPN?'
  'How can Internet speed be increased by hacking through DNS?']
 ['Why am I mentally very lonely? How can I solve it?'
  'Find the remainder when [math]23^{24}[/math] is divided by 24,23?']
 [ 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?'
  'Which fish would survive in salt water?']
 [ 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'
  "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]
 ['Should I buy tiago?'
  'What keeps childern active and far from phone and video games?']
 ['How can I be a good

In [211]:
# Number of leading training pairs used to build every feature block below
# (Word2Vec corpus, n-gram features, hash/frequency features and labels).
size_train = 400000

## Word2vec featuring

In [212]:
def preprocess(sentence_train, stopwords=None, stemmer=None):
    """Tokenise, stopword-filter and stem every question of every pair.

    Parameters
    ----------
    sentence_train : 2-D array of str
        One row per question pair; column 0 is the first question and column 1
        the second one.
    stopwords : container of str, optional
        Tokens to discard. Defaults to the notebook-level `stpwds` set.
    stemmer : object with a ``stem(token)`` method, optional
        Defaults to a fresh ``PorterStemmer``.

    Returns
    -------
    list of list of str
        A flat list with two entries per input row: index ``2*k`` holds the
        stemmed tokens of row k's first question and ``2*k + 1`` those of its
        second question.
    """
    if stopwords is None:
        stopwords = stpwds
    if stemmer is None:
        stemmer = PorterStemmer()

    sentences = []
    for row in range(sentence_train.shape[0]):
        # Read from the argument, not from the global `sentences_train`, so the
        # function works on any array of pairs (e.g. a slice or the test set).
        for col in (0, 1):
            tokens = sentence_train[row, col].lower().split(" ")
            sentences.append([stemmer.stem(token)
                              for token in tokens
                              if token not in stopwords])
    return sentences

In [213]:
# Stemmed token lists for the first `size_train` pairs, interleaved as
# [pair0_q1, pair0_q2, pair1_q1, pair1_q2, ...].
sentences = preprocess(sentences_train[:size_train])

In [214]:
# Fit 100-dimensional Word2Vec embeddings on the stemmed token lists. min_count=1 keeps
# every token in the vocabulary, so the model[token] lookups in the feature code can
# resolve any token that appears in `sentences`.
model = gensim.models.Word2Vec(sentences, min_count=1,size=100,workers=4)
# NOTE(review): the model is built with the corpus and train() is then called another
# 100 times, each for model.iter epochs. Per the gensim docs the constructor already
# trains when it is given sentences - confirm this many extra passes is intended.
for i in range(100):
    model.train(sentences,total_examples=model.corpus_count,epochs=model.iter)

2017-05-15 22:41:13,536 : INFO : collecting all words and their counts
2017-05-15 22:41:13,537 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-15 22:41:13,572 : INFO : PROGRESS: at sentence #10000, processed 57070 words, keeping 13175 word types
2017-05-15 22:41:13,616 : INFO : PROGRESS: at sentence #20000, processed 113800 words, keeping 20410 word types
2017-05-15 22:41:13,672 : INFO : PROGRESS: at sentence #30000, processed 170684 words, keeping 26246 word types
2017-05-15 22:41:13,702 : INFO : PROGRESS: at sentence #40000, processed 227355 words, keeping 31320 word types
2017-05-15 22:41:13,728 : INFO : PROGRESS: at sentence #50000, processed 284506 words, keeping 36058 word types
2017-05-15 22:41:13,758 : INFO : PROGRESS: at sentence #60000, processed 341143 words, keeping 40245 word types
2017-05-15 22:41:13,790 : INFO : PROGRESS: at sentence #70000, processed 397752 words, keeping 44200 word types
2017-05-15 22:41:13,818 : INFO : PROGRESS: at s

In [215]:
def word2vec_features(model,sentences: np.array):
    """Build three embedding-distance features for each consecutive sentence pair.

    sentences : [[token1,...,tokend],..,[token1,...,tokend]] - a flat sequence of
        token lists in which entries 2*k and 2*k+1 form the k-th pair. A trailing
        unpaired entry is ignored.
    model : trained Word2Vec model; only ``model[token]`` lookups are used.

    Returns an array with one row per pair and three columns:
      0. Euclidean distance between the mean vectors of the two token sets
         (100 when either set is empty);
      1. smallest Euclidean distance between a token found only in the first
         sentence and a token found only in the second (100 when no such pair exists);
      2. largest such distance (0 when no such pair exists).
    """
    centroid_col = []
    closest_col = []
    farthest_col = []

    for pair_idx in range(len(sentences) // 2):
        first_tokens = set(sentences[2 * pair_idx])
        second_tokens = set(sentences[2 * pair_idx + 1])

        # Compare only the tokens that are exclusive to one side of the pair.
        closest, farthest = 100, 0
        exclusive_pairs = itertools.product(first_tokens - second_tokens,
                                            second_tokens - first_tokens)
        for tok_a, tok_b in exclusive_pairs:
            gap = np.linalg.norm(model[tok_a] - model[tok_b])
            if gap <= closest:
                closest = gap
            if gap >= farthest:
                farthest = gap

        if first_tokens and second_tokens:
            mean_first = np.sum([model[tok] for tok in first_tokens], axis=0) / len(first_tokens)
            mean_second = np.sum([model[tok] for tok in second_tokens], axis=0) / len(second_tokens)
            centroid_gap = np.linalg.norm(mean_first - mean_second)
        else:
            # No centroid exists for an empty sentence; fall back to the sentinel.
            centroid_gap = 100

        centroid_col.append(centroid_gap)
        closest_col.append(closest)
        farthest_col.append(farthest)

    return np.array([centroid_col, closest_col, farthest_col]).T

In [216]:
# Word2Vec distance features, one row per pair in `sentences`.
# NOTE(review): this assignment rebinds the name `word2vec_features` from the function to
# its result array, so the function can no longer be called (and this cell cannot be
# re-run) until the defining cell is executed again.
word2vec_features = word2vec_features(model,sentences)

In [217]:
# Per-column mean of the Word2Vec features. A vectorised reduction replaces the
# Python-level sum() over every row of the array.
print(word2vec_features.mean(axis=0))

[  5.45287886  22.44971824  18.08345711]


In [218]:
# First ten feature rows next to their labels, for a quick visual check.
print(word2vec_features[:10,:],labels[:10])

[[  2.93500876  14.46680641  15.6884346 ]
 [  6.15852451  11.59368896  22.61016655]
 [  4.41799212   9.71231079  18.93532753]
 [ 11.52210903  10.28302288  28.35285568]
 [  7.43974447  10.41463375  21.26323891]
 [  3.57115579   1.57695639  23.0575695 ]
 [  9.53270149   2.34914875  22.75592613]
 [  5.01376677  10.02753448  10.02753448]
 [  1.77693188   3.18630266   4.37867069]
 [  5.70936871   1.92508137  23.66041946]] [0 0 0 0 0 1 0 1 0 0]


## Doc2Vec Featuring

In [219]:
# def doc2vecs_features(sentences_train,nb_epochs=100,alpha=0.025,min_alpha=0.025):
#     sentences = []
#     stemmer = PorterStemmer()
#     for i in range(sentences_train.shape[0]): 
#         source_sentence = sentences_train[i,0].lower().split(" ") 
#         source_sentence = [token for token in source_sentence if token not in stpwds]
#         unigrams_que1 = [stemmer.stem(token) for token in source_sentence]
#         sentences.append(unigrams_que1)

#         target_sentence = sentences_train[i,1].lower().split(" ") 
#         target_sentence= [token for token in target_sentence if token not in stpwds]
#         unigrams_que2 = [stemmer.stem(token) for token in target_sentence]
#         sentences.append(unigrams_que2)

#     texts=sentences.copy()
#     documents = []
#     ct = 0
#     for doc in texts:
#         doc = gensim.models.doc2vec.LabeledSentence(words = doc, tags = ['SENT_'+str(ct)])
#         ct+=1
#         documents.append(doc)
#     model = gensim.models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1,workers=4)
#     model.build_vocab(documents)

#     for epoch in range(100):
#         model.train(documents,total_examples=model.corpus_count,epochs=model.iter)
        
#     most_similar_is_duo_1_2 = []
#     most_similar_is_duo_2_1 = []
    
#     most_similar_score_if_duo_1_2 = []
#     most_similar_score_if_duo_2_1 = []
    
#     n_similarities = []
    
#     for i in range(sentences_train.shape[0]):
        
#         most_sim_1 = model.docvecs.most_similar(["SENT_"+str(2*i)])[0]
#         most_sim_2 = model.docvecs.most_similar(["SENT_"+str(2*i+1)])[0]
        
#         most_similar_is_duo_1_2.append(int(most_sim_1[0] == "SENT_"+str(2*i+1)))
#         most_similar_is_duo_2_1.append(int(most_sim_2[0] == "SENT_"+str(2*i)))
        
#         most_similar_score_if_duo_1_2.append(int(most_sim_1[0] == "SENT_"+str(2*i+1))*most_sim_1[1])
#         most_similar_score_if_duo_2_1.append(int(most_sim_2[0] == "SENT_"+str(2*i))*most_sim_2[1])
        
#         n_similarities.append(model.n_similarity(sentences[2*i], sentences[2*i+1]))
                                             
#         doc_2_vec_features = np.array([n_similarities,
#                                        most_similar_score_if_duo_1_2,
#                                        most_similar_score_if_duo_2_1,
#                                        most_similar_is_duo_1_2,
#                                        most_similar_is_duo_2_1])
                                            
#     return doc_2_vec_features
    

## Frequency and Hash Tag ID

In [221]:
# # Frequency of questions and hash.
# train_orig =  pd.read_csv('train.csv', header=0)
# test_orig =  pd.read_csv('test.csv', header=0)

# tic0=timeit.default_timer()
# df1 = train_orig[['question1']].copy()
# df2 = train_orig[['question2']].copy()
# df1_test = test_orig[['question1']].copy()
# df2_test = test_orig[['question2']].copy()

# df2.rename(columns = {'question2':'question1'},inplace=True)
# df2_test.rename(columns = {'question2':'question1'},inplace=True)

# train_questions = df1.append(df2)
# train_questions = train_questions.append(df1_test)
# train_questions = train_questions.append(df2_test)
# #train_questions.drop_duplicates(subset = ['qid1'],inplace=True)
# train_questions.drop_duplicates(subset = ['question1'],inplace=True)

# train_questions.reset_index(inplace=True,drop=True)
# questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()
# train_cp = train_orig.copy()
# test_cp = test_orig.copy()
# train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

# test_cp['is_duplicate'] = -1
# test_cp.rename(columns={'test_id':'id'},inplace=True)
# comb = pd.concat([train_cp,test_cp])

# comb['q1_hash'] = comb['question1'].map(questions_dict)
# comb['q2_hash'] = comb['question2'].map(questions_dict)

# q1_vc = comb.q1_hash.value_counts().to_dict()
# q2_vc = comb.q2_hash.value_counts().to_dict()

# def try_apply_dict(x,dict_to_apply):
#     try:
#         return dict_to_apply[x]
#     except KeyError:
#         return 0
# #map to frequency space
# comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
# comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))

# train_comb = comb[comb['is_duplicate'] >= 0][['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate']]
# test_comb = comb[comb['is_duplicate'] < 0][['id','q1_hash','q2_hash','q1_freq','q2_freq']]

In [222]:
def _stem_question(question, stemmer):
    """Lower-case `question`, split it on single spaces, drop stopwords and stem the rest."""
    return [stemmer.stem(token)
            for token in question.lower().split(" ")
            if token not in stpwds]


def _ngram_overlap(tokens_a, tokens_b, n):
    """Return (shared count, shared/union ratio) for the distinct n-grams of two token lists."""
    grams_a = set(ngrams(tokens_a, n))
    grams_b = set(ngrams(tokens_b, n))
    shared = len(grams_a & grams_b)
    # max(..., 1) keeps the ratio at 0.0 when neither side has any n-gram.
    return shared, float(shared) / max(len(grams_a | grams_b), 1)


def select_features(data_set,nb_ex):
    """Compute n-gram overlap features for the first `nb_ex` question pairs.

    Parameters
    ----------
    data_set : 2-D array of str
        Column 0 holds the first question of each pair, column 1 the second.
    nb_ex : int
        Number of leading rows of `data_set` to process.

    Returns
    -------
    numpy.ndarray of shape (nb_ex, 7)
        Columns, in order: shared unigram count, shared unigram ratio,
        shared bigram count, shared bigram ratio, shared trigram count,
        shared trigram ratio, absolute difference in token count. Tokens are
        lower-cased, stopword-filtered and Porter-stemmed before comparison;
        each ratio is shared / union of the distinct n-grams.

    Prints the row index every 10000 rows as a progress indicator.
    """
    stemmer = PorterStemmer()
    dif_len = []
    common_unigrams_lens = []
    common_unigrams_ratios = []
    common_bigrams_lens = []
    common_bigrams_ratios = []
    common_trigrams_lens = []
    common_trigrams_ratios = []
    # (n, counts, ratios) triples so the three n-gram orders share one code path.
    overlap_columns = [(1, common_unigrams_lens, common_unigrams_ratios),
                       (2, common_bigrams_lens, common_bigrams_ratios),
                       (3, common_trigrams_lens, common_trigrams_ratios)]

    for i in range(nb_ex):
        if i%10000== 0:
            print(i)
        unigrams_que1 = _stem_question(data_set[i,0], stemmer)
        unigrams_que2 = _stem_question(data_set[i,1], stemmer)

        for n, lens, ratios in overlap_columns:
            shared, ratio = _ngram_overlap(unigrams_que1, unigrams_que2, n)
            lens.append(shared)
            ratios.append(ratio)

        # Stemming maps tokens one-to-one, so these lengths equal the
        # stopword-filtered token counts.
        dif_len.append(abs(len(unigrams_que1) - len(unigrams_que2)))

    features = np.array([common_unigrams_lens,
                         common_unigrams_ratios,
                         common_bigrams_lens,
                         common_bigrams_ratios,
                         common_trigrams_lens,
                         common_trigrams_ratios,
                         dif_len]).T

    return features

In [223]:
# N-gram overlap features and float labels for the first `size_train` pairs.
n_grams_features = select_features(sentences_train, size_train)
y = labels[:size_train].astype(float)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000


In [224]:
# Expect one row per pair and seven n-gram feature columns.
print(n_grams_features.shape)

(400000, 7)


In [225]:
# Question-hash and question-frequency columns for the first `size_train` rows.
# NOTE(review): `train_comb` is only assigned in the commented-out "Frequency and Hash Tag ID"
# cell, so this cell fails on a fresh kernel. That cell also reads train.csv without dropna(),
# whereas Training_Set drops rows - confirm row i here still matches row i of sentences_train.
new_features = train_comb[['q1_hash','q2_hash','q1_freq','q2_freq']].iloc[:size_train].values

In [226]:
# Expect one row per pair and four hash/frequency columns.
print(new_features.shape)

(400000, 4)


In [233]:
# Assemble the design matrix from the n-gram, hash/frequency and Word2Vec feature blocks
# (7 + 4 + 3 = 14 columns), then standardise it with preprocessing.scale.
# NOTE(review): scaling uses all rows, i.e. it happens before the train/test split below -
# confirm that sharing these statistics with the held-out rows is acceptable.
X = np.column_stack([n_grams_features,new_features,word2vec_features])
X = preprocessing.scale(X)



In [228]:
# Sanity check: rows should equal size_train and columns the sum of the feature blocks.
print(X.shape)

(400000, 14)


In [229]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)


In [None]:
# Depth-limited random forest evaluated with log-loss on both splits (lower is better).
# random_state is pinned, like the train/test split, so the reported losses can be
# reproduced on a re-run instead of changing with every fit.
clf = RandomForestClassifier(n_estimators=50,max_depth = 9,max_features=6,random_state=0)
clf.fit(X_train,y_train)
print('Training Score:',sklearn.metrics.log_loss(y_train,clf.predict_proba(X_train)[:,1]))
print('Testing Score:',sklearn.metrics.log_loss(y_test,clf.predict_proba(X_test)[:,1]))

In [186]:
# Refit the classifier on every available row before predicting on the test set.
# NOTE(review): the recorded output of this cell shows a LogisticRegression repr, while the
# cell above builds a RandomForestClassifier - the saved output comes from a different run.
clf.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
# Load the test pairs and keep the two question columns as a NumPy array.
# Unlike the training frame, no dropna() is applied here.
Testing_Set = pd.read_csv('test.csv')
print(Testing_Set.shape)
clean_data_test = Testing_Set[['question1','question2']].values


(2345796, 3)


In [None]:
# Feature blocks for the test pairs.
# NOTE(review): `select_basic_features_test` is not defined in any visible cell, and
# `test_comb` is only assigned in the commented-out frequency/hash cell - both must already
# exist in the kernel for this cell to run.
basic_features = select_basic_features_test(clean_data_test)
new_features_test = test_comb[['q1_hash','q2_hash','q1_freq','q2_freq']].values

In [None]:
# Build the submission matrix, scale it and store the predicted duplicate probability
# (second predict_proba column) for every test pair.
# NOTE(review): only two feature blocks are stacked here, whereas the training matrix X also
# contains word2vec_features (14 columns per the shape printed earlier) - confirm that clf
# accepts a matrix of this width.
# NOTE(review): preprocessing.scale is applied to the test matrix on its own, so it is not
# scaled with the statistics used for the training matrix - verify this is intended.
print(basic_features.shape,new_features_test.shape)
X_submission = np.column_stack([basic_features,new_features_test])
X_submission = preprocessing.scale(X_submission)
y_submission = clf.predict_proba(X_submission)[:,1]
Testing_Set['is_duplicate'] = y_submission

In [None]:
# Keep only the two columns written to the submission file.
submission = Testing_Set[['test_id','is_duplicate']]

In [None]:
# Inspect the predictions before writing them to disk.
print(submission)

In [None]:
# Write the predictions to disk. to_csv() returns None when given a path, so its result
# must not be assigned back to `submission`: doing so discarded the frame and made this
# cell fail on a second run.
# NOTE(review): the destination is an absolute, user-specific path - consider a
# configurable output directory.
submission.to_csv('/Users/pascalsitbon/work/Kaggle/pred.csv',index=False)