In [1]:
import numpy as np
import pandas as pd
import codecs
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from igraph import Graph
import pandas as pd

In [2]:
from gensim.models.word2vec import Word2Vec



In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, load_model, Sequential
from keras import backend as K
from keras.layers import Input, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.layers import GlobalAveragePooling1D, Lambda
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using Theano backend.


In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from nltk import word_tokenize, pos_tag
from nltk import ngrams

In [4]:
# fix random seed for reproducibility
np.random.seed(7)

In [5]:
def _load_pairs(path, texts, labelled):
    """Read one question-pair CSV and register its question texts.

    Each row is split on commas and read positionally: fields 1 and 2 are the
    two question ids, fields 3 and 4 the matching question texts and, when
    ``labelled`` is true, field 5 is the integer label.

    NOTE(review): the split is a plain ``str.split(',')``, so a quoted field
    that itself contains a comma is not supported -- confirm the data never
    contains one.

    Parameters
    ----------
    path : path of the CSV file to read (decoded as UTF-8).
    texts : dict mapping question id -> question text; updated in place, the
        first text seen for an id wins.
    labelled : whether rows carry a label in field 5.

    Returns
    -------
    (pairs, labels) : list of [id1, id2] lists and list of int labels
        (empty when ``labelled`` is false).
    """
    pairs, labels = [], []
    with codecs.open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            # Strip only the line terminator; a blind [:-1] would eat a real
            # character when the last row has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines instead of raising IndexError
            fields = line.split(',')
            id1, id2 = fields[1], fields[2]
            texts.setdefault(id1, fields[3])
            texts.setdefault(id2, fields[4])
            pairs.append([id1, id2])
            if labelled:
                labels.append(int(fields[5]))
    return pairs, labels


texts = {}
pairs_train, y_train = _load_pairs('train.csv', texts, labelled=True)
pairs_test = _load_pairs('test.csv', texts, labelled=False)[0]

In [6]:
len(texts)  #type(texts)

58940

### Preprocess data: clean data ...

In [7]:
docs = texts.values()
# prepare tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
print docs[0]
print encoded_docs[0]
#tokenizer.word_index

# pad documents to the same length
max_size = max([len(t) for t in encoded_docs]) # maximum document size allowed
padded_docs = pad_sequences(encoded_docs, maxlen=max_size, padding='post')
print (max_size, 'max_size')
print padded_docs[0]

"Getting Started on Quora: What is Quora?"
[315, 643, 19, 38, 2, 3, 38]
(73, 'max_size')
[315 643  19  38   2   3  38   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]


In [8]:
max([len(t) for t in docs])

337

In [9]:
max([len(t) for t in encoded_docs])

73

In [10]:
index_to_word = dict((v,k) for k, v in tokenizer.word_index.items())
# stpwds = [index_to_word[idx] for idx in range(1,stpwd_thd)]
# print('stopwords are:',stpwds)
x_full_words = [[index_to_word[idx] for idx in rev if idx!=0] for rev in encoded_docs]
all_words = [word for rev in x_full_words for word in rev]
print x_full_words[0]
print (len(all_words),'words')
print (len(list(set(all_words))),'unique words')
print (len(tokenizer.word_index), 'tokenizer.word_index')

[u'getting', u'started', u'on', u'quora', u'what', u'is', u'quora']
(599985, 'words')
(20353, 'unique words')
(20353, 'tokenizer.word_index')


In [11]:
wordsFreq = Counter(all_words)   #print (wordsFreq)    #wordsFreq['the']
len(wordsFreq)

20353

In [12]:
# Map each question id to its position in the iteration order of `texts`.
# docs / encoded_docs / padded_docs were built from texts.values(), so this
# position is the row of that question in padded_docs, e.g.
# padded_docs[texts_token[ID]].
texts_token = {ID: position for position, ID in enumerate(texts)}

In [13]:
#Word2Vec embedding
#Use pre-trained
word_vector_dim = int(3e2)
print (word_vector_dim, 'word_vector_dim')
word_vectors = Word2Vec(size=word_vector_dim, min_count=1)
#create entries for the words in our vocabulary
word_vectors.build_vocab(x_full_words)
#sanity check
##assert(len(list(set(all_words))) == len(word_vectors.wv.vocab))
word_vectors.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

(300, 'word_vector_dim')


In [14]:
type(word_vectors)

gensim.models.word2vec.Word2Vec

import json
json.dump(word_vectors, open("word_vectors.txt",'w'))
#word_vectors = json.load(open("word_vectors.txt"))

In [15]:
# Words whose vector norm is (almost) zero after intersect_word2vec_format
# are reported as having no pre-trained entry.
# Materialise the vocabulary once: rebuilding list(word_vectors.wv.vocab)
# for every selected index made the original lookup quadratic.
vocab_words = list(word_vectors.wv.vocab)  # in Python 2.7: word_vectors.wv.vocab.keys()
norms = [np.linalg.norm(word_vectors[word]) for word in vocab_words]
idxs_zero_norms = [idx for idx, norm in enumerate(norms) if norm<0.05]
no_entry_words = [vocab_words[idx] for idx in idxs_zero_norms]
print('# of vocab words w/o a Google News entry:',len(no_entry_words))

('# of vocab words w/o a Google News entry:', 5351)


In [16]:
print (len(list(set(all_words))),'unique words')
print (len(word_vectors.wv.vocab), 'word_vectors.wv.vocab')
print (len(tokenizer.word_index), 'tokenizer.word_index')

(20353, 'unique words')
(20353, 'word_vectors.wv.vocab')
(20353, 'tokenizer.word_index')


In [17]:
# create numpy array of embeddings 
max_features = len(tokenizer.word_index)+1  # nb of unique words
print (max_features, 'max_features')
embeddings = np.zeros((max_features, word_vector_dim))
for word, idx in tokenizer.word_index.items():
    embeddings[idx,] = word_vectors[word]
print('embeddings created')

(20354, 'max_features')
embeddings created


In [20]:
# Output embedding_matrix
np.save('embedding_matrix.npy', embeddings)
check = np.load('embedding_matrix.npy')
check == embeddings

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)

In [13]:
#---load embedding_matrix---#
embeddings = np.load('embedding_matrix.npy')

### Prepare training and test data

In [14]:
def _padded_rows(pairs, side):
    """Return the padded token rows for one side of every question pair.

    Parameters
    ----------
    pairs : list of [id1, id2] question-id pairs.
    side : 0 for the first question of each pair, 1 for the second.

    Returns
    -------
    2-D array with one row of padded_docs per pair, in pair order.
    """
    # One indexed lookup replaces the original `X = X + [row]` loop, which
    # copied the whole list on every iteration (quadratic in the pair count).
    row_idx = np.array([texts_token[pair[side]] for pair in pairs], dtype=int)
    return padded_docs[row_idx]


N_train = len(pairs_train)  ####padded_docs[texts_token[ID]]
X_train1 = _padded_rows(pairs_train, 0)
X_train2 = _padded_rows(pairs_train, 1)

N_test = len(pairs_test)
X_test1 = _padded_rows(pairs_test, 0)
X_test2 = _padded_rows(pairs_test, 1)

In [15]:
print X_train1.shape, X_train2.shape, max_size
print X_test1.shape, X_test2.shape

(80100, 73) (80100, 73) 73
(20179, 73) (20179, 73)


In [16]:
np.array([X_test1, X_test2]).shape

(2, 20179, 73)

### Basic features

In [187]:
# Rebuild every question as a string of its tokenizer words, each word
# followed by a single space (the trailing space of the original loop is kept).
# A generator expression avoids rebinding the builtin `id` at notebook scope
# and the repeated string concatenation of the original loop.
texts_rebuilt = dict(
    (text_id, ''.join(token + ' ' for token in x_full_words[texts_token[text_id]]))
    for text_id in texts
)

In [188]:
# Row index of each unique text in the TF-IDF matrix. It follows the
# iteration order of texts_rebuilt, the same order fit_transform sees below.
ids2ind = dict((qid, row) for row, qid in enumerate(texts_rebuilt))

vec = TfidfVectorizer()
A = vec.fit_transform(texts_rebuilt.values())


def _basic_features(pairs):
    """Compute the three basic features for every question pair.

    Columns of the returned (len(pairs), 3) array:
      0 -- TF-IDF cosine similarity of the two questions,
      1 -- sum of their word counts,
      2 -- absolute difference of their word counts.
    """
    feats = np.zeros((len(pairs), 3))
    for i, (q1, q2) in enumerate(pairs):
        # cosine_similarity returns a 1x1 matrix; take the scalar explicitly
        # rather than relying on implicit array-to-scalar conversion.
        feats[i, 0] = cosine_similarity(A[ids2ind[q1], :], A[ids2ind[q2], :])[0, 0]
        n_words1 = len(texts_rebuilt[q1].split())
        n_words2 = len(texts_rebuilt[q2].split())
        feats[i, 1] = n_words1 + n_words2
        feats[i, 2] = abs(n_words1 - n_words2)
    return feats


basicF_train = _basic_features(pairs_train)
basicF_test = _basic_features(pairs_test)

In [225]:
basicF_train

array([[  0.58723412,  22.        ,   6.        ],
       [  0.44438048,  21.        ,   3.        ],
       [  0.47480977,  26.        ,   6.        ],
       ..., 
       [  0.62334832,  13.        ,   1.        ],
       [  0.71035487,  38.        ,   2.        ],
       [  0.6320477 ,  18.        ,   0.        ]])

In [189]:
# Write the basic features to CSV, one pair per row: train file first,
# then test file, both with the same header line.
for out_path, feats, n_rows in (("basic_features_train.csv", basicF_train, N_train),
                                ("basic_features_test.csv", basicF_test, N_test)):
    with open(out_path, 'w') as f:
        f.write("tfidfCOS, lenSUM, lenDIFF\n")
        for i in range(n_rows):
            f.write(','.join(str(feats[i, col]) for col in range(3)) + '\n')

In [17]:
#---load basic features---#
basicF_train = pd.read_csv('basic_features_train.csv')
basicF_train = basicF_train.values
basicF_test = pd.read_csv('basic_features_test.csv')
basicF_test = basicF_test.values
print basicF_train.shape, basicF_test.shape

(80100, 3) (20179, 3)


### Magic feature and Graph feature

In [29]:
import operator
import itertools
import networkx as nx

In [30]:
def terms2graph(terms, w, vocab, d):
    '''Build the graph-of-words of one token sequence and its walk matrix.

    The graph is undirected and unweighted. Every term is linked to the
    ``w - 1`` terms that precede it (sliding window of size ``w``); the first
    ``w`` terms are fully connected to each other. Self-edges are skipped.

    Parameters
    ----------
    terms : sequence of token ids; falsy entries (the 0 padding) are dropped.
    w : sliding-window size.
    vocab : dict mapping token id -> vertex index. The indices address the
        rows/columns of the returned matrix, so they are expected to be
        0 .. len(vocab) - 1.
    d : maximum shortest-path length that contributes to the walk matrix.

    Returns
    -------
    (g, AA, norm)
        g    : networkx.Graph with one vertex per vocab entry.
        AA   : (len(vocab), len(vocab)) array with AA[i, j] = 1 / l, where l is
               the shortest-path length between distinct vertices i and j,
               when l <= d; 0 otherwise.
        norm : Frobenius norm of AA + D, where D is the diagonal 0/1 matrix
               marking the vertices that occur in ``terms``.
    '''
    #(1) Terms to graph
    terms = [term for term in terms if term]   #remove the padding zero

    v_nb = len(vocab)
    D = np.zeros((v_nb,v_nb))
    for term in terms:
        D[vocab[term],vocab[term]] = 1

    edges = []

    # Initial complete graph over the first w terms. Combine over the terms
    # actually present: range(w) raised IndexError when 2 <= len(terms) < w.
    head = terms[0:w]
    for first, second in itertools.combinations(range(len(head)), r=2):
        ii = vocab[head[first]]    # index of vertice
        jj = vocab[head[second]]
        if ii != jj:
            edges.append((ii,jj))

    # Then link every remaining term to the w-1 terms before it.
    for i in range(w, len(terms)):
        considered_term = terms[i]
        window = terms[(i-w+1):(i+1)]   # all terms within the sliding window
        for p in range(w-1):
            if window[p] != considered_term:   # skip self-edges
                edges.append((vocab[window[p]], vocab[considered_term]))

    g = nx.Graph()
    g.add_nodes_from(sorted(vocab.values()))
    g.add_edges_from(edges)

    #(2) Convert the graph using shortest_path_length
    AA = np.zeros((v_nb,v_nb))
    # dict() accepts both the dict-of-dicts returned by old NetworkX releases
    # and the (node, lengths) iterator returned by newer ones.
    path = dict(nx.shortest_path_length(g))
    for v1, lengths in path.items():
        for v2, l in lengths.items():
            # l == 0 only for v1 itself, so 1.0/l is always defined here.
            if v2 != v1 and l <= d:
                AA[v1,v2] = 1.0/l

    M = AA + D
    norm = np.linalg.norm(M)
    return(g, AA, norm)

def graph_kernel(x1, x2, A1, A2, norm):
    """Normalised graph kernel between two token sequences.

    Parameters
    ----------
    x1, x2 : token-id sequences; id 0 (padding) is ignored.
    A1, A2 : walk matrices of the two sequences, of identical shape.
    norm : normalisation constant (the callers pass norm1 * norm2).

    Returns
    -------
    (k_node + k_walk) / norm, where k_node is the number of distinct non-zero
    ids shared by x1 and x2 and k_walk is the sum of the element-wise product
    of A1 and A2. Returns 0.0 when norm is 0 instead of dividing by zero.
    """
    k_node = len(set(x1).intersection(set(x2)).difference({0}))
    k_walk = np.sum(A1 * A2)
    if norm == 0:
        # An empty question gives a zero norm; report "no similarity"
        # rather than NaN/inf and a runtime warning.
        return 0.0
    # float() keeps true division under Python 2 even for integer inputs.
    k = (k_node + k_walk) / float(norm)
    return k

In [42]:
def _graph_similarities(X1, X2, w=2, d=3):
    """Graph-kernel similarity of every (X1[i], X2[i]) question pair.

    Parameters
    ----------
    X1, X2 : 2-D arrays of padded token ids, one row per pair.
    w : sliding-window size passed to terms2graph.
    d : maximum walk length passed to terms2graph.

    Returns
    -------
    Column vector of shape (number of pairs, 1).
    """
    sims = []
    for x1, x2 in zip(X1, X2):
        # Shared vocabulary of the pair: sorted non-zero ids -> vertex index.
        tokens = sorted(set(x1).union(set(x2)).difference({0}))
        vocab_per_pair = dict((token, index) for index, token in enumerate(tokens))
        g1, A1, norm1 = terms2graph(x1, w, vocab_per_pair, d)
        g2, A2, norm2 = terms2graph(x2, w, vocab_per_pair, d)
        sims.append(graph_kernel(x1, x2, A1, A2, norm1 * norm2))
    return np.array(sims).reshape(-1,1)


graph_similarity_train = _graph_similarities(X_train1, X_train2, w=2, d=3)
graph_similarity_test = _graph_similarities(X_test1, X_test2, w=2, d=3)

In [43]:
###??? deal with NaN data
graph_similarity_train = np.nan_to_num(graph_similarity_train)
graph_similarity_test = np.nan_to_num(graph_similarity_test)

In [44]:
graph_similarity_train.shape, graph_similarity_test.shape

((80100, 1), (20179, 1))

In [45]:
np.min(graph_similarity_train), np.max(graph_similarity_train)

(0.0, 1.0000000000000004)

In [35]:
# d=1
graph_similarity_test

array([[ 0.26013299],
       [ 0.76271277],
       [ 0.72488244],
       ..., 
       [ 0.81016272],
       [ 0.09058216],
       [ 0.52574973]])

In [223]:
# d=2
graph_similarity_test

array([[ 0.24781018],
       [ 0.8046798 ],
       [ 0.74926865],
       ..., 
       [ 0.8054638 ],
       [ 0.07470439],
       [ 0.50619689]])

In [46]:
# d=3
graph_similarity_test

array([[ 0.24525303],
       [ 0.81160375],
       [ 0.75241203],
       ..., 
       [ 0.80217473],
       [ 0.06880674],
       [ 0.50235602]])

In [47]:
# Write the graph-kernel feature (column 0) to CSV, one pair per row:
# train file first, then test file.
for out_path, sims, n_rows in (("graph_features_train.csv", graph_similarity_train, N_train),
                               ("graph_features_test.csv", graph_similarity_test, N_test)):
    with open(out_path, 'w') as f:
        f.write("graphKernel\n")
        for i in range(n_rows):
            f.write(str(sims[i, 0]) + '\n')

In [48]:
#---load graph features---#
# Read the cached graph-feature files back as plain NumPy arrays.
graph_similarity_train = pd.read_csv('graph_features_train.csv').values
graph_similarity_test = pd.read_csv('graph_features_test.csv').values

In [49]:
graph_similarity_test

array([[ 0.26013299,  0.24781018,  0.24525303],
       [ 0.76271277,  0.8046798 ,  0.81160376],
       [ 0.72488244,  0.74926865,  0.75241203],
       ..., 
       [ 0.81016272,  0.8054638 ,  0.80217472],
       [ 0.09058216,  0.07470439,  0.06880674],
       [ 0.52574973,  0.50619689,  0.50235602]])

### Scikit learn ensemble methods

In [233]:
Counter(y_train)

Counter({0: 28161, 1: 51939})

In [50]:
# Use graph feature to predict
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

# The first 90% of the training pairs are used for fitting; the remaining
# rows serve as the validation split.
trainSize = int(N_train * 0.9)
# No random_state is passed, so the fitted forest depends on the global
# NumPy RNG state at the time this cell runs.
clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1)
#clf = AdaBoostClassifier(n_estimators=500)
#clf = svm.SVC(probability=True)
#clf = MLPClassifier()

# Feature matrix = basic features with the graph feature column(s) appended.
#features_train = basicF_train
#features_test = basicF_test
features_train = np.concatenate((basicF_train, graph_similarity_train), axis=1)
features_test = np.concatenate((basicF_test, graph_similarity_test), axis=1)

# Fit on the first trainSize rows, then predict class probabilities for the
# held-out rows; y_pred therefore has N_train - trainSize rows.
clf.fit(features_train[:trainSize,:], y_train[:trainSize])
y_pred = clf.predict_proba(features_train[trainSize:,:])
#y_true = [0, 0, 1, 1]
#y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]
y_true = y_train[trainSize:]
# Validation log-loss; shown as the cell output.
log_loss(y_true, y_pred)

0.51418056795360845

In [None]:
0.51508038460919536

In [None]:
# Predict test data, then output to a CSV file
y_pred_test = clf.predict_proba(features_test)

In [None]:
y_pred_test

In [None]:
# Write one "Id,Score" row per test pair, Score being the predicted
# probability of class 1.
# Iterate over y_pred_test itself: y_pred holds the validation-split
# predictions, so using its length truncated the submission.
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred_test.shape[0]):
        f.write(str(i)+','+str(y_pred_test[i][1])+'\n')

In [242]:
range(5,10)

[5, 6, 7, 8, 9]

In [243]:
for i in range(10,20):
    clf = RandomForestClassifier(n_estimators=500, max_depth=i, n_jobs=-1)
    clf.fit(features_train[:trainSize,:], y_train[:trainSize])
    y_pred = clf.predict_proba(features_train[trainSize:,:])
    y_true = y_train[trainSize:]
    print i, ', loss', log_loss(y_true, y_pred)

10 , loss 0.514883539309
11 , loss 0.513655911954
12 , loss 0.512710427586
13 , loss 0.513062037218


KeyboardInterrupt: 

In [244]:
from sklearn.ensemble import VotingClassifier
# Build three base classifiers and a soft-voting ensemble over them, then
# report the log-loss of each on the fitting rows and on the held-out rows.
# create the sub models
estimators = []
#model1 = KNeighborsClassifier(n_neighbors=8)
model1 = GradientBoostingClassifier(n_estimators=100,random_state=0)
estimators.append(('GradientBoosting', model1))
#extra = ExtraTreesClassifier(n_estimators=35, random_state=1)
#model2 = AdaBoostClassifier(base_estimator=extra, n_estimators=100, random_state=0)
# No random_state is passed to the MLP, so its scores can change between runs.
model2 = MLPClassifier()
estimators.append(('MLP', model2))
model3 = RandomForestClassifier(n_estimators = 500, max_depth=12, random_state=0)
estimators.append(('RandomForest', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators,voting='soft')
###ensemble = ensemble.fit(X_train, y_train)

print('---------------VotingClassifier---------------')
      
# Note: the loop variable reuses the name `clf`, replacing the classifier
# bound to `clf` by earlier cells.
for clf, label in zip([model1, model2, model3, ensemble], 
                      ['GradientBoosting', 
                       'MLP', 
                       'RandomForest',
                       'VotingClassifier']):

    #scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    #print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))
    clf = clf.fit(features_train[:trainSize,:], y_train[:trainSize])
    
    # Log-loss on the rows the model was fitted on.
    y_pred = clf.predict_proba(features_train[:trainSize,:])
    y_true = y_train[:trainSize]
    loss = log_loss(y_true, y_pred)
    print("\n[%s] Training: %0.4f" % (label, loss))

    # Log-loss on the held-out rows; the printed label says "Test", but these
    # rows come from features_train, not features_test.
    y_pred = clf.predict_proba(features_train[trainSize:,:])
    y_true = y_train[trainSize:]
    loss = log_loss(y_true, y_pred)
    print("[%s] Test:     %0.4f " % (label, loss))

# Keep the fitted ensemble as the classifier used by the following cells.
clf = ensemble

---------------VotingClassifier---------------

[GradientBoosting] Training: 0.5212
[GradientBoosting] Test:     0.5246 

[MLP] Training: 0.5373
[MLP] Test:     0.5398 

[RandomForest] Training: 0.4762
[RandomForest] Test:     0.5128 

[VotingClassifier] Training: 0.5060
[VotingClassifier] Test:     0.5206 


In [249]:
from mlxtend.classifier import StackingClassifier
# Build three base classifiers and a stacking ensemble with a logistic
# regression meta-classifier, then report the log-loss of each on the
# fitting rows and on the held-out rows.
clf1 = GradientBoostingClassifier(n_estimators=100,random_state=1)
# No random_state is passed to the MLP, so its scores can change between runs.
clf2 = MLPClassifier()
#tree = RandomForestClassifier(n_estimators = 500, max_features=None, random_state=0)
#extra = ExtraTreesClassifier(n_estimators=35, random_state=1)
#clf2 = AdaBoostClassifier(base_estimator=extra, n_estimators=100, random_state=0)
#clf2 = Perceptron()
clf3 = RandomForestClassifier(n_estimators = 500, max_depth=12, random_state=0)

lr = LogisticRegression()
# NOTE(review): use_probas is not set here (the commented-out variant below
# sets use_probas=True). Presumably the meta-classifier then sees predicted
# class labels rather than probabilities, which would hurt a log-loss
# evaluation -- confirm against the mlxtend documentation.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
#sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], use_probas=True, average_probas=False, meta_classifier=lr)

print('---------------StackingClassifier---------------')

# Note: the loop variable reuses the name `clf`, replacing the classifier
# bound to `clf` by earlier cells.
for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['GradientBoost', 
                       'MLP', 
                       'RandomForest',
                       'StackingClassifier']):

    #scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    #print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))    
    clf = clf.fit(features_train[:trainSize,:], y_train[:trainSize])
    
    # Log-loss on the rows the model was fitted on.
    y_pred = clf.predict_proba(features_train[:trainSize,:])
    y_true = y_train[:trainSize]
    loss = log_loss(y_true, y_pred)
    print("\n[%s] Training: %0.4f" % (label, loss))

    # Log-loss on the held-out rows; the printed label says "Test", but these
    # rows come from features_train, not features_test.
    y_pred = clf.predict_proba(features_train[trainSize:,:])
    y_true = y_train[trainSize:]
    loss = log_loss(y_true, y_pred)
    print("[%s] Test:     %0.4f " % (label, loss))

# Keep the fitted stacking model as the classifier used by the following cells.
clf = sclf

---------------StackingClassifier---------------

[GradientBoost] Training: 0.5212
[GradientBoost] Test:     0.5246 

[MLP] Training: 0.5414
[MLP] Test:     0.5451 

[RandomForest] Training: 0.4762
[RandomForest] Test:     0.5128 

[StackingClassifier] Training: 0.5034
[StackingClassifier] Test:     0.5589 


In [None]:
# Predict test data, then output to a CSV file
y_pred_test = clf.predict_proba(features_test)

In [None]:
y_pred_test

In [None]:
# Write one "Id,Score" row per test pair, Score being the predicted
# probability of class 1.
# Iterate over y_pred_test itself: y_pred holds the held-out predictions of
# the last evaluation loop, so using its length truncated the submission.
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred_test.shape[0]):
        f.write(str(i)+','+str(y_pred_test[i][1])+'\n')

In [181]:
document_len = [len(t) for t in encoded_docs]
Counter(document_len)

Counter({0: 2,
         1: 10,
         2: 5,
         3: 230,
         4: 1348,
         5: 2862,
         6: 4833,
         7: 7035,
         8: 7788,
         9: 7353,
         10: 6249,
         11: 4535,
         12: 3479,
         13: 2618,
         14: 2392,
         15: 2121,
         16: 1544,
         17: 924,
         18: 774,
         19: 630,
         20: 437,
         21: 284,
         22: 333,
         23: 204,
         24: 178,
         25: 182,
         26: 128,
         27: 103,
         28: 109,
         29: 48,
         30: 42,
         31: 37,
         32: 24,
         33: 20,
         34: 11,
         35: 15,
         36: 9,
         37: 2,
         38: 10,
         39: 7,
         40: 3,
         41: 3,
         42: 3,
         43: 3,
         45: 1,
         46: 3,
         49: 2,
         52: 1,
         53: 1,
         54: 1,
         58: 1,
         59: 1,
         65: 1,
         73: 1})

In [156]:
yaya = sorted(list(set(X_train1[7265]).union(set(X_train2[7265])).difference({0})))
yaya = dict((word,index) for index,word in enumerate(yaya))
w = 2
d = 2
#g1 = terms_to_graph(X_train1[0], w, yaya)
g1,A1,norm1 = terms2graph(X_train1[7265], w, yaya, d)
g2,A2,norm2 = terms2graph(X_train2[7265], w, yaya, d)
norm = norm1 * norm2
graph_similarity = graph_kernel(X_train1[7265],X_train2[7265],A1,A2,norm)
graph_similarity

In [27]:
fea_names = []
#fea_names.append('q1_freq')   
#graph_train = np.array(graph_train)

In [24]:
type(fea_names)

list

In [7]:
np.array(fea_names)

array(['q1_freq', 'q2_freq'], 
      dtype='|S7')

In [None]:
graph classification

In [20]:
# Output graph features to a CSV file
graph_train = np.array(graph_train)
features = pd.DataFrame(graph_train, columns=fea_names)
features.to_csv("feature_Graph_train.csv", index=False)
features.head()
#graph_train = pd.read_csv('feature_Graph_train.csv')
#fea_names = graph_train.columns.tolist()

Unnamed: 0,q1_freq,q2_freq
0,1,2
1,3,4
2,5,6


### Deep Learning CNN

In [253]:
def model_conv1D_(emb_matrix):
    """Build and compile the two-branch 1-D CNN for question-pair scoring.

    Both questions go through the same frozen embedding layer and the same
    six Conv1D layers (kernel sizes 1 to 6), each followed by global average
    pooling. The pooled vectors of the two questions are combined as
    |a - b| and a * b, concatenated with dense projections of the basic and
    graph features, and passed to a small MLP ending in one sigmoid unit.

    Parameters
    ----------
    emb_matrix : 2-D array of word vectors; its shape gives the embedding
        layer's input_dim and output_dim, and it is used as the layer's
        (non-trainable) weights.

    Returns
    -------
    A compiled keras Model whose inputs are
    [seq1, seq2, basic_input, graph_input].

    The input shapes are taken from the notebook globals max_size,
    basicF_train and graph_similarity_train.
    """
    n_rows, n_dims = emb_matrix.shape[0], emb_matrix.shape[1]

    # Shared, frozen embedding layer holding the word vectors.
    emb_layer = Embedding(input_dim=n_rows,
                          output_dim=n_dims,
                          weights=[emb_matrix],
                          input_length=max_size,
                          trainable=False)
    print (emb_matrix.shape, 'emb_matrix shape')

    # (filters, kernel_size) of the six shared convolutions.
    conv_specs = [(128, 1), (128, 2), (128, 3), (128, 4), (32, 5), (32, 6)]
    conv_layers = [
        Conv1D(filters=n_filters, kernel_size=width, padding='same', activation='relu')
        for n_filters, width in conv_specs
    ]

    # One padded token sequence per question (max_size ids each).
    seq1 = Input(shape=(max_size,))
    seq2 = Input(shape=(max_size,))
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Apply every convolution to both questions, average-pooling over time.
    pooled_a = []
    pooled_b = []
    for conv in conv_layers:
        feat_a = conv(emb1)
        pooled_a.append(GlobalAveragePooling1D()(feat_a))
        feat_b = conv(emb2)
        pooled_b.append(GlobalAveragePooling1D()(feat_b))

    mergea = concatenate(pooled_a)
    mergeb = concatenate(pooled_b)

    # Pair interaction: absolute difference and element-wise product of the
    # two pooled vectors.
    merged_width = 4 * 128 + 2*32
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(merged_width,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(merged_width,))([mergea, mergeb])

    def side_branch(n_features):
        # Extra-feature input -> batch norm -> 64-unit ReLU projection.
        side_input = Input(shape=(n_features,))
        normed = BatchNormalization()(side_input)
        projected = Dense(64, activation='relu')(normed)
        return side_input, projected

    # Basic features branch, then graph feature branch.
    basic_input, basic_dense = side_branch(basicF_train.shape[1])
    graph_input, graph_dense = side_branch(graph_similarity_train.shape[1])

    merge = concatenate([diff, mul, basic_dense, graph_dense])

    # The MLP that determines the outcome.
    hidden = Dropout(0.2)(merge)
    hidden = BatchNormalization()(hidden)
    hidden = Dense(300, activation='relu')(hidden)

    hidden = Dropout(0.2)(hidden)
    hidden = BatchNormalization()(hidden)
    pred = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[seq1, seq2, basic_input, graph_input], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

### Fit model

In [None]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)

In [None]:
X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
model.fit(X_train,
          y_train,
          batch_size = 128,
          epochs = 3,
          verbose = 1,
          validation_split = 0.1)
          #, class_weight={0: class_weights[0], 1: class_weights[1]})

Train on 26432 samples, validate on 53668 samples
Epoch 1/3
  256/26432 [..............................] - ETA: 52:53 - loss: 0.8705 - acc: 0.4727

In [252]:
X_train1.shape,basicF_train.shape, graph_similarity_train.shape

((80100, 73), (80100, 3), (80100, 1))

In [None]:
#X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
model.fit([X_train1, X_train2, basicF_train, graph_similarity_train],
          y_train,
          batch_size = 128,
          epochs = 3,
          verbose = 1,
          validation_split = 0.1)
          #, class_weight={0: class_weights[0], 1: class_weights[1]})

((20354, 300), 'emb_matrix shape')


### Output prediction of text data

In [None]:
# model_conv1D_ builds a model with four inputs (seq1, seq2, basic features,
# graph features), so the test-set inputs are passed in that same order,
# matching the four-input model.fit call.
X_test = [np.array(X_test1), np.array(X_test2), basicF_test, graph_similarity_test]
y_pred = model.predict(X_test, batch_size=128)

In [None]:
y_pred[:5]

In [None]:
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred.shape[0]):
        f.write(str(i)+','+str(y_pred[i][0])+'\n')

### Delete....

In [107]:
# Split training data
train_size = int(N_train * 0.67)
X_trainTrain1 = X_train1[:train_size]
X_trainTest1 = X_train1[train_size:]
X_trainTrain2 = X_train2[:train_size]
X_trainTest2 = X_train2[train_size:]

y_trainTrain = y_train[:train_size]
y_trainTest = y_train[train_size:]
print (len(X_train1), len(X_trainTrain1), len(X_trainTest1))
print (len(X_train2), len(X_trainTrain2), len(X_trainTest2))
print (len(y_train), len(y_trainTrain), len(y_trainTest))

(80100, 53667, 26433)
(80100, 53667, 26433)
(80100, 53667, 26433)


In [None]:
model.fit([np.array(X_trainTrain1),np.array(X_trainTrain2)],
          np.array(y_trainTrain),
          batch_size = 128,
          epochs = 3,
          verbose = 1,
          validation_data = ([np.array(X_trainTest1),np.array(X_trainTest2)], np.array(y_trainTest)))
          #, class_weight={0: 1, 1: 1})

In [11]:
ids2ind = {} # will contain the row idx of each unique text in the TFIDF matrix 
for qid in texts:
    ids2ind[qid] = len(ids2ind)

vec = TfidfVectorizer()
A = vec.fit_transform(texts.values())

In [12]:
N_train = len(pairs_train)
X_train = np.zeros((N_train, 3))
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    X_train[i,0] = cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
    X_train[i,1] = len(texts[q1].split()) + len(texts[q2].split())
    X_train[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split()))

N_test = len(pairs_test)
X_test = np.zeros((N_test, 3))
for i in range(len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    X_test[i,0] = cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
    X_test[i,1] = len(texts[q1].split()) + len(texts[q2].split())
    X_test[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split()))

In [13]:
clf = RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

In [14]:
X_train.shape

(80100, 3)

In [16]:
N_train = len(pairs_train)
X_train1 = np.zeros((N_train, 4))
X_train1[:,:3] = X_train
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    X_train1[i,3] = symmetric_sentence_similarity(texts[q1], texts[q2])

N_test = len(pairs_test)
X_test1 = np.zeros((N_test, 4))
X_test1[:,:3] = X_test
for i in range(len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    X_test1[i,3] = symmetric_sentence_similarity(texts[q1], texts[q2])

In [17]:
for i in range(len(X_train1[:,3])):
    if X_train1[i,3]<0:
        X_train1[i,3] = X_train1[i,0]

In [18]:
len(y_train)

80100

In [None]:
N_train = len(pairs_train)
X_train2 = np.zeros((N_train, 5))
X_train2[:,:4] = X_train1
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    X_train2[i,4] = similarity(texts[q1], texts[q2], True)

N_test = len(pairs_test)
X_test2 = np.zeros((N_test, 5))
X_test2[:,:4] = X_test1
for i in range(len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    X_test2[i,4] = similarity(texts[q1], texts[q2], True)

In [15]:
# Split training data
N_train = len(pairs_train)
train_size = int(N_train * 0.67)
X_trainTrain = X_train[:train_size, :]
X_trainTest = X_train[train_size:, :]

y_trainTrain = y_train[:train_size]
y_trainTest = y_train[train_size:]
print (X_train1.shape, X_trainTrain.shape, X_trainTest.shape)
print (len(y_train), len(y_trainTrain), len(y_trainTest))

((0, 4), (0, 4), (0, 4))
(80100, 53667, 26433)


In [123]:
clf1 = MLPClassifier()
#clf1 = RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1)
clf1.fit(X_trainTrain, y_trainTrain)   # X_trainTrain[:,(0,3)]
y_pred = clf1.predict_proba(X_trainTest)

In [124]:
from sklearn.metrics import log_loss
#y_true = [0, 0, 1, 1]
#y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]
y_true = y_trainTest
log_loss(y_true, y_pred)

0.55977646975724693

In [27]:
# Prediction of the test data, then output to a CSV file
clf1 = MLPClassifier()
clf1.fit(X_train1, y_train)
y_pred = clf1.predict_proba(X_test1)

In [28]:
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for i in range(y_pred.shape[0]):
        f.write(str(i)+','+str(y_pred[i][1])+'\n')

In [None]:
def terms_to_graph(terms, w, vocab):
    '''This function returns an undirected, unweighted graph
    from a list of terms (the tokens from the pre-processed text)
    e.g., ['quick','brown','fox'].
    Edges are unweighted representing term co-occurence
    within a sliding window of fixed size 'w'.

    Parameters
    ----------
    terms : sequence of terms; falsy entries (the 0 padding) are dropped.
    w : sliding-window size.
    vocab : dict mapping each term to its vertex index.

    Returns
    -------
    networkx.Graph with one vertex per vocab entry and one edge per pair of
    distinct terms that co-occur within the window.
    '''
    #(1) Terms to graph
    terms = [term for term in terms if term]   #remove the padding zero

    edges = []

    # Initial complete graph over the first w terms. Combine over the terms
    # actually present: range(w) raised IndexError when len(terms) < w.
    head = terms[0:w]
    for first, second in itertools.combinations(range(len(head)), r=2):
        ii = vocab[head[first]]    # index of vertice
        jj = vocab[head[second]]
        if ii != jj:
            edges.append((ii,jj))

    # Then link every remaining term to the w-1 terms before it.
    for i in range(w, len(terms)):
        considered_term = terms[i]
        window = terms[(i-w+1):(i+1)]   # all terms within the sliding window
        for p in range(w-1):
            if window[p] != considered_term:   # skip self-edges
                edges.append((vocab[window[p]], vocab[considered_term]))

    # create empty graph, then add vertices and edges
    g = nx.Graph()
    g.add_nodes_from(sorted(vocab.values()))
    g.add_edges_from(edges)

    return(g)