In [47]:
import numpy as np
import codecs
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
from gensim.models.word2vec import Word2Vec

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, load_model, Sequential
from keras import backend as K
from keras.layers import Input, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.layers import GlobalAveragePooling1D, Lambda
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [13]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from nltk import word_tokenize, pos_tag

In [14]:
# Question id -> question text. The first text seen for an id is kept;
# later occurrences of the same id never overwrite it.
texts = {}


def _load_pairs(path, has_label):
    """Read question pairs from a comma-separated file and register their texts.

    Each non-blank line is split on ','. Field 1 and field 2 are the two
    question ids, field 3 and field 4 the matching question texts, and (only
    when ``has_label`` is true) field 5 is the integer label. Field 0 is not
    used. Unseen ids are added to the module-level ``texts`` dict.

    Only the line terminator is stripped, so a last line without a trailing
    newline (or a file with CRLF endings) is parsed the same as any other
    line instead of losing its final character.

    NOTE(review): a plain ``split(',')`` assumes the question texts contain no
    commas -- confirm against the data files.

    Returns:
        (pairs, labels): ``pairs`` is a list of ``[id1, id2]`` lists and
        ``labels`` a list of ints (empty when ``has_label`` is false).
    """
    pairs = []
    labels = []
    with codecs.open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (typically a stray one at end of file).
                continue
            fields = line.rstrip('\r\n').split(',')
            id1, id2 = fields[1], fields[2]
            texts.setdefault(id1, fields[3])
            texts.setdefault(id2, fields[4])
            pairs.append([id1, id2])
            if has_label:
                labels.append(int(fields[5]))
    return pairs, labels


pairs_train, y_train = _load_pairs('train.csv', has_label=True)
pairs_test = _load_pairs('test.csv', has_label=False)[0]

In [15]:
# Number of distinct question ids collected from train.csv and test.csv.
len(texts)  #type(texts)

58940

In [17]:
# Fit a Keras tokenizer on every question text, then encode each text as a
# list of integer word indices.
docs = texts.values()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
print(encoded_docs[0])

# Zero-pad every sequence on the right up to the length of the longest one.
max_size = max(len(sequence) for sequence in encoded_docs)
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_size)
print(max_size, 'max_size')
print(padded_docs[0])

[2, 10, 1, 22, 11, 1, 16, 1154]
73 max_size
[   2   10    1   22   11    1   16 1154    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [18]:
# Length of the longest raw question text, in characters.
max([len(t) for t in docs])

337

In [19]:
# Length of the longest encoded question, in tokens (same value as max_size).
max([len(t) for t in encoded_docs])

73

In [21]:
# Invert the tokenizer vocabulary (word -> index) into index -> word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

# Turn every encoded document back into its list of words; index 0 is skipped.
x_full_words = [
    [index_to_word[index] for index in sequence if index != 0]
    for sequence in encoded_docs
]
# Flat list of every word occurrence across all documents.
all_words = [word for document in x_full_words for word in document]

print(x_full_words[0])
print(len(all_words), 'words')
print(len(set(all_words)), 'unique words')
print(len(tokenizer.word_index), 'tokenizer.word_index')

['what', 'are', 'the', 'some', 'of', 'the', 'best', 'novels']
599985 words
20354 unique words
20354 tokenizer.word_index


In [22]:
# texts_token maps each question id to its position in `texts`, which is also
# its row in padded_docs; a question's padded sequence is therefore
# padded_docs[texts_token[ID]]. Used to assemble the model inputs.
texts_token = {ID: position for position, ID in enumerate(texts)}

In [24]:
# Start from pre-trained vectors: build a Word2Vec model whose vocabulary is
# exactly the words of our corpus, then overwrite the vectors of every word
# that also appears in the GoogleNews binary file.
word_vector_dim = 300
print(word_vector_dim, 'word_vector_dim')

# min_count=1 keeps every word, even those that occur only once.
word_vectors = Word2Vec(min_count=1, size=word_vector_dim)
word_vectors.build_vocab(x_full_words)
# (Optional sanity check: the vocabulary size should equal the number of
#  unique words in all_words.)
word_vectors.intersect_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

300 word_vector_dim


In [25]:
# Find vocabulary words whose vector has a norm below 0.05.
# NOTE(review): presumably these are the words that had no GoogleNews entry
# and therefore kept their small initial vector -- confirm.
# The vocabulary list is materialised once instead of once per index, and the
# vectors are read through `.wv` rather than the deprecated model indexing.
vocab_words = list(word_vectors.wv.vocab)  # in Python 2.7: word_vectors.wv.vocab.keys()
norms = [np.linalg.norm(word_vectors.wv[word]) for word in vocab_words]
idxs_zero_norms = [idx for idx, norm in enumerate(norms) if norm < 0.05]
no_entry_words = [vocab_words[idx] for idx in idxs_zero_norms]
print('# of vocab words w/o a Google News entry:', len(no_entry_words))

  """Entry point for launching an IPython kernel.


# of vocab words w/o a Google News entry: 5352


In [26]:
# The three vocabulary counts below are expected to agree.
print(len(set(all_words)), 'unique words')
print(len(word_vectors.wv.vocab), 'word_vectors.wv.vocab')
print(len(tokenizer.word_index), 'tokenizer.word_index')

20354 unique words
20354 word_vectors.wv.vocab
20354 tokenizer.word_index


In [27]:
# Build the embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. One extra row is allocated so that row 0, the index
# skipped when decoding the sequences, stays all-zero.
max_features = len(word_vectors.wv.vocab) + 1  # nb of unique words + 1
print(max_features, 'max_features')
embeddings = np.zeros((max_features, word_vector_dim))
for word, idx in tokenizer.word_index.items():
    # Read through `.wv`; indexing the model object itself is deprecated.
    embeddings[idx] = word_vectors.wv[word]
print('embeddings created')

20355 max_features
embeddings created


  


### Deep Learning CNN

In [64]:
def model_conv1D_(emb_matrix):
    """Build and compile the two-input Conv1D question-pair classifier.

    Both questions go through the same frozen embedding layer and the same six
    1D convolutions (kernel sizes 1 to 6). Each convolution output is averaged
    over the sequence axis, and the six pooled vectors are concatenated into
    one representation per question. The absolute difference and the
    element-wise product of the two representations feed a small MLP with a
    sigmoid output.

    Args:
        emb_matrix: 2-D array of shape (vocabulary rows, embedding dim) used
            as the fixed weights of the embedding layer.

    Returns:
        A Keras ``Model`` taking ``[seq1, seq2]``, compiled with binary
        cross-entropy, the Adam optimizer and the 'acc' metric.

    Note:
        The sequence length comes from the notebook-level ``max_size``.
    """
    # Frozen lookup table initialised from the supplied word vectors.
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=max_size,
        trainable=False
    )
    print (emb_matrix.shape, 'emb_matrix shape')

    # (filters, kernel_size) of each convolution shared by the two branches.
    conv_specs = [(128, 1), (128, 2), (128, 3), (128, 4), (32, 5), (32, 6)]
    conv_layers = [
        Conv1D(filters=n_filters, kernel_size=width, padding='same', activation='relu')
        for n_filters, width in conv_specs
    ]

    # One integer sequence input per question, embedded with the shared layer.
    seq1 = Input(shape=(max_size,))
    seq2 = Input(shape=(max_size,))
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Convolve and average-pool each question with every shared convolution.
    pooled_a = []
    pooled_b = []
    for conv in conv_layers:
        features_a = conv(emb1)
        pooled_a.append(GlobalAveragePooling1D()(features_a))
        features_b = conv(emb2)
        pooled_b.append(GlobalAveragePooling1D()(features_b))

    mergea = concatenate(pooled_a)
    mergeb = concatenate(pooled_b)

    # Two symmetric comparisons of the question representations: the absolute
    # difference and the element-wise product. Their width is the total
    # number of filters (4 * 128 + 2 * 32).
    merged_width = sum(n_filters for n_filters, _ in conv_specs)
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(merged_width,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(merged_width,))([mergea, mergeb])

    # Only diff and mul feed the classifier; the extra "magic" and distance
    # feature inputs from earlier experiments are not wired in.
    merge = concatenate([diff, mul])

    # MLP head: dropout -> batch norm -> dense, twice, ending in a sigmoid.
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[seq1, seq2], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [31]:
# Assemble the two model inputs for every pair: the padded sequence of the
# first question and of the second question, looked up through texts_token.
# Comprehensions build each list in one pass; repeatedly doing
# `lst = lst + [row]` copies the whole list on every iteration.
N_train = len(pairs_train)
X_train1 = [padded_docs[texts_token[pair[0]]] for pair in pairs_train]
X_train2 = [padded_docs[texts_token[pair[1]]] for pair in pairs_train]

N_test = len(pairs_test)
X_test1 = [padded_docs[texts_token[pair[0]]] for pair in pairs_test]
X_test2 = [padded_docs[texts_token[pair[1]]] for pair in pairs_test]

In [None]:
# Shape check of the two train inputs and the two test inputs; the second
# dimension of each is expected to equal max_size.
print (np.array(X_train1).shape, np.array(X_train2).shape, max_size)
print (np.array(X_test1).shape, np.array(X_test2).shape)

In [32]:
# Shape of the two test inputs stacked into a single array.
np.array([np.array(X_test1), np.array(X_test2)]).shape

(2, 20179, 73)

In [90]:
# Occurrence count of every word over the whole corpus (all_words).
wordsFreq = Counter(all_words)
#print (wordsFreq)
#wordsFreq['the']

In [92]:
# Number of distinct words counted in wordsFreq.
len(wordsFreq)

20354

### Magic features

### Fit model

In [49]:
# Class balance of the training labels.
Counter(y_train)

Counter({0: 28161, 1: 51939})

In [50]:
# Distinct label values present in y_train.
np.unique(y_train)

array([0, 1])

In [51]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
# 'balanced' class weights, one per value of np.unique(y_train) and in that
# order. `classes` and `y` are passed by keyword: recent scikit-learn releases
# accept them only as keyword arguments, and older releases accept both forms.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [52]:
# Display the weights: index 0 is for label 0, index 1 for label 1.
class_weights

array([ 1.42217961,  0.77109686])

In [None]:
# Scratch cell: evaluates an empty list and has no effect on later cells.
[]

In [55]:
# Sanity check: weight of label 1 times the number of label-1 samples
# (values copied by hand from class_weights and Counter(y_train)).
0.77109686*51939

40049.99981154

In [56]:
# Sanity check: weight of label 0 times the number of label-0 samples
# (values copied by hand from class_weights and Counter(y_train)).
1.42217961*28161

40049.99999721

In [67]:
# Train a fresh CNN for 5 epochs with balanced class weights, holding out the
# last 10% of the samples for validation.
X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
# The labels are converted to an array: y_train is a plain Python list, which
# newer Keras versions do not accept as training targets.
model.fit(X_train,
          np.asarray(y_train),
          batch_size = 128,
          epochs = 5,
          verbose = 1,
          validation_split = 0.1,
          class_weight={0: class_weights[0], 1: class_weights[1]})

(20355, 300) emb_matrix shape
Train on 72090 samples, validate on 8010 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a5e03a828>

In [59]:
# Train a fresh CNN for 3 epochs with balanced class weights, holding out the
# last 10% of the samples for validation.
X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
# The labels are converted to an array: y_train is a plain Python list, which
# newer Keras versions do not accept as training targets.
model.fit(X_train,
          np.asarray(y_train),
          batch_size = 128,
          epochs = 3,
          verbose = 1,
          validation_split = 0.1,
          class_weight={0: class_weights[0], 1: class_weights[1]})

Train on 72090 samples, validate on 8010 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a3b43f940>

In [66]:
# Train a fresh CNN for 3 epochs without class weights, holding out the last
# 10% of the samples for validation.
X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
# The labels are converted to an array: y_train is a plain Python list, which
# newer Keras versions do not accept as training targets.
model.fit(X_train,
          np.asarray(y_train),
          batch_size = 128,
          epochs = 3,
          verbose = 1,
          validation_split = 0.1)
          #, class_weight={0: 1, 1: 1})

(20355, 300) emb_matrix shape
Train on 72090 samples, validate on 8010 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a4ff2edd8>

In [71]:
# Train a fresh CNN for 6 epochs without class weights, holding out the last
# 10% of the samples for validation.
X_train = [np.array(X_train1),np.array(X_train2)]
model = model_conv1D_(embeddings)
# The labels are converted to an array: y_train is a plain Python list, which
# newer Keras versions do not accept as training targets.
model.fit(X_train,
          np.asarray(y_train),
          batch_size = 128,
          epochs = 6,
          verbose = 1,
          validation_split = 0.1)
          #, class_weight={0: 1, 1: 1})

(20355, 300) emb_matrix shape
Train on 72090 samples, validate on 8010 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x1a5b468d68>

In [72]:
# Score the test pairs with the current `model`; the two inputs are given in
# the same (first question, second question) order used for training.
X_test = [np.array(question_side) for question_side in (X_test1, X_test2)]
y_pred = model.predict(X_test, batch_size=128)

In [73]:
# First five predictions, for a quick visual check.
y_pred[:5]

array([[ 0.98650408],
       [ 0.99972719],
       [ 0.99992728],
       [ 0.99705744],
       [ 0.99415094]], dtype=float32)

In [74]:
# Write the CNN scores as "Id,Score" rows; Id is the row position of the
# pair in the test set and Score is the single model output for that pair.
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for row_id, score in enumerate(y_pred[:, 0]):
        f.write(str(row_id) + ',' + str(score) + '\n')

In [11]:
# ids2ind gives, for each question id, the row of its text in the TF-IDF
# matrix A (rows follow the iteration order of `texts`).
ids2ind = {qid: row for row, qid in enumerate(texts)}

vec = TfidfVectorizer()
A = vec.fit_transform(texts.values())

In [12]:
def _tfidf_length_features(pairs):
    """Compute the three hand-crafted features of every question pair.

    Columns of the returned array:
        0: cosine similarity between the TF-IDF rows of the two questions,
        1: sum of the two questions' whitespace-token counts,
        2: absolute difference of those counts.

    Args:
        pairs: sequence of ``[id1, id2]`` question-id pairs; both ids must be
            keys of ``texts`` and ``ids2ind``.

    Returns:
        ``np.ndarray`` of shape ``(len(pairs), 3)``.
    """
    features = np.zeros((len(pairs), 3))
    for i, (q1, q2) in enumerate(pairs):
        n_words1 = len(texts[q1].split())
        n_words2 = len(texts[q2].split())
        # cosine_similarity returns a 1x1 matrix here; take its single entry
        # explicitly rather than relying on implicit array -> scalar conversion.
        features[i, 0] = cosine_similarity(A[ids2ind[q1], :], A[ids2ind[q2], :])[0, 0]
        features[i, 1] = n_words1 + n_words2
        features[i, 2] = abs(n_words1 - n_words2)
    return features


N_train = len(pairs_train)
X_train = _tfidf_length_features(pairs_train)

N_test = len(pairs_test)
X_test = _tfidf_length_features(pairs_test)

In [13]:
# Baseline: a forest of 500 shallow trees (depth 3) on the hand-crafted
# features, using all available cores.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    n_jobs=-1,
).fit(X_train, y_train)
# Per-class probabilities for every test pair.
y_pred = clf.predict_proba(X_test)

In [14]:
# Shape of the hand-crafted training feature matrix.
X_train.shape

(80100, 3)

In [16]:
# Extend the 3-column feature matrices with a 4th column holding
# symmetric_sentence_similarity of the two question texts.
# NOTE(review): symmetric_sentence_similarity is not defined in the visible
# part of this notebook -- it must be defined before this cell is run.
N_train = len(pairs_train)
X_train1 = np.zeros((N_train, 4))
X_train1[:, :3] = X_train
for row, (q1, q2) in enumerate(pairs_train):
    X_train1[row, 3] = symmetric_sentence_similarity(texts[q1], texts[q2])

N_test = len(pairs_test)
X_test1 = np.zeros((N_test, 4))
X_test1[:, :3] = X_test
for row, (q1, q2) in enumerate(pairs_test):
    X_test1[row, 3] = symmetric_sentence_similarity(texts[q1], texts[q2])

In [17]:
def _fallback_to_tfidf(features):
    """Replace negative values of column 3 with column 0 of the same row, in place.

    NOTE(review): negative values in column 3 are presumably a "no score"
    marker from the sentence-similarity function -- confirm.
    """
    negative = features[:, 3] < 0
    features[negative, 3] = features[negative, 0]


# Apply the same fallback to the train AND the test features, so the
# classifier sees identically preprocessed inputs at prediction time.
_fallback_to_tfidf(X_train1)
_fallback_to_tfidf(X_test1)

In [18]:
# Number of training labels; should equal the number of training pairs.
len(y_train)

80100

In [None]:
# Extend the 4-column feature matrices with a 5th column holding
# similarity(text1, text2, True) of the two question texts.
# NOTE(review): `similarity` is not defined in the visible part of this
# notebook -- it must be defined before this cell is run.
N_train = len(pairs_train)
X_train2 = np.zeros((N_train, 5))
X_train2[:, :4] = X_train1
for row, (q1, q2) in enumerate(pairs_train):
    X_train2[row, 4] = similarity(texts[q1], texts[q2], True)

N_test = len(pairs_test)
X_test2 = np.zeros((N_test, 5))
X_test2[:, :4] = X_test1
for row, (q1, q2) in enumerate(pairs_test):
    X_test2[row, 4] = similarity(texts[q1], texts[q2], True)

In [15]:
# Hold-out split of the training data: the first 67% of the rows are used
# for fitting and the remaining rows for evaluation (no shuffling).
N_train = len(pairs_train)
train_size = int(N_train * 0.67)

X_trainTrain, X_trainTest = X_train[:train_size, :], X_train[train_size:, :]
y_trainTrain, y_trainTest = y_train[:train_size], y_train[train_size:]

# NOTE(review): the first shape printed is X_train1's while the split is
# taken from X_train -- confirm which matrix was intended.
print(X_train1.shape, X_trainTrain.shape, X_trainTest.shape)
print(len(y_train), len(y_trainTrain), len(y_trainTest))

((0, 4), (0, 4), (0, 4))
(80100, 53667, 26433)


In [123]:
# Fit an MLP with default settings on the fitting part of the split and get
# class probabilities for the held-out part.
# (Earlier alternative: RandomForestClassifier(n_estimators=500, max_depth=3,
#  n_jobs=-1), optionally on columns (0, 3) only.)
clf1 = MLPClassifier().fit(X_trainTrain, y_trainTrain)
y_pred = clf1.predict_proba(X_trainTest)

In [124]:
from sklearn.metrics import log_loss
# Log loss of the predicted class probabilities on the held-out labels.
y_true = y_trainTest
log_loss(y_true=y_true, y_pred=y_pred)

0.55977646975724693

In [None]:
# NOTE(review): `qjqj` is not defined anywhere visible, so this cell raises
# NameError; presumably a deliberate stop for "Run All" -- confirm or remove.
qjqj

In [None]:
# NOTE(review): `qjqj` is not defined anywhere visible, so this cell raises
# NameError; presumably a deliberate stop for "Run All" -- confirm or remove.
qjqj

In [27]:
# Prediction of the test data, then output to a CSV file
clf1 = MLPClassifier()
clf1.fit(X_train1, y_train)
y_pred = clf1.predict_proba(X_test1)

In [28]:
# Write "Id,Score" rows; Id is the row position of the pair in the test set
# and Score is column 1 of predict_proba, the probability of the second class.
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    for row_id, score in enumerate(y_pred[:, 1]):
        f.write(str(row_id) + ',' + str(score) + '\n')

In [20]:
# Save the train features
with open("save_train.csv", 'w') as f:
    f.write("TFIdf, len, len1, Wordnet\n")
    for i in range(N_train):
        f.write(str(X_train1[i,0])+','+str(X_train1[i,1])+','+str(X_train1[i,2])+','+str(X_train1[i,3])+'\n')

In [21]:
import pandas as pd
# Reload the saved training features. The header written to save_train.csv
# has a space after each comma, so the column names other than the first
# start with a space.
df_train = pd.read_csv('save_train.csv')
df_train.values

array([[  0.58723412,  22.        ,   6.        ,   0.88541667],
       [  0.44438048,  21.        ,   3.        ,   0.69305556],
       [  0.47480977,  26.        ,   6.        ,   0.890625  ],
       ..., 
       [  0.62334832,  13.        ,   1.        ,   0.70634921],
       [  0.71035487,  38.        ,   2.        ,   0.95833333],
       [  0.6320477 ,  18.        ,   0.        ,   0.46666667]])

In [None]:
# Restore the 4-column training feature matrix from the reloaded CSV.
X_train1 = df_train.values