In [2]:
import pandas as pd 
# conda install -c anaconda gensim
import gensim
import string
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score

In [3]:
# import main file into a dataframe
main_df = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t', header=0)

In [4]:
#Load Google's pre-trained word2vec model
#https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
#test our model on a random word
test = model.wv['computer']
print test

[  1.07421875e-01  -2.01171875e-01   1.23046875e-01   2.11914062e-01
  -9.13085938e-02   2.16796875e-01  -1.31835938e-01   8.30078125e-02
   2.02148438e-01   4.78515625e-02   3.66210938e-02  -2.45361328e-02
   2.39257812e-02  -1.60156250e-01  -2.61230469e-02   9.71679688e-02
  -6.34765625e-02   1.84570312e-01   1.70898438e-01  -1.63085938e-01
  -1.09375000e-01   1.49414062e-01  -4.65393066e-04   9.61914062e-02
   1.68945312e-01   2.60925293e-03   8.93554688e-02   6.49414062e-02
   3.56445312e-02  -6.93359375e-02  -1.46484375e-01  -1.21093750e-01
  -2.27539062e-01   2.45361328e-02  -1.24511719e-01  -3.18359375e-01
  -2.20703125e-01   1.30859375e-01   3.66210938e-02  -3.63769531e-02
  -1.13281250e-01   1.95312500e-01   9.76562500e-02   1.26953125e-01
   6.59179688e-02   6.93359375e-02   1.02539062e-02   1.75781250e-01
  -1.68945312e-01   1.21307373e-03  -2.98828125e-01  -1.15234375e-01
   5.66406250e-02  -1.77734375e-01  -2.08984375e-01   1.76757812e-01
   2.38037109e-02  -2.57812500e-01

In [6]:
#get the feature length of word
vec_len = len(test)
print vec_len


300


In [7]:
"""
Function that takes a sentence as an argument, and returns a feature vector(300x1 array) out of it.
We split the sentence into individual words, apply word2vec model on each of them, and take an average of all
vectors, which represents the vector for our sentence."""

def sentence2vec(sentence):
    # clean the punctuation marks from the sentence
    clean_sentence = sentence.translate(None, string.punctuation)
    # split the senctence into a list of words
    word_list = clean_sentence.split()
    # get number of words in sentence
    sen_length = len(word_list)
    # initialize the empty array that represens the sentence vector
    sentence_vec_sum = [0]*300
    # loop through the words
    for word in word_list: 
        try: 
            # get the word vector from the pretrained model
            word_vec = model.wv[word]
            # 
            sentence_vec_sum[:] = [sentence_vec_sum[i] + word_vec[i] for i in xrange(len(sentence_vec_sum))]
        except KeyError:
            # this will happen when the word doesn't exist in the vocabulary of the original model. This typically
            # includes stopwords, and we won't include them in our average
            sen_length -= 1
    
    # since we need the average, we need to divide each element in the array by the size of the sentence
    sentence_vec = [element / sen_length for element in sentence_vec_sum]
   
    
    return sentence_vec
        

In [8]:
str = """What is the step by step guide to invest in; share! market, in. india?"""
sentence2vec(str)

[-0.056272066556490384,
 0.031193659855769232,
 0.041414701021634616,
 0.008253831129807692,
 -0.058158287635216348,
 -0.017897385817307692,
 0.044771634615384616,
 -0.16353665865384615,
 0.058265099158653848,
 0.080697866586538464,
 -0.063157301682692304,
 -0.05690354567307692,
 0.063218336838942304,
 0.037184495192307696,
 -0.11766242980957031,
 0.051584097055288464,
 0.010779747596153846,
 0.099346454326923073,
 -0.0057091346153846151,
 -0.054086538461538464,
 0.039355351374699518,
 0.063739483173076927,
 -6.5730168269230766e-05,
 -0.055944002591646634,
 0.026181147648737982,
 0.060575045072115384,
 -0.079450167142427891,
 0.092998798076923073,
 0.0120086669921875,
 0.02837195763221154,
 -0.0087479811448317301,
 -0.044987605168269232,
 -0.10189115084134616,
 0.044205885667067304,
 -0.042015662560096152,
 0.01223285381610577,
 -0.057448167067307696,
 0.002422626201923077,
 0.11459585336538461,
 0.071176382211538464,
 0.016981858473557692,
 0.023888221153846152,
 0.065157376802884609,

In [9]:
# grab the first row from the dataframe 
main_df.loc[0].values[4]

'What is the step by step guide to invest in share market?'

In [10]:
# write a function to parse dataframe into np array
"""Function that takes a df row as an argument and converts into two np array rows"""
def df_to_np(df_row):
    # vector from sentence 1
    try:
        vec_1 = sentence2vec(df_row[3])
    except ZeroDivisionError:
        vec_1 = [0]*300
    # vector from sentence 2
    try:
        vec_2 = sentence2vec(df_row[4])
    except ZeroDivisionError:
        vec_2 = [0]*300
    # absolute difference between the vectors
    row_vec = [abs(x-y) for x,y in zip(vec_1, vec_2)]
    # get the is_duplicate out of the array
    is_dupe = df_row[5]
    
    return row_vec, is_dupe

In [17]:
# Initialize np arrays
data_size = 10000
X = np.zeros((data_size,300))
Y = np.zeros((data_size,1))
x = np.zeros((data_size,300))
y = np.zeros((data_size,1))

# Traverse the dataframe and add the results into the relevant numpy arrays
for i in xrange(data_size):
    row_vec, is_dupe = df_to_np(main_df.loc[i])
    X[i] = row_vec
    Y[i] = is_dupe
for i in xrange(int(0.6*data_size)):
    x[i] = X[i]
    y[i] = Y[i]
    

  if word in self.vocab:


In [23]:
# Time for some ML play!

# Initialize the model
clf = svm.SVC()
# Train it on the dataset
clf.fit(x, y)

  y_ = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
predicted = clf.predict(X[int(0.6*data_size):data_size][:])
y_test = Y[int(0.6*data_size):data_size]
print accuracy_score(y_test, predicted)

0.74449


In [25]:
# https://stackoverflow.com/questions/19629331/python-how-to-find-accuracy-result-in-svm-text-classifier-algorithm-for-multil