In [18]:
import pandas as pd
import numpy as np
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [19]:
train_data = train.fillna("")
test_data = test.fillna("")

In [20]:
import string
import gensim

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def text_cleaning(text):
    tokens = list(gensim.utils.tokenize(text, deacc=True, lower=True))
    #find out if converting to lower case helps?
    tokens = [word.lower() for word in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [word.translate(table) for word in tokens]
    stop_words = set(stopwords.words('english'))
    words = [word for word in stripped if not word in stop_words]
    words = [word if word.isalpha() else 'number' for word in words]
    return words

In [21]:
list_of_questions = list(set(train_data['question1'].astype('U') + train_data['question2'].astype('U') + test_data['question1'].astype('U') + test_data['question2'].astype('U'))) 
for i in range(len(list_of_questions)):
    if(type(list_of_questions[i]) is not str):
        list_of_questions[i] = ""

In [22]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

#word2vec = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
word2vec = KeyedVectors.load_word2vec_format('Googleword2vec', binary=True)

In [23]:
train_data = train_data[0:50000]
test_data = test_data[0:100000]
for dataset in [train_data, test_data]:
    question1_tokens = list(range(len(dataset)))
    question2_tokens = list(range(len(dataset)))
    for idx, row in dataset.iterrows():
        question1_tokens[idx] = text_cleaning(row['question1'])
        question2_tokens[idx] = text_cleaning(row['question2'])
    dataset.loc[:, 'question1_tokenized'] = question1_tokens
    dataset.loc[:, 'question2_tokenized'] = question2_tokens

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(value, corpus):
    count_vector = CountVectorizer()
    count_vector.fit_transform(corpus)
    return count_vector.transform(value).toarray()

In [25]:
for dataset in [train_data, test_data]:
    question1_bow = list(range(len(dataset)))
    question2_bow = list(range(len(dataset)))
    for idx, row in dataset.iterrows():
        corpus = [row['question1'], row['question2']]
        question1_bow[idx] = bag_of_words(row['question1_tokenized'], corpus)
        question2_bow[idx] = bag_of_words(row['question2_tokenized'], corpus)
    dataset.loc[:, 'question1_bag_vector'] = question1_bow
    dataset.loc[:, 'question2_bag_vector'] = question2_bow

In [26]:
def jacobi_distance(value1, value2):
    if(len((set(value1).union(set(value2)))) == 0):
        return 0
    else:
        set_intersect = len(set(value1).intersection(set(value2)))
        set_union = len(set(value1).union(set(value2)))
        return (set_intersect)/(set_union)

In [27]:
for dataset in [train_data, test_data]:
    distance = list(range(len(dataset)))
    for idx, row in dataset.iterrows():
        distance[idx] = jacobi_distance(row['question1_tokenized'], row['question2_tokenized'])
    dataset.loc[:, 'jacobi_distance'] = distance

In [28]:
import editdistance

for dataset in [train_data, test_data]:
    edit_dist = list(range(len(dataset)))
    for idx, row in dataset.iterrows():
        edit_dist[idx] = editdistance.eval(row['question1_tokenized'], row['question2_tokenized'])
    dataset.loc[:, 'edit_distance'] = edit_dist

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TFIDF
tfidf = TfidfVectorizer()
tfidf.fit_transform(list_of_questions)
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [30]:
embedding_dim = 300
def summerizeWordVecs(words):
    question_rep = np.zeros(embedding_dim)
    if(len(words) == 0):
        return question_rep
    else:
        for word in words:
            if word in word2vec.vocab:
                word_vector = word2vec.get_vector(word)
            else:
                word_vector = [np.random.randn(1) for i in range(embedding_dim)]
            try:
                idf = word2tfidf[str(word)]
            except:
                idf = 0
            for i in range(embedding_dim):
                question_rep[i] = question_rep[i] + word_vector[i]*idf
        return question_rep/len(words)

In [31]:
for dataset in [train_data, test_data]:
    dataset.loc[:, 'question1_summerized'] = dataset['question1']
    dataset.loc[:, 'question2_summerized'] = dataset['question2']
    for idx, row in dataset.iterrows():
        dataset.at[idx, 'question1_summerized'] = summerizeWordVecs(row['question1_tokenized'])
        dataset.at[idx, 'question2_summerized'] = summerizeWordVecs(row['question2_tokenized'])

In [32]:
from scipy import spatial

def find_cosine_similarity(value1, value2):
    productValue = sum(value1*value2)
    if(productValue != 0):
        return 1 - spatial.distance.cosine(value1, value2)
    else:
        return 0

In [33]:
for dataset in [train_data, test_data]:
    cs_w2v = list(range(len(dataset)))
    cs_bow = list(range(len(dataset)))
    for idx, row in dataset.iterrows():
        cs_w2v[idx] = find_cosine_similarity(row['question1_summerized'], row['question2_summerized'])
        if((len(row['question1_bag_vector']) == 0) | (len(row['question2_bag_vector']) == 0)):
            cs_bow[idx] = 0  
        else:
            cs_bow[idx] = find_cosine_similarity(np.array(row['question1_bag_vector']).mean(axis=0), np.array(row['question2_bag_vector']).mean(axis=0))
    dataset.loc[:, 'cosine_sim_w2v'] = cs_w2v
    dataset.loc[:, 'cosine_sim_bow'] = cs_bow

In [35]:
train_data.to_csv('preprocessed_train_data_1.csv')
test_data.to_csv('preprocessed_test_data_1.csv')

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("preprocessed_train_data.csv")
test_data = pd.read_csv("preprocessed_test_data.csv")

In [2]:
features = ['similarity']

In [30]:
features = ['cosine_sim_w2v','cosine_sim_bow','edit_distance','jacobi_distance']

In [3]:
from sklearn.model_selection import train_test_split
train_set, valid_set, train_labels, valid_labels = train_test_split(train_data[features], train_data['is_duplicate'], test_size=0.3, random_state=4327)

In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from xgboost import XGBClassifier

svm_clf = SVC(probability=True)
svm_clf.fit(train_set, train_labels)

#randforest_clf = RandomForestClassifier(n_estimators=300,min_samples_split=3,oob_score=True,n_jobs=-1, random_state = 4273, max_depth=5, bootstrap=True)
#randforest_clf.fit(train_set, train_labels)

#adaboost_clf = AdaBoostClassifier()
#adaboost_clf.fit(train_set, train_labels)

#xgboost_clf = XGBClassifier()
#xgboost_clf.fit(train_set, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
from sklearn.metrics import classification_report

predict_valid_set = svm_clf.predict(valid_set)
predict_valid_set = predict_valid_set.reshape(len(predict_valid_set),1)
print("Training error")
print(classification_report(train_labels, svm_clf.predict(train_set)))
print("Validation error")
print(classification_report(valid_labels, predict_valid_set))

Training error
             precision    recall  f1-score   support

          0       0.70      0.82      0.75     21959
          1       0.57      0.40      0.47     13041

avg / total       0.65      0.67      0.65     35000

Validation error
             precision    recall  f1-score   support

          0       0.70      0.82      0.76      9392
          1       0.58      0.40      0.47      5608

avg / total       0.65      0.67      0.65     15000



In [18]:
from sklearn import model_selection

kfold = model_selection.KFold(n_splits=10, random_state=6)

scoring = 'accuracy'
results = model_selection.cross_val_score(svm_clf, train_set, train_labels, cv=kfold, scoring= scoring)

print("Accuracy: %.3f"%results.mean())

results = model_selection.cross_val_score(svm_clf, valid_set, valid_labels, cv=kfold, scoring= scoring)
print("Accuracy: %.3f"%results.mean())

Accuracy: 0.665
Accuracy: 0.666


In [19]:
from sklearn import model_selection
scoring = 'neg_log_loss'

kfold = model_selection.KFold(n_splits=10, random_state=6)
results = model_selection.cross_val_score(svm_clf, train_set, train_labels, cv=kfold, scoring = scoring)

print("Logloss: %.3f"%results.mean())

results = model_selection.cross_val_score(svm_clf, valid_set, valid_labels, cv=kfold, scoring = scoring)

print("Logloss: %.3f"%results.mean())

Logloss: -0.599
Logloss: -0.596
