In [2]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import simhash
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.tokenize import word_tokenize

In [4]:
# Loading the training and testing data
df = pd.read_csv('Data/train.csv')
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)
from sklearn.model_selection import train_test_split
seed = 123
train, test = train_test_split(df)


In [5]:
def gen_accuracy(y_pred, y_actual):
    """Function to calculate the accuracy of a model, returns the accuracy
        Args: 
            y_pred: predicted values
            y_actual: actual values"""
    
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_actual[i]:
            count = count+1
    return (count/len(y_pred))*100

In [6]:
def write_csv(result, id_col, path):
    result_final = pd.DataFrame(data = list(zip(id_col, result)), columns = ['id', 'prediction'])
    result_final.to_csv(path, index = False)

## Simhash

In [5]:
def ham_dist(text1, text2):
    return simhash.Simhash(text1, f = 8).distance(simhash.Simhash(text2, f = 8))

In [6]:
train['sim_dist'] = train.apply(lambda x: ham_dist(x['question1'], x['question2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
train.groupby('is_duplicate').agg('mean')['sim_dist']

is_duplicate
0    3.104921
1    2.578291
Name: sim_dist, dtype: float64

In [8]:
test['sim_dist'] = test.apply(lambda x: ham_dist(x['question1'], x['question2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
result_df = []
for i in test['sim_dist']:
    if i > 2.5:
        result_df.append(0)
    else:
        result_df.append(1)

In [23]:
count = 0
for i in range(len(result_df)):
    if result_df[i] == test['is_duplicate'].to_numpy()[i]:
        count = count +1

In [24]:
count/len(result_df)

0.5908204960770927

In [25]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_df, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_df, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_df, average = 'weighted')))

f1_socre:0.5948597234765983
recall_socre:0.5908204960770927
precision_socre:0.6009494323019785


In [39]:
write_csv(result_df, test['id'], 'results/NLP/quora/simhash.csv')

## NER similarity

In [26]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    if union== 0:
        return 0
    else:
        return float(intersection) / union
    
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
            if type(i) == Tree:
                    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
            elif current_chunk:
                    named_entity = " ".join(current_chunk)
                    if named_entity not in continuous_chunk:
                            continuous_chunk.append(named_entity)
                            current_chunk = []
            else:
                    continue
    return continuous_chunk

In [27]:
train['ne1'] = train.apply(lambda x:get_continuous_chunks(x['question1']), axis = 1)
train['ne2'] = train.apply(lambda x: get_continuous_chunks(x['question2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
train['jac_sim'] = train.apply(lambda x: jaccard_similarity(x['ne1'], x['ne2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
train.groupby('is_duplicate').agg('mean')['jac_sim']

is_duplicate
0    0.125064
1    0.209102
Name: jac_sim, dtype: float64

In [30]:
test['ne1'] = test.apply(lambda x:get_continuous_chunks(x['question1']), axis = 1)
test['ne2'] = test.apply(lambda x: get_continuous_chunks(x['question2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [31]:
test['jac_sim'] = test.apply(lambda x: jaccard_similarity(x['ne1'], x['ne2']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [32]:
result_ner = []
for i in test['jac_sim']:
    if i > .21:
        result_ner.append(1)
    else:
        result_ner.append(0)

In [34]:
count = 0
for i in range(len(result_ner)):
    if result_ner[i] == test['is_duplicate'].to_numpy()[i]:
        count = count +1

In [35]:
count/len(result_ner)

0.615505624647532

In [37]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_ner, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_ner, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_ner, average = 'weighted')))

f1_socre:0.5822150166370247
recall_socre:0.615505624647532
precision_socre:0.583241235522903


In [40]:
write_csv(result_ner, test['id'], 'results/NLP/quora/ner.csv')

## Cosine Distance

In [7]:
#training and testing
q = list(train['question1']) + list(train['question2']) + list(test['question1']) + list(test['question2'])

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)

vectorizer.fit(q)

#train_sent1 = train['#1 String'] + " " + train['#2 String']
#test_sent = test['#1 String']+ " " + test['#2 String']

# sent_vect_train1 = vectorizer.transform(train['#1 String'])
# sent_vect_train2 = vectorizer.transform(train['#2 String'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=30000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [8]:
def cosine_sim(text1, text2):
    
    tfidf = vectorizer.transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]


In [9]:
train['cosine_sim'] = train.apply(lambda x: cosine_sim(x['question1'], x['question2']), axis= 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
train.groupby('is_duplicate').agg('mean')['cosine_sim']

is_duplicate
0    0.420365
1    0.633807
Name: cosine_sim, dtype: float64

In [11]:
test['cosine_sim'] = test.apply(lambda x: cosine_sim(x['question1'], x['question2']), axis= 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
result_cosine = []
for i in test['cosine_sim'].to_numpy():
    if i> .63:
        result_cosine.append(1)
    else:
        result_cosine.append(0)

In [14]:
count = 0
for i in range(len(result_cosine)):
    if result_cosine[i] == test['is_duplicate'].to_numpy()[i]:
        count = count+1

In [15]:
count/len(result_cosine)

0.65315168244734

In [16]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_cosine, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_cosine, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_cosine, average = 'weighted')))

f1_socre:0.6508357523423277
recall_socre:0.65315168244734
precision_socre:0.6491201401116313


In [17]:
write_csv(result_cosine, test['id'], 'results/NLP/quora/cosine.csv')