In [1]:
import pandas as pd
import numpy as np
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv(r"../short-text-similarity/data/train.csv")
data.head()

Unnamed: 0,pid,sentence1,sentence2,label
0,4665,do you accept broadband bills,Broadband bill,1
1,26465,I am getting better fare for this flight,Check flight rate,0
2,57679,Flight from delhi to srinagar,check price for flight from mumbai to pune,1
3,22043,Availability of food,food available for guest,1
4,6522,This is my ID only,Can't find order ID.,0


In [3]:
def cleaner(sent):
    sent = sent.lower()
    stop_words = list(punctuation)
    tokens = word_tokenize(sent)
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    stem_tokens = [stemmer.stem(token) for token in tokens]
    sent = ' '.join(stem_tokens)
    
    return sent

In [4]:
data['clean_sentence1'] = data['sentence1'].apply(cleaner)
data['clean_sentence2'] = data['sentence2'].apply(cleaner)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data[['clean_sentence1', 'clean_sentence2']], data['label'], 
                                                    test_size=0.33, random_state=42)

In [6]:
%%time
tfidf_sent1 = TfidfVectorizer()
tfidf_sent2 = TfidfVectorizer()
tfidf_vect_sent1_train = tfidf_sent1.fit_transform(X_train['clean_sentence1'])
tfidf_vect_sent2_train = tfidf_sent2.fit_transform(X_train['clean_sentence2'])
tfidf_vect_train = np.concatenate([tfidf_vect_sent1_train.toarray(), tfidf_vect_sent2_train.toarray()], axis=1)

Wall time: 1.15 s


In [7]:
print('Shape of Tf-Idf vector of sentence1 :', tfidf_vect_sent1_train.toarray().shape)
print('Shape of Tf-Idf vector of sentence2 :', tfidf_vect_sent2_train.toarray().shape)
print('Shape of Tf-Idf vector of cobination of both sentences :', tfidf_vect_train.shape)

Shape of Tf-Idf vector of sentence1 : (11419, 1877)
Shape of Tf-Idf vector of sentence2 : (11419, 1916)
Shape of Tf-Idf vector of cobination of both sentences : (11419, 3793)


In [8]:
tfidf_vect_sent1_test = tfidf_sent1.transform(X_test['clean_sentence1'])
tfidf_vect_sent2_test = tfidf_sent2.transform(X_test['clean_sentence2'])
tfidf_vect_test = np.concatenate([tfidf_vect_sent1_test.toarray(), tfidf_vect_sent2_test.toarray()], axis=1)

In [9]:
print('Shape of Tf-Idf vector of sentence1 :', tfidf_vect_sent1_test.toarray().shape)
print('Shape of Tf-Idf vector of sentence2 :', tfidf_vect_sent2_test.toarray().shape)
print('Shape of Tf-Idf vector of cobination of both sentences :', tfidf_vect_test.shape)

Shape of Tf-Idf vector of sentence1 : (5625, 1877)
Shape of Tf-Idf vector of sentence2 : (5625, 1916)
Shape of Tf-Idf vector of cobination of both sentences : (5625, 3793)


In [10]:
%%time
log_clf = LogisticRegression(random_state=0).fit(tfidf_vect_train, y_train)
log_pred = log_clf.predict(tfidf_vect_test)



Wall time: 570 ms


In [11]:
log_f1 = f1_score(y_test, log_pred, average='weighted')
print('F1 score using Logistic Regression :', log_f1)
print('Confusion Matrix using Logistic Regression :')
print(confusion_matrix(y_test, log_pred))

F1 score using Logistic Regression : 0.8049446174714989
Confusion Matrix using Logistic Regression :
[[3974  211]
 [ 792  648]]


## Applying Logistic Regression on test Data

In [12]:
data_test = pd.read_csv(r"../short-text-similarity/data/test.csv")
data_test.head()

Unnamed: 0,pid,sentence1,sentence2
0,54615,As an investor how do I avail benefit with res...,Stamp Duty
1,53768,cash back nahi aaya,Muje cash back nahi mila
2,26567,LPG Storage Sytem Warranty,Is there any license required for LPG storage ...
3,4028,i want discount on flight tickets,I want discounts on train booking
4,5145,nearby bus stop,bus stop


In [13]:
data_test['clean_sentence1'] = data_test['sentence1'].apply(cleaner)
data_test['clean_sentence2'] = data_test['sentence2'].apply(cleaner)

In [14]:
tfidf_vect_sent1_test_data = tfidf_sent1.transform(data_test['clean_sentence1'])
tfidf_vect_sent2_test_data = tfidf_sent2.transform(data_test['clean_sentence2'])
tfidf_vect_test_data = np.concatenate([tfidf_vect_sent1_test_data.toarray(), tfidf_vect_sent2_test_data.toarray()], axis=1)

In [15]:
print('Shape of Tf-Idf vector of sentence1 :', tfidf_vect_sent1_test_data.toarray().shape)
print('Shape of Tf-Idf vector of sentence2 :', tfidf_vect_sent2_test_data.toarray().shape)
print('Shape of Tf-Idf vector of cobination of both sentences :', tfidf_vect_test_data.shape)

Shape of Tf-Idf vector of sentence1 : (7305, 1877)
Shape of Tf-Idf vector of sentence2 : (7305, 1916)
Shape of Tf-Idf vector of cobination of both sentences : (7305, 3793)


In [16]:
log_pred_test_data = log_clf.predict(tfidf_vect_test_data)

In [18]:
submit_df = pd.DataFrame(data = data_test['pid'], columns = ['pid'])

In [19]:
submit_df['label'] = log_pred_test_data

In [20]:
# submit_df.to_csv(r"../short-text-similarity/output/base_logreg.csv", index = False)

Let's built vectors considering context

In [21]:
%%time
tfidf_bigram_sent1 = TfidfVectorizer(ngram_range = (1, 2))
tfidf_bigram_sent2 = TfidfVectorizer(ngram_range = (1, 2))
tfidf_bigram_sent1_train = tfidf_bigram_sent1.fit_transform(X_train['clean_sentence1'])
tfidf_bigram_sent2_train = tfidf_bigram_sent2.fit_transform(X_train['clean_sentence2'])
tfidf_bigram_train = np.concatenate([tfidf_bigram_sent1_train.toarray(), tfidf_bigram_sent2_train.toarray()], axis=1)

Wall time: 4.12 s


In [22]:
print('Shape of Tf-Idf bigram vector of sentence1 :', tfidf_bigram_sent1_train.toarray().shape)
print('Shape of Tf-Idf bigram vector of sentence2 :', tfidf_bigram_sent2_train.toarray().shape)
print('Shape of Tf-Idf bigram vector of cobination of both sentences :', tfidf_bigram_train.shape)

Shape of Tf-Idf bigram vector of sentence1 : (11419, 9577)
Shape of Tf-Idf bigram vector of sentence2 : (11419, 9878)
Shape of Tf-Idf bigram vector of cobination of both sentences : (11419, 19455)


In [23]:
tfidf_bigram_sent1_test = tfidf_bigram_sent1.transform(X_test['clean_sentence1'])
tfidf_bigram_sent2_test = tfidf_bigram_sent2.transform(X_test['clean_sentence2'])
tfidf_bigram_test = np.concatenate([tfidf_bigram_sent1_test.toarray(), tfidf_bigram_sent2_test.toarray()], axis=1)

In [24]:
print('Shape of Tf-Idf bigram vector of sentence1 :', tfidf_bigram_sent1_test.toarray().shape)
print('Shape of Tf-Idf bigram vector of sentence2 :', tfidf_bigram_sent2_test.toarray().shape)
print('Shape of Tf-Idf bigram vector of cobination of both sentences :', tfidf_bigram_test.shape)

Shape of Tf-Idf bigram vector of sentence1 : (5625, 9577)
Shape of Tf-Idf bigram vector of sentence2 : (5625, 9878)
Shape of Tf-Idf bigram vector of cobination of both sentences : (5625, 19455)


In [25]:
%%time
log_clf_bigram = LogisticRegression(random_state=0).fit(tfidf_bigram_train, y_train)
log_bigram_pred = log_clf_bigram.predict(tfidf_bigram_test)



Wall time: 1.54 s


In [26]:
log_f1 = f1_score(y_test, log_bigram_pred, average='weighted')
print('F1 score using Logistic Regression :', log_f1)
print('Confusion Matrix using Logistic Regression :')
print(confusion_matrix(y_test, log_bigram_pred))

F1 score using Logistic Regression : 0.8332107531889804
Confusion Matrix using Logistic Regression :
[[4069  116]
 [ 735  705]]


After consider context (Bigram) we can see 3% boost in F1 Score.

In [27]:
%%time
deci_clf_bigram = DecisionTreeClassifier(random_state=0).fit(tfidf_bigram_train, y_train)
deci_bigram_pred = deci_clf_bigram.predict(tfidf_bigram_test)

Wall time: 10min 52s


In [28]:
deci_f1 = f1_score(y_test, deci_bigram_pred, average='weighted')
print('F1 score using Decision Tree Classifier :', deci_f1)
print('Confusion Matrix using Decision Tree Classifier :')
print(confusion_matrix(y_test, deci_bigram_pred))

F1 score using Decision Tree Classifier : 0.8562878301322314
Confusion Matrix using Decision Tree Classifier :
[[3845  340]
 [ 457  983]]


## Let's predict labels on Test Data using Decision Tree

In [29]:
tfidf_vect_sent1_test_data = tfidf_bigram_sent1.transform(data_test['clean_sentence1'])
tfidf_vect_sent2_test_data = tfidf_bigram_sent2.transform(data_test['clean_sentence2'])
tfidf_vect_test_data = np.concatenate([tfidf_vect_sent1_test_data.toarray(), tfidf_vect_sent2_test_data.toarray()], axis=1)

In [30]:
print('Shape of Tf-Idf vector of sentence1 :', tfidf_vect_sent1_test_data.toarray().shape)
print('Shape of Tf-Idf vector of sentence2 :', tfidf_vect_sent2_test_data.toarray().shape)
print('Shape of Tf-Idf vector of cobination of both sentences :', tfidf_vect_test_data.shape)

Shape of Tf-Idf vector of sentence1 : (7305, 9577)
Shape of Tf-Idf vector of sentence2 : (7305, 9878)
Shape of Tf-Idf vector of cobination of both sentences : (7305, 19455)


In [31]:
deci_bigram_pred_test_data = deci_clf_bigram.predict(tfidf_vect_test_data)

In [32]:
submit_df = pd.DataFrame(data = data_test['pid'], columns = ['pid'])
submit_df['label'] = deci_bigram_pred_test_data
# submit_df.to_csv(r"../short-text-similarity/output/deci_bigram.csv", index = False)

In [35]:
# data.to_csv(r"../short-text-similarity/data/clean_data.csv", index = False)