# Linear Models - Logistic Regression and SVM


Importing the necesssary libraries

In [1]:
import pandas as pd
import numpy as numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

Importing data

In [2]:
data = pd.read_csv('train.csv')
data = data.dropna(how="any").reset_index(drop=True)

### LR with Unigrams

Obtaining unigram counts using CountVectorizer. Skipping preprocessing (tokenization and removal of ASCII characters) as CountVectorizer does it internally

In [3]:
cnt = CountVectorizer(analyzer='word', ngram_range=(1,1), lowercase=True)
cnt.fit(pd.concat((data['question1'],data['question2'])))
q1 = data['question1']
q2 = data['question2']
q1 = cnt.transform(q1)
q2 = cnt.transform(q2)

In [4]:
import scipy
xx = scipy.sparse.hstack((q1,q2))
yy = data['is_duplicate']
x_train_u, x_test_u, y_train_u, y_test_u = train_test_split(xx, yy,  test_size=0.30, random_state=25)
x_val_u, x_test_u, y_val_u, y_test_u = train_test_split(x_test_u, y_test_u ,test_size=float(1/3), random_state=25)


In [5]:
uni = SGDClassifier(penalty='l2',alpha=0.00001, n_iter_no_change=20)
uni.fit(x_train_u,y_train_u)
y_pred = uni.predict(x_val_u)
acc = accuracy_score(y_val_u, y_pred)
print(acc)
f1 = f1_score(y_val_u,y_pred)
print(f1)

0.7489673254347127
0.6449412258606214


In [6]:
uni = SGDClassifier(penalty='l2',alpha=0.00001, n_iter_no_change=20)
uni.fit(x_train_u,y_train_u)
y_pred = uni.predict(x_test_u)
acc = accuracy_score(y_test_u, y_pred)
print(acc)
f1 = f1_score(y_test_u,y_pred)
print(f1)

0.7473843033466077
0.633430242991996


### LR with Bigrams

In [7]:
data2 = pd.read_csv('./train.csv')
data2 = data2.dropna(how="any").reset_index(drop=True)
cnt = CountVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True)
cnt.fit(pd.concat((data2['question1'],data2['question2'])))
q1 = data2['question1']
q2 = data2['question2']
q1 = cnt.transform(q1)
q2 = cnt.transform(q2)

In [8]:
xx = scipy.sparse.hstack((q1,q2))
yy = data2['is_duplicate']
x_train_b, x_test_b, y_train_b, y_test_b = train_test_split(xx, yy,  test_size=0.30, random_state=25)
x_val_b, x_test_b, y_val_b, y_test_b = train_test_split(x_test_b, y_test_b ,test_size=float(1/3), random_state=25)


In [9]:
bi = SGDClassifier(penalty='l2',alpha=0.00001, n_iter_no_change=20)
bi.fit(x_train_b,y_train_b)
y_pred = bi.predict(x_val_b)
acc = accuracy_score(y_val_b, y_pred)
print(acc)
f1 = f1_score(y_val_b,y_pred)
print(f1)

0.7803185831952312
0.7010652799515323


In [10]:
bi = SGDClassifier(penalty='l2',alpha=0.00001, n_iter_no_change=20)
bi.fit(x_train_b,y_train_b)
y_pred = bi.predict(x_test_b)
acc = accuracy_score(y_test_b, y_pred)
print(acc)
f1 = f1_score(y_test_b,y_pred)
print(f1)

0.7767196814168048
0.7007260551006199


### LR with Trigrams

In [11]:
data3 = pd.read_csv('./train.csv')
data3 = data2.dropna(how="any").reset_index(drop=True)
cnt = CountVectorizer(analyzer='word', ngram_range=(1,3), lowercase=True)
cnt.fit(pd.concat((data3['question1'],data3['question2'])))
q1 = data3['question1']
q2 = data3['question2']
q1 = cnt.transform(q1)
q2 = cnt.transform(q2)
xx = scipy.sparse.hstack((q1,q2))
yy = data3['is_duplicate']

In [12]:

x_train_t, x_test_t, y_train_t, y_test_t = train_test_split(xx, yy,  test_size=0.30, random_state=25)
x_val_t, x_test_t, y_val_t, y_test_t = train_test_split(x_test_t, y_test_t ,test_size=float(1/3), random_state=25)


In [13]:
tri = SGDClassifier(penalty='l2',alpha=0.00001, n_iter_no_change=20)
tri.fit(x_train_t,y_train_t)
y_pred = tri.predict(x_test_t)
acc = accuracy_score(y_test_t, y_pred)
print(acc)
f1 = f1_score(y_test_t,y_pred)
print(f1)

0.7966806005590047
0.7177197802197802


In [14]:
tri = SGDClassifier(penalty='l2',alpha=0.0001, n_iter_no_change=20)
tri.fit(x_train_t,y_train_t)
y_pred = tri.predict(x_val_t)
acc = accuracy_score(y_val_t, y_pred)
print(acc)
f1 = f1_score(y_val_t,y_pred)
print(f1)

0.8020480348264859
0.7016700216208156


α = 0.0001 gives an accuracy of 80.13% and F score = 70.32. Highest F-score = 71.32 for α = 0.00001

In [15]:
tri = SGDClassifier(penalty='l2',alpha=0.0001, n_iter_no_change=50)
tri.fit(x_train_t,y_train_t)
y_pred = tri.predict(x_val_t)
acc = accuracy_score(y_val_t, y_pred)
print(acc)
f1 = f1_score(y_val_t,y_pred)
print(f1)

0.802320116747879
0.7025568499013733


Most accuracy for n = 15

### LR with Trigrams, tuned

In [16]:
tri_t = SGDClassifier(penalty='l2',alpha=0.0001, n_iter_no_change=15)
tri_t.fit(x_train_t,y_train_t)
y_pred = tri_t.predict(x_test_t)
acc = accuracy_score(y_test_t, y_pred)
print(acc)
f1 = f1_score(y_test_t,y_pred)
print(f1)

0.8018006876252195
0.7011970019017788


## SVM


In [17]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import normalize

x_test_u1 = normalize(x_test_u, norm='l1', axis=1)
x_train_u1 = normalize(x_train_u, norm='l1', axis=1)

uni = LinearSVC(C=1)
uni.fit(x_train_u1,y_train_u)
y_pred = uni.predict(x_test_u1)
acc = accuracy_score(y_test_u, y_pred)
print(acc)
f1 = f1_score(y_test_u,y_pred)
print(f1)


0.7491157337554726
0.5994234034990719


### Unigram

In [18]:


x_val_u1 = normalize(x_val_u, norm='l1', axis=1) #normalisation
x_train_u1 = normalize(x_train_u, norm='l1', axis=1)

uni = LinearSVC(C=50, max_iter=1000)
uni.fit(x_train_u1,y_train_u)
y_pred = uni.predict(x_val_u1)
acc = accuracy_score(y_val_u, y_pred)
print(acc)
f1 = f1_score(y_val_u,y_pred)
print(f1)


0.7562145984318187
0.6493150684931507




### Bigram

In [19]:
x_test_b1 = normalize(x_test_b, norm='l1', axis=1)
x_train_b1 = normalize(x_train_b, norm='l1', axis=1)

uni = LinearSVC(C=1, max_iter=1000)
uni.fit(x_train_b1,y_train_b)
y_pred = uni.predict(x_test_b1)
acc = accuracy_score(y_test_b, y_pred)
print(acc)
f1 = f1_score(y_test_b,y_pred)
print(f1)


0.7577481510796705
0.6045704134366924


In [20]:
x_val_b1 = normalize(x_val_b, norm='l1', axis=1)
x_train_b1 = normalize(x_train_b, norm='l1', axis=1)

bi = LinearSVC(C=50)
bi.fit(x_train_b1,y_train_b)
y_pred = bi.predict(x_val_b1)
acc = accuracy_score(y_val_b, y_pred)
print(acc)
f1 = f1_score(y_val_b,y_pred)
print(f1)

0.7989809295307824
0.7078510317060895




### Trigram

Hyperparameter tuning on the validation dataset

In [21]:

x_val_t1 = normalize(x_val_t, norm='l1', axis=1)
x_train_t1 = normalize(x_train_t, norm='l1', axis=1)

tri = LinearSVC(C=50)
tri.fit(x_train_t1,y_train_t)
y_pred = tri.predict(x_val_t1)
acc = accuracy_score(y_val_t, y_pred)
print(acc)
f1 = f1_score(y_val_t,y_pred)
print(f1)

0.8092829404635287
0.7170925902144601




Test set with tuning

In [22]:
from sklearn.metrics import confusion_matrix
x_test_t1 = normalize(x_test_t, norm='l1', axis=1)
x_train_t1 = normalize(x_train_t, norm='l1', axis=1)

tri = LinearSVC(C=50)
tri.fit(x_train_t1,y_train_t)
y_pred = tri.predict(x_test_t1)
acc = accuracy_score(y_test_t, y_pred)
print(acc)
f1 = f1_score(y_test_t,y_pred)
print(f1)

0.8109772687922037
0.7189614592527215


