In [4]:
# Importing required packages
# NumPy's random module is used to draw the random train/test split
# The data is imported as Pandas DataFrame
# The re package is used for Regex Tokenizer
# string.punctuation (a constant in the string module) supplies the ASCII punctuation characters
# To convert data into feature vectors CountVectorizer is used
# The machine learning model is Logistic Regression
import numpy as np
import pandas as pd
import re               
import string           
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.linear_model import LogisticRegression         

In [5]:
# Read the tab-separated file 'train.tsv' into a DataFrame and preview its first rows
train = pd.read_csv('train.tsv', delimiter='\t')
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# Randomly assign 90% of the rows to train_data and the remaining 10% to test_data.
# A dedicated, seeded generator makes the split (and every score derived from
# it) identical on each Restart & Run All, without touching NumPy's global
# random state.
rng = np.random.RandomState(42)
sample     = rng.choice(train.index, size = int(len(train)*0.9), replace = False)
# `sample` holds index *labels*, so select with .loc (label-based) to match the
# label-based .drop below; .iloc would only agree while the index is the
# default 0..n-1 range.
train_data = train.loc[sample]
test_data  = train.drop(sample)
# NOTE(review): rows are split individually, so phrases that share a SentenceId
# can land on both sides of the split — consider a grouped split if an
# unbiased estimate is needed.
print(train_data.shape)
print(test_data.shape)

(140454, 4)
(15606, 4)


In [7]:
# Regex tokenizer: every punctuation mark becomes a token of its own.
# re.escape keeps the ASCII punctuation literal inside the character class;
# interpolating it raw only worked because of where `\`, `]` and `-` happen to
# sit in string.punctuation.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split a string into word and punctuation tokens.

    Each character matched by ``re_tok`` is padded with a space on both sides,
    then the result is split on whitespace, so punctuation marks come back as
    separate one-character tokens.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [8]:
# Fit the vectorizer on the training phrases only, then encode both splits
# with that same fitted vectorizer.
# token_pattern=None: the custom tokenizer replaces the default token regex;
# stating that explicitly avoids scikit-learn's "token_pattern will not be
# used" warning.
count_vec = CountVectorizer(tokenizer=tokenize, token_pattern=None)
count_vec_train = count_vec.fit_transform(train_data['Phrase'])
count_vec_test = count_vec.transform(test_data['Phrase'])

In [11]:
# Train logistic regression on the training vectors with the built-in 'sag' solver.
# 'sag' is a stochastic solver, so random_state is pinned to make the fitted
# model (and the accuracy computed from it) repeatable between runs.
LogReg = LogisticRegression(max_iter = 10000, solver = 'sag', random_state = 42).fit(count_vec_train, train_data['Sentiment'])

In [12]:
# Accuracy on the test split: fraction of predictions equal to the true Sentiment
test_prediction = LogReg.predict(count_vec_test)
acc = test_data['Sentiment'].eq(test_prediction).mean()
print("The accuracy of the model is : ", acc*100, "%")

The accuracy of the model is :  65.79520697167756 %


In [9]:
# Comparing other solvers
def compare(sol, random_state = 42):
    """Fit logistic regression with one solver and report its test accuracy.

    Trains on the module-level ``count_vec_train`` / ``train_data`` and scores
    on ``count_vec_test`` / ``test_data``.

    Parameters
    ----------
    sol : str
        Solver name forwarded to ``LogisticRegression(solver=...)``.
    random_state : int, optional
        Seed forwarded to ``LogisticRegression`` so that solvers which use
        randomness give the same result on every run.

    Returns
    -------
    float
        Fraction of test rows predicted correctly (the printed value is this
        fraction times 100).
    """
    # Local names differ from the notebook-level LogReg / test_prediction to
    # avoid shadowing them inside the function.
    model = LogisticRegression(max_iter = 10000, solver = sol, random_state = random_state).fit(count_vec_train, train_data['Sentiment'])
    predicted = model.predict(count_vec_test)
    acc = (predicted == test_data['Sentiment']).mean()
    print("The accuracy of "+sol+" solver is : ", acc*100, "%")
    return acc

In [10]:
# Run the comparison once per solver name, in this fixed order
solvers = [
    'liblinear',
    'lbfgs',
    'newton-cg',
    'sag',
    'saga',
]
for s in solvers:
    compare(s)

The accuracy of liblinear solver is :  64.68025118544149 %
The accuracy of lbfgs solver is :  65.7759835960528 %
The accuracy of newton-cg solver is :  65.78879917980264 %
The accuracy of sag solver is :  65.79520697167756 %
The accuracy of saga solver is :  65.78879917980264 %
