#### Importing the necessary libraries

In [1]:
import re
import csv
import string
import nltk
import pandas as pd
import numpy as np
from numpy import array
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

#### Reading the files

In [2]:
train = pd.read_csv(r"C:\Users\sruth\Desktop\SEM 2\Text Analytics\train.csv")
test = pd.read_csv(r"C:\Users\sruth\Desktop\SEM 2\Text Analytics\test.csv")

In [3]:
train.shape
test.shape

(10000, 4)

#### Stopwords list

In [4]:
list_stop_words = set(stopwords.words('english'))

#### Data Cleaning function - Converted to lower case, removed white spaces, punctuations, special characters and stopwords

In [5]:
def dataclean (sentimenttext):
    # Convert to lower case
    sentimenttext = sentimenttext.lower()

    # Remove www.* or https?://*
    sentimenttext = re.sub('((www\.[^\s]+)|(https?://[^\s]+))|(http?://[^\s]+)', '', sentimenttext)

    # Remove additional white spaces
    sentimenttext = re.sub('[\s]+', ' ', sentimenttext)

    # Replace #word with word and remove @word 
    sentimenttext = re.sub(r'#([^\s]+)', r'\1', sentimenttext)
    sentimenttext = re.sub(r'@([^\s]+)', '', sentimenttext) 

    # Remove special characters, punctuations and replacing apostrophe
    sentimenttext = re.sub('&[a-z]+;', '', sentimenttext)
    sentimenttext = re.sub(r'\d+', '', sentimenttext)
    sentimenttext = sentimenttext.translate(str.maketrans(string.punctuation,' '*len(string.punctuation)))
    sentimenttext = sentimenttext.replace("'", "")

    # Remove stopwords and trim
    sentimenttext = " ".join([word for word in sentimenttext.split(" ") if word not in list_stop_words])
    sentimenttext = sentimenttext.strip()

    return sentimenttext

#### Data Cleaning on train and test data

In [6]:
train["SentimentText"] = train["SentimentText"].apply(dataclean)
test["SentimentText"] = test["SentimentText"].apply(dataclean)

In [7]:
#Dropped the unnecessary columns
train = train.drop(columns=["SentimentSource"])
train = train.drop(columns=["ItemID"])

test = test.drop(columns=["SentimentSource"])
test = test.drop(columns=["ItemID"])

train_label = train["Sentiment"]
test_label = test["Sentiment"]

#### TF-IDF matrix

In [8]:
# Create a TF-IDF matrix using TfidfVectorizer. 
# On local machine we took 3000 features but on google colab we took 10000 features and ran the code

vectorizer = TfidfVectorizer(max_features = 3000)
tfidf_train = vectorizer.fit_transform(train['SentimentText']).toarray()
tfidf_test = vectorizer.transform(test['SentimentText']).toarray()

#### Logistic Regression

In [9]:
#Logistic Regression 

#sigmoid function
def sigmoid(X, weight):
    z = np.dot(X, weight)
    return 1 / (1 + np.exp(-z))

#calculating the gradient
def gradient_descent(X, h, y):
    return np.dot(X.T, (h - y)) / y.shape[0]

#updating the weights which is weight - learning rate multiplied by gradient
def updated_weight(weight, learning_rate, gradient):
    return weight - learning_rate * gradient

def predict(x, theta):
    theta_new = theta[:, np.newaxis]
    return sigmoid(x,theta_new)


# Iterate and learn the parameters
def gradient(X, y):
    num_iter = 100
    theta = np.zeros(X.shape[1])
    for i in range(num_iter):
        h = sigmoid(X, theta)
        gradient = gradient_descent(X, h, y)
        theta = updated_weight(theta, 0.1, gradient)
    return theta


# The avg probability for 0 and 1 in the training data is around 0.5 so we set the threshold for classification at this mean.          
def accuracy_score(actual, pred):
    predicted_class = ((pred >= 0.5) .astype(int))
    predicted_class = predicted_class.flatten()
    acc = np.mean(predicted_class == actual)
    return acc

#### Cross Validation

In [10]:
# Initialize 10-fold cross validation
kfold = KFold(10, True, 1)
bestaccuracy = 0
scores = np.array([])
theta_final = np.zeros(tfidf_train.shape[1])

#Splitting the train data to train and validation and Iterating over folds.
for train, test in kfold.split(tfidf_train):
    X_train = tfidf_train[train]
    X_test = tfidf_train[test]    
    Y_train = train_label[train]
    Y_test = train_label[test]


    theta_out = gradient(X_train, Y_train)
    pred = predict(X_test, theta_out)
    acc_score = accuracy_score(Y_test, pred)

#Accuracy for each fold
    scores = np.append(scores, acc_score)
    
    if(acc_score > bestaccuracy):
        theta_final = theta_out
        bestaccuracy = acc_score

print("Accuracy over 10-fold cross validations are: ", scores)
print("Mean accuracy: ", scores.mean())

Accuracy over 10-fold cross validations are:  [0.70466667 0.70622222 0.69888889 0.70077778 0.69744444 0.69566667
 0.69777778 0.69277778 0.7        0.70255556]
Mean accuracy:  0.6996777777777778


####  predicting on test data

In [11]:
test_predicted = predict(tfidf_test, theta_final)

#Accuracy on test data
accuracy_test = accuracy_score(test_label, test_predicted)
accuracy_test

0.7094

#### Performance (precision, recall) of the model on  test dataset.
#### Confusion Matrix

In [12]:
# Create confusion matrix
confusionMatrix = pd.DataFrame(data = confusion_matrix(test_label, (test_predicted >= 0.5) .astype(int)), columns=["0", "1"], index = ["0", "1"])
print(confusionMatrix)

      0     1
0  1763  2243
1   663  5331


#### Precision value

In [13]:
# Precision = TP/(TP + FP)
precision = round((confusionMatrix.iloc[1, 1] / (confusionMatrix.iloc[1, 1] + confusionMatrix.iloc[0, 1])) * 100, 2)
print("Precision: ", precision)

Precision:  70.39


#### Recall value

In [14]:
# Recall = TP/(TP + FN)
recall = round((confusionMatrix.iloc[1, 1] / (confusionMatrix.iloc[1, 1] + confusionMatrix.iloc[1, 0])) * 100, 2)
print("Recall: ", recall)

Recall:  88.94
