<a href="https://colab.research.google.com/github/sruthi1014/Sentimental-analysis-of-Twitter-Data/blob/master/Twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import csv
import string
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
list_stop_words = set(stopwords.words('english'))

In [0]:
!pip install -q xlrd 
!git clone https://github.com/sruthi1014/Sentimental-analysis-of-Twitter-Data.git

In [0]:
file1= open(r"Sentimental-analysis-of-Twitter-Data/train.txt","rt", encoding="utf8")
train1=file1.read().lower()
file2=open(r"Sentimental-analysis-of-Twitter-Data/test.txt","rt", encoding="utf8")
test1=file2.read().lower()

#### Converting text to Dataframe

In [0]:
train=train1.splitlines()
y=[]
for i in train:
    y.append(i.split(",",3))
train_df=pd.DataFrame(y) 
train_df=train_df[[1,3]]
train_df[3]=train_df[3].str.lstrip()


In [0]:
test=test1.splitlines()
y=[]
for i in test:
    y.append(i.split(",",3))
test_df=pd.DataFrame(y) 
test_df=test_df[[1,3]]
test_df[3]=test_df[3].str.lstrip()

#### Data cleaning

In [0]:
# step a: function to remove punctuation/apostrophe
def removal(textfile):
    y=textfile[3]
    ps =SnowballStemmer("english")
    cleaned=[]
    for i in y:
        i.replace("'", " ")  #step 1: replacing apostraphe
        i=re.sub(r"\@\w+"," ",i)   #step2: removing words starting with @
        cleaned.append(i.translate(str.maketrans(string.punctuation,' '*len(string.punctuation)))) # step 3: remove_punctuation             
    
    # step4: removing stop words and perform stemming
    output=[]
    for i in cleaned:
        output.append(" ".join([ps.stem(w) for w in i.split()  if  w not in list_stop_words and w.isalpha()]))    

    return output

In [0]:
train_cleaned=pd.DataFrame(train_df[1])
train_cleaned.insert(1, 2, removal(train_df))
train_cleaned=train_cleaned.drop(train_cleaned.index[[0]]).reset_index(drop=True)

In [0]:
test_cleaned=pd.DataFrame(test_df[1])
test_cleaned.insert(1, 2, removal(test_df))
test_cleaned=test_cleaned.drop(test_cleaned.index[[0]]).reset_index(drop=True)

#### TF/IDF calculation

In [0]:
vectorizer=TfidfVectorizer(max_features=10000)#max_df=0.90, min_df=5)
tfidf_train = vectorizer.fit_transform(train_cleaned[2])
tfidf_test = vectorizer.transform(test_cleaned[2])

In [0]:

tfidf_train=pd.DataFrame(tfidf_train.todense(),columns=vectorizer.get_feature_names())
tfidf_test=pd.DataFrame(tfidf_test.todense(),columns=vectorizer.get_feature_names())

In [0]:
# Merge the sentiment column from the original training data and the calculated TF-IDF matrix
tfidf_train.insert(0,"target_var",pd.to_numeric(train_cleaned[1]))
tfidf_test.insert(0,"target_var",pd.to_numeric(test_cleaned[1]))

In [0]:
# Split training features and labels
traindata = tfidf_train.iloc[:, 1:]
trainlabel = pd.DataFrame(tfidf_train.iloc[:, 0])

# Split testing features and labels
testdata = tfidf_test.iloc[:, 1:]
testlabel = pd.DataFrame(tfidf_test.iloc[:, 0])

In [0]:
# Class for logistic regression
class LogisticRegression:
    def __init__(self):
        # Initialize learning rate and weights
        self.lr = 0.1
        self.theta = np.zeros(shape=(tfidf_train.shape[1]-1, 1))

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
       
      
        # Iterate and learn the parameters
        for i in range(1000):
            z = np.dot(X, self.theta)
            y_pred = self.sigmoid(z)
            y_pred = y_pred.reshape(y.shape)
            gradient = np.dot(X.T, (y_pred - y)) / y.size
            self.theta -= self.lr * gradient


    # Get the probabilities for predictions
    def predict_prob(self, X):
       # intercept = np.ones((X.shape[0], 1))
        return self.sigmoid(np.dot(X, self.theta))

    # Determine the label based on probabilities
    def predict(self, X):
        predictedLabels = np.array([])
        pred_probs = self.predict_prob(X)

        for prob in pred_probs:
            # The avg probability for 0 and 1 in the training data is around 0.5 so we set the threshold for classification at this mean
            if prob >= 0.55:
                predictedLabels = np.append(predictedLabels, 1)
            else:
                predictedLabels = np.append(predictedLabels, 0)

        return predictedLabels

In [0]:
# Array for storing accuracy scores
scores = np.array([])

In [19]:
trainlabel.to_numpy().shape

(90000, 1)

In [0]:
# Initialize 10-fold cross validation
kf = KFold(n_splits=10)

# Iterate over folds and call the classifier
for train_index, test_index in kf.split(traindata):
    x_train, x_test = traindata.iloc[train_index], traindata.iloc[test_index]
    y_train, y_test = trainlabel.iloc[train_index], trainlabel.iloc[test_index]
    logReg = LogisticRegression()
    logReg.fit(x_train.to_numpy(), y_train.to_numpy())
    pred = logReg.predict(x_test)

    scores = np.append(scores, accuracy_score(y_test, pred))

print("Accuracy over 10-fold cross validations are: ", scores)
print("Mean accuracy: ", scores.mean())

In [24]:
log=LogisticRegression()
test_predicted = log.predict(testdata)
#Accuracy on test data
accuracy_test = accuracy_score(testlabel, test_predicted)
accuracy_test

0.5994

In [26]:
# Create confusion matrix
confusionMatrix = pd.DataFrame(data = confusion_matrix(testlabel, (test_predicted >= 0.5) .astype(int)), columns=["0", "1"], index = ["0", "1"])
print(confusionMatrix)

   0     1
0  0  4006
1  0  5994


In [27]:
# Precision = TP/(TP + FP)
precision = round((confusionMatrix.iloc[1, 1] / (confusionMatrix.iloc[1, 1] + confusionMatrix.iloc[0, 1])) * 100, 2)
print("Precision: ", precision)

Precision:  59.94


In [28]:
# Recall = TP/(TP + FN)
recall = round((confusionMatrix.iloc[1, 1] / (confusionMatrix.iloc[1, 1] + confusionMatrix.iloc[1, 0])) * 100, 2)
print("Recall: ", recall)

Recall:  100.0
