# Sentiment Analysis Using Multinomial Logistic Regression

## Task 1: Import Libraries

In [3]:
import string
import numpy as np 
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

## Task 2: Load the Dataset

In [4]:
# Read the labelled tweets from the CSV file and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/usercode/Tweets.csv")
df.head()

Unnamed: 0,sentiment,tweet
0,neutral,Happy Monday tweeples... hope it wasn`t to har...
1,negative,I can`t believe I thought I had a morning shif...
2,neutral,So its Superstar Sunday? is one superstar I k...
3,positive,yay i hit 50 subscribers on youtube. go me lol.
4,neutral,Today = marking *135


## Task 3: Remove Punctuation from Tweets

In [10]:
def remove_punc(s):
    """Return ``s`` with every character found in ``string.punctuation`` dropped.

    All remaining characters (letters, digits, whitespace, ...) are kept in
    their original order, so removing a symbol between two spaces leaves both
    spaces in place.
    """
    kept_chars = [ch for ch in s if ch not in string.punctuation]
    return ''.join(kept_chars)

# Strip punctuation from the tweet text. The column is addressed by its label
# (as the later cells do) rather than by position, so this keeps working even
# if the column order of the DataFrame changes.
df['tweet'] = df['tweet'].apply(remove_punc)
print(df.head())
    

  sentiment                                              tweet
0   neutral  Happy Monday tweeples hope it wasnt to hard to...
1  negative  I cant believe I thought I had a morning shift...
2   neutral  So its Superstar Sunday  is one superstar I kn...
3  positive     yay i hit 50 subscribers on youtube go me  lol
4   neutral                                 Today  marking 135


## Task 4: Split Tweets into a Bag of Words

In [11]:
# Split every tweet on whitespace so the column holds a list of words per row.
df['tweet'] = df['tweet'].map(str.split)
df.head()

Unnamed: 0,sentiment,tweet
0,neutral,"[Happy, Monday, tweeples, hope, it, wasnt, to,..."
1,negative,"[I, cant, believe, I, thought, I, had, a, morn..."
2,neutral,"[So, its, Superstar, Sunday, is, one, supersta..."
3,positive,"[yay, i, hit, 50, subscribers, on, youtube, go..."
4,neutral,"[Today, marking, 135]"


## Task 5: Create a Vocabulary and Remove Stop Words

In [14]:
# Count how often each lower-cased word occurs across all tweets.
word_counts = {}
for tokens in df['tweet']:
    for token in tokens:
        key = token.lower()
        word_counts[key] = word_counts.get(key, 0) + 1

# Rank the words from most to least frequent (ties keep first-seen order) and
# drop the 100 most frequent ones, which this notebook treats as stop words.
# Despite its name, `vocabulary_dict` ends up as a plain list of words.
ranked_words = sorted(word_counts, key=word_counts.get, reverse=True)
vocabulary_dict = ranked_words[100:]

## Task 6: Create Feature Vectors

In [16]:
# CountVectorizer takes one string per document, so rebuild each tweet from
# its lower-cased tokens.
tweets = [
    ' '.join(token.lower() for token in tokens)
    for tokens in df['tweet']
]

# Count occurrences of the words in the fixed vocabulary built earlier.
vectorizer = CountVectorizer(vocabulary=vocabulary_dict)
tweet_vectors = vectorizer.fit_transform(tweets)

# Convert the vectorizer output to a dense NumPy array and show it.
X = tweet_vectors.toarray()
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


## Task 7: Map and Extract the Sentiment Column

In [17]:
# Integer class id assigned to each sentiment label.
mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Replace the text labels with their ids and extract them as a NumPy array.
df['sentiment'] = df['sentiment'].map(mapping)
y = df['sentiment'].to_numpy()

## Task 8: Split the Dataset into Training and Test Sets

In [18]:
# Hold out 20% of the samples for testing; stratify on the labels so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Task 9: Define the Weights Initialization Function

In [21]:
def initialze_weights(n_features, n_classes):
    """Build the starting weight matrix for the classifier.

    Returns an all-zero float array with one row per feature and one column
    per class, i.e. of shape ``(n_features, n_classes)``.
    """
    weight_shape = (n_features, n_classes)
    return np.zeros(weight_shape)


## Task 10: Define One-Hot Encoding Function

In [33]:
def one_hot_encode(y, n_classes):
    """Turn a sequence of integer class ids into a one-hot matrix.

    The result is a float array of shape ``(len(y), n_classes)`` in which row
    ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    encoded = np.zeros((len(y), n_classes))
    for row, label in enumerate(y):
        encoded[row, label] = 1
    return encoded



## Task 11: Define the Softmax Function

In [23]:
def softmax_func(x):
    """Compute the softmax of each row of a 2-D array of scores.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_classes)
        Raw class scores, one row per sample.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_classes)
        Non-negative values where every row sums to 1.
    """
    scores = np.asarray(x)
    # Subtracting each row's maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf, which would turn the division
    # below into inf / inf = nan for large scores.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    # Exponentiate once and reuse for both the numerator and the normaliser.
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

## Task 12: Define the Gradient Descent Function

In [40]:
def grad_descent(X, y_encoded, weights, bias, learning_rate):
    """Apply one gradient-descent update to the softmax-regression parameters.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix.
    y_encoded : numpy.ndarray of shape (n_samples, n_classes)
        One-hot encoded targets.
    weights : numpy.ndarray of shape (n_features, n_classes)
        Current weights; updated in place.
    bias : numpy.ndarray of shape (n_classes,)
        Current bias; updated in place.
    learning_rate : float
        Step size applied to the gradients.

    Returns
    -------
    tuple
        The updated ``(weights, bias)`` (the same array objects passed in).
    """
    n_samples = len(X)
    scores = np.dot(X, weights) + bias
    probabilities = softmax_func(scores)
    # The cross-entropy gradient is built from the *signed* residual
    # (predicted - target). Taking its absolute value would push every
    # parameter in the same direction regardless of whether the class was
    # over- or under-predicted, so the model could never learn.
    error = probabilities - y_encoded
    dw = np.dot(X.T, error) / n_samples
    db = np.sum(error, axis=0) / n_samples
    weights -= learning_rate * dw
    bias -= learning_rate * db
    return weights, bias

## Task 13: Define the Training Function

In [38]:
def train_multinomial_logisitc_regression(X, y, learning_rate, max_iters):
    """Train a multinomial logistic regression model with gradient descent.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Training feature matrix.
    y : array-like of shape (n_samples,)
        Integer class ids. The number of classes is taken as the number of
        distinct values in ``y``, and each id is used as a column index into
        the one-hot matrix, so ids should run from 0 to n_classes - 1.
    learning_rate : float
        Step size passed to every gradient-descent update.
    max_iters : int
        Number of gradient-descent updates to perform.

    Returns
    -------
    tuple
        ``(weights, bias)`` after ``max_iters`` updates.
    """
    _, n_features = np.shape(X)
    n_classes = len(np.unique(y))

    weights = initialze_weights(n_features, n_classes)
    bias = np.zeros(n_classes)
    y_encoded = one_hot_encode(y, n_classes)

    # Iterate on the `max_iters` argument. Reading a notebook-level global
    # here instead would ignore the caller's value and raise NameError
    # whenever that global has not been defined yet.
    for _ in range(max_iters):
        weights, bias = grad_descent(X, y_encoded, weights, bias, learning_rate)

    return weights, bias

## Task 14: Define the Prediction Function

In [29]:
def predict(X, weights, bias):
    """Predict a class id for every row of ``X``.

    The linear scores ``X . weights + bias`` are passed through the softmax
    and, for each sample, the index of the most probable class is returned as
    a 1-D integer array.
    """
    class_probs = softmax_func(np.dot(X, weights) + bias)
    best_class = np.argmax(class_probs, axis=1)
    return best_class

## Task 15: Train the Model

In [41]:
# Hyperparameters for training.
learning_rate = 0.31101
max_iterations = 100

# Fit the model on the training split.
weights, bias = train_multinomial_logisitc_regression(
    X_train,
    y_train,
    learning_rate=learning_rate,
    max_iters=max_iterations,
)

## Task 16: Test the Model

In [None]:
# Classify the held-out samples and print the predicted class ids.
y_pred = predict(X=X_test, weights=weights, bias=bias)
print(y_pred)

## Task 17: Generate the Confusion Matrix and Classification Report

In [None]:
# Class ids in ascending order together with their display names. The order
# follows the sentiment mapping defined in Task 7 (negative=0, neutral=1,
# positive=2); the matrix rows/columns and the report rows are laid out in
# this order, so the names must be listed the same way.
class_ids = [0, 1, 2]
labels = ["Negative", "Neutral", "Positive"]

# Generate a confusion matrix (class order pinned explicitly)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
plt.show()

# Generate a classification report using the same class order and names
print(classification_report(y_test, y_pred, labels=class_ids, target_names=labels))