<a href="https://colab.research.google.com/github/SattuSupCodes/SentimentAnalysisAI/blob/main/sentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [104]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [105]:
# importing dataset
df = pd.read_csv("sentiment.csv", encoding="latin1")

df = df[['text', 'sentiment']]
 #dropping themm null itemss
df = df.dropna(subset=['text', 'sentiment'])

In [106]:
print("dataset shape:" , df.shape)
print("\nsentiment distribution:\n", df['sentiment'].value_counts())

dataset shape: (3534, 2)

sentiment distribution:
 sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64


In [107]:
#cleaning time
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove urls
    text = re.sub(r"@\w+", "", text)            # remove @ mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # remove hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)     # remove emojis & punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text
df["clean_text"] = df["text"].apply(clean_text)



In [108]:
x = df["clean_text"]
y = df["sentiment"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.transform(y_test) #we are tryna make things harder for us

In [109]:
num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("Encoded Labels (Train):", y_train_int)

Classes: ['negative' 'neutral' 'positive']
Encoded Labels (Train): [0 0 1 ... 0 1 2]


In [110]:
vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
x_test_vec = vectorizer.fit_transform(x_test)
x_train_vec = vectorizer.transform(x_train)
x_train_dense = x_train_vec.toarray()
x_test_dense = x_test_vec.toarray()

num_samples, num_features = x_train_dense.shape
print("Train shape:", x_train_dense.shape)
print("Test shape:", x_test_dense.shape)

Train shape: (2827, 6000)
Test shape: (707, 6000)


In [119]:
def to_one_hot(y_int, num_classes):
  N = len(y_int)
  one_hot = np.zeros((N, num_classes))
  one_hot[np.arange(N), y_int] = 1
  return one_hot
y_train_onehot = to_one_hot(y_train_int, num_classes)
y_test_onehot = to_one_hot(y_test_int, num_classes)

print("Train shape:", y_train_onehot.shape)
print("Test shape:", y_test_onehot.shape)

Train shape: (2827, 3)
Test shape: (707, 3)


one hot basically turns your positive, negative, neutral into matrices

negative = [1 0 0],
neutral = [0 1 0],
positive = [ 0 0 1]

In [112]:
#lets initialise weights and use softmax and feel a bit more engineer smh smh

np.random.seed(42)
W = np.random.randn(num_features, num_classes) * 0.01
b = np.zeros((1, num_classes))

In [113]:
def softmax(logits):
  exps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
  return exps / np.sum(exps, axis=1, keepdims=True)

In [103]:
from scipy.special import log_softmax
#model = LogisticRegression(multi_class='multinomial', max_iter=1000)
#model.fit(x_train_vec, y_train)

#letus open the .fit() cuz why not

learning_rate = 0.1
epochs = 100

for epoch in range(epochs):
  logits = np.dot(x_train_dense, W) + b
  probs = softmax(logits)

  epsilon = 1e-15 #to avoid log0
  loss = -np.mean(np.sum(y_train_onehot * np.log(probs + epsilon), axis=1))
  error = (probs - y_train_onehot) / num_samples
  dW = np.dot(x_train_dense.T, error)
  db = np.sum(error, axis=0, keepdims=True)

  #updating weights

  W -= learning_rate * dW
  b -= learning_rate * db

  if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}")

Epoch 1/100 - Loss: 1.0985
Epoch 10/100 - Loss: 1.0922
Epoch 20/100 - Loss: 1.0882
Epoch 30/100 - Loss: 1.0858
Epoch 40/100 - Loss: 1.0842
Epoch 50/100 - Loss: 1.0830
Epoch 60/100 - Loss: 1.0820
Epoch 70/100 - Loss: 1.0811
Epoch 80/100 - Loss: 1.0802
Epoch 90/100 - Loss: 1.0793
Epoch 100/100 - Loss: 1.0785


epoch is the training session and logits is the AI thinking in a nutshell then giving output based on highest probability

In [114]:
# Forward on test data
logits_test = np.dot(x_test_dense, W) + b
y_test_pred_proba = softmax(logits_test)

# Class with highest probability
y_test_pred_int = np.argmax(y_test_pred_proba, axis=1)

# Compare with true labels
from sklearn.metrics import accuracy_score, classification_report

print("Scratch Logistic Regression Accuracy:",
      accuracy_score(y_test_int, y_test_pred_int))

print("\nClassification Report:\n",
      classification_report(y_test_int, y_test_pred_int,
                            target_names=label_encoder.classes_))


Scratch Logistic Regression Accuracy: 0.33663366336633666

Classification Report:
               precision    recall  f1-score   support

    negative       0.28      0.27      0.27       200
     neutral       0.40      0.37      0.38       286
    positive       0.31      0.36      0.33       221

    accuracy                           0.34       707
   macro avg       0.33      0.33      0.33       707
weighted avg       0.34      0.34      0.34       707



In [117]:
def predict_sentiment(text):
  text = clean_text(text)
  vec = vectorizer.transform([text]).toarray()
  logits = np.dot(vec, W) + b
  proba = softmax(logits)[0]
  pred_int = np.argmax(proba)
  label = label_encoder.inverse_transform([pred_int])[0]
  return label, dict(zip(label_encoder.classes_,proba))

In [118]:
print("\nExamples:")
print(predict_sentiment("I am so happy today!!"))
print(predict_sentiment("This is the worst thing ever"))
print(predict_sentiment("It's okay, nothing special"))


Examples:
('positive', {'negative': np.float64(0.33028875981735883), 'neutral': np.float64(0.3310224035801897), 'positive': np.float64(0.3386888366024514)})
('negative', {'negative': np.float64(0.3360399123607244), 'neutral': np.float64(0.3308234557644451), 'positive': np.float64(0.33313663187483045)})
('neutral', {'negative': np.float64(0.32781830186693683), 'neutral': np.float64(0.33720446966698764), 'positive': np.float64(0.33497722846607547)})


lets learn as we go

TF-IDF = Term frequency x inverse document frequency

TF: How frequently does this word appear in this tweet?
IDF: How rare is this word in whole dataset?