In [None]:
import pandas as pd #handling data
import numpy as np #number operations
import seaborn as sns #plotting
import matplotlib.pyplot as plt #plotting
import re #regex for cleaning text
import nltk #stopwords
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample 

#loading the dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv(r"C:\Users\kausi\Downloads\archive\tweet_emotions.csv")
print(df.head())
print(df.columns)

#renaming columns so the rest of the notebook can refer to "text" and "emotion"
df = df.rename(columns={"content": "text",  "sentiment": "emotion"})

# train-test split (80/20); stratify keeps each emotion's share the same in both splits
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['emotion'])
print(train_df.shape, test_df.shape)
# A bare `train_df.head()` is only rendered when it is the last expression of a
# cell, so it has to be printed explicitly to actually show up here.
print(train_df.head())

#load the English stopword list, downloading the corpus only when it is missing
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # Raised by nltk when the corpus has not been installed yet.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

#text cleaning function
def clean_text(text, stopword_set=None):
    """Normalize raw text into a space-separated string of lowercase words.

    The text is lowercased, URLs are removed, every character that is not an
    ASCII letter is dropped, and stopwords are discarded.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or the ``NaN`` that
            pandas uses for missing cells) are treated as empty text instead
            of raising ``AttributeError``.
        stopword_set: Optional container of lowercase stopwords. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, or ``''`` when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower() #normalize case
    text = re.sub(r"http\S+|www\S+|https\S+", '', text) #remove URLs

    kept_words = []
    for token in text.split():
        # Keep apostrophes for the stopword lookup: NLTK's English list spells
        # contractions as "don't", which can never match once the punctuation
        # has been stripped down to "dont".
        with_apostrophes = re.sub(r"[^a-z']", '', token)
        word = with_apostrophes.replace("'", '') #letters only: no emojis, symbols, punctuation
        if not word:
            continue
        if with_apostrophes in stopword_set or word in stopword_set:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

#apply cleaning to train & test data
train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)

#Convert text to numeric features (TF-IDF)
# The vocabulary is fitted on the training split only and then reused for the
# test split, so the test tweets do not influence the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df["clean_text"])
X_test = vectorizer.transform(test_df["clean_text"])

#Labels
y_train = train_df["emotion"]
y_test = test_df["emotion"]

#Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

#Make prediction and evaluate 
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Some emotions are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 without emitting a warning for
# each averaged metric.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

#Function to predict emotion of a new sentence
def predict_emotion(text):
    """Return the label the trained model predicts for a single sentence.

    The sentence is passed through ``clean_text``, turned into a one-row
    feature matrix by the module-level ``vectorizer``, and classified by the
    module-level ``model``.
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_labels = model.predict(features)
    # Only one sentence was submitted, so the first entry is its label.
    return predicted_labels[0]

print(predict_emotion("I am so happy today!"))
print(predict_emotion("This is making me angry."))
print(predict_emotion("I am feeling lonely."))

#Balance the training data: upsample every emotion to the size of the largest one
# Only train_df is resampled. Duplicating rows of the full df would also copy
# test tweets, and the largest class is looked up instead of assuming it is
# "happiness".
class_sizes = train_df["emotion"].value_counts()
majority_label = class_sizes.idxmax()
majority = train_df[train_df.emotion == majority_label]
minority = train_df[train_df.emotion != majority_label]
# Resample each remaining emotion separately; sampling the pooled rows would
# keep their relative imbalance.
minority_upsampled = pd.concat(
    [
        resample(class_rows, replace=True, n_samples=len(majority), random_state=42)
        for _, class_rows in minority.groupby("emotion")
    ]
)
df_balanced = pd.concat([majority, minority_upsampled])

     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...
Index(['tweet_id', 'sentiment', 'content'], dtype='object')
(32000, 3) (8000, 3)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kausi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.347625

Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        22
     boredom       1.00      0.03      0.05        36
       empty       0.00      0.00      0.00       165
  enthusiasm       0.00      0.00      0.00       152
         fun       0.25      0.03      0.06       355
   happiness       0.33      0.35      0.34      1042
        hate       0.41      0.10      0.16       265
        love       0.53      0.39      0.45       768
     neutral       0.33      0.56      0.41      1728
      relief       0.26      0.03      0.05       305
     sadness       0.36      0.24      0.29      1033
    surprise       0.22      0.03      0.06       437
       worry       0.35      0.49      0.41      1692

    accuracy                           0.35      8000
   macro avg       0.31      0.17      0.17      8000
weighted avg       0.33      0.35      0.31      8000

happiness
sadness
sadness


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
