# Sentiment Analysis & Detection on X(former twitter)

    Author: Nithusikan T.
    Email: tnithusikan@gmail.com
    Date: 19/11/2025

### 1. Imports

In [27]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

### 2. Data Ingestion

In [13]:
df = pd.read_csv('https://raw.githubusercontent.com/entbappy/Branching-tutorial/refs/heads/master/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [14]:
# delete tweet id
df.drop(columns=['tweet_id'],inplace=True)
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [15]:
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [16]:
# Create a LabelEncoder instance
le = LabelEncoder()

# Fit and transform the 'Sentiment' column
df['encoded_sentiment'] = le.fit_transform(df['sentiment'])

# Unique labels
print("\nUnique classes learned by the encoder:")
print(le.classes_)

# Inverse transform to get original labels back
original_sentiments = le.inverse_transform([i for i in range(len(df['encoded_sentiment'].unique()))])
print("\nInverse transformed labels:")
print(original_sentiments)


Unique classes learned by the encoder:
['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']

Inverse transformed labels:
['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']


In [17]:
# Drop "sentiment" column
final_df = df.drop(columns=["sentiment"])
final_df.head()

Unnamed: 0,content,encoded_sentiment
0,@tiffanylue i know i was listenin to bad habi...,2
1,Layin n bed with a headache ughhhh...waitin o...,10
2,Funeral ceremony...gloomy friday...,10
3,wants to hang out with friends SOON!,3
4,@dannycastillo We want to trade with someone w...,8


In [18]:
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=42)

### 3. Data Preprocessing

In [19]:
nltk.download('wordnet')
nltk.download('stopwords')

def lemmatization(text):
    lemmatizer= WordNetLemmatizer()

    text = text.split()

    text=[lemmatizer.lemmatize(y) for y in text]

    return " " .join(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    Text=[i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)

def removing_numbers(text):
    text=''.join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):

    text = text.split()

    text=[y.lower() for y in text]

    return " " .join(text)

def removing_punctuations(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan

def normalize_text(df):
    df.content=df.content.apply(lambda content : lower_case(content))
    df.content=df.content.apply(lambda content : remove_stop_words(content))
    df.content=df.content.apply(lambda content : removing_numbers(content))
    df.content=df.content.apply(lambda content : removing_punctuations(content))
    df.content=df.content.apply(lambda content : removing_urls(content))
    df.content=df.content.apply(lambda content : lemmatization(content))
    return df

def normalized_sentence(sentence):
    sentence= lower_case(sentence)
    sentence= remove_stop_words(sentence)
    sentence= removing_numbers(sentence)
    sentence= removing_punctuations(sentence)
    sentence= removing_urls(sentence)
    sentence= lemmatization(sentence)
    return sentence

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [21]:
train_data = normalize_text(train_data)
test_data = normalize_text(test_data)

In [22]:
train_data.head()

Unnamed: 0,content,encoded_sentiment
14307,thundershower plus baseball equal awwww,8
17812,hangover movie gonna hilarious wish could see ...,4
11020,playing game leave work work til least pm tonight,12
15158,stick work till freakin am madd suck worked day,6
24990,sarah x atl u mean jack barakat s wow u ever g...,11


### 4. Feature Engineering

In [23]:
X_train = train_data['content'].values
y_train = train_data['encoded_sentiment'].values

X_test = test_data['content'].values
y_test = test_data['encoded_sentiment'].values

In [24]:
# Apply TF-IDF (TfidfVectorizer)
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = vectorizer.transform(X_test)

In [25]:
train_df = pd.DataFrame(X_train_tfidf.toarray())

train_df['label'] = y_train
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37260,37261,37262,37263,37264,37265,37266,37267,37268,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11


In [None]:
test_df = pd.DataFrame(X_test_tfidf.toarray())

test_df['label'] = y_test

### 5. Model Building

In [28]:
# Define and train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test_tfidf)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [29]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.343125
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.08      0.03      0.05        31
           2       0.10      0.01      0.01       162
           3       0.00      0.00      0.00       163
           4       0.18      0.03      0.05       338
           5       0.34      0.30      0.32      1028
           6       0.37      0.20      0.26       268
           7       0.48      0.40      0.44       762
           8       0.32      0.66      0.44      1740
           9       0.24      0.03      0.06       352
          10       0.39      0.24      0.29      1046
          11       0.23      0.04      0.06       425
          12       0.33      0.38      0.35      1666

    accuracy                           0.34      8000
   macro avg       0.24      0.18      0.18      8000
weighted avg       0.32      0.34      0.31      8000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### 6. Model Evaluation

In [32]:
# Make predictions
y_pred = xgb_model.predict(X_test_tfidf)
y_pred_proba = xgb_model.predict_proba(X_test_tfidf)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [33]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"AUC: {auc}")

Precision: 0.32491507606948494
Recall: 0.343125
AUC: 0.673260784361834
