In [8]:
import pandas as pd
import re

In [7]:
df = pd.read_csv('tweet_emotions.csv')

In [9]:
print(df.head())

     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...


In [10]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

df['clean_content'] = df['content'].apply(clean_text)

print("Sample cleaned content:")
for i in range(5):
    print(f"Original: {df['content'][i]}")
    print(f"Cleaned: {df['clean_content'][i]}")
    print()

Sample cleaned content:
Original: @tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[
Cleaned: tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part

Original: Layin n bed with a headache  ughhhh...waitin on your call...
Cleaned: layin n bed with a headache ughhhhwaitin on your call

Original: Funeral ceremony...gloomy friday...
Cleaned: funeral ceremonygloomy friday

Original: wants to hang out with friends SOON!
Cleaned: wants to hang out with friends soon

Original: @dannycastillo We want to trade with someone who has Houston tickets, but no one will.
Cleaned: dannycastillo we want to trade with someone who has houston tickets but no one will



In [12]:
missing_values = df.isnull().sum()
print("Missing values:")
print(missing_values)

df = df.dropna().reset_index(drop=True)
print("\nDataset shape after dropping missing values:", df.shape)

Missing values:
tweet_id         0
sentiment        0
content          0
clean_content    0
dtype: int64

Dataset shape after dropping missing values: (40000, 4)


In [28]:
df['clean_content'] = df['content'].apply(lambda x: clean_text(x))
df = df.dropna().reset_index(drop=True)
X = df['clean_content']
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)
X_tfidf_dense = X_tfidf.toarray()
print("Shape of TF-IDF matrix:", X_tfidf_dense.shape)

Shape of TF-IDF matrix: (40000, 5000)


In [51]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

df['clean_content'] = df['content'].apply(clean_text)
df = df.dropna().reset_index(drop=True)

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = df['clean_content']
X_tfidf = tfidf_vectorizer.fit_transform(X)
X_tfidf_dense = X_tfidf.toarray()


In [53]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_dense, df['sentiment'], test_size=0.2, random_state=42)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
print("Naive Bayes Classifier Performance:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
cv_scores = cross_val_score(nb_classifier, X_tfidf_dense, df['sentiment'], cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Naive Bayes Classifier Performance:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.35      0.30      0.32      1028
        hate       0.43      0.01      0.02       268
        love       0.51      0.30      0.38       762
     neutral       0.30      0.56      0.39      1740
      relief       0.00      0.00      0.00       352
     sadness       0.39      0.14      0.21      1046
    surprise       1.00      0.00      0.01       425
       worry       0.30      0.55      0.39      1666

    accuracy                           0.32      8000
   macro avg       0.25      0.14      0.13      8000
weighted avg       0.34      0.32      0.27      8000

Accuracy: 0.323
Cross-validation scores: [0.2785   0.303625 0.34525  0.348125 0

In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['sentiment'])
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_tfidf_dense, y_encoded, test_size=0.2, random_state=42)

# Define deep learning model
model = Sequential([
    Dense(128, input_dim=X_train_dl.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_dl, y_train_dl, epochs=10, batch_size=32, validation_data=(X_test_dl, y_test_dl))
loss, accuracy = model.evaluate(X_test_dl, y_test_dl)
print("Deep Learning Model Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Model Accuracy: 0.3179999887943268
