In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [5]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('/kaggle/input/social-media-sentiments-analysis-dataset/sentimentdataset.csv')

# Display basic information and first few rows
print(df.info())
print(df.head())

# Focus on the relevant columns (Text and Sentiment)
df = df[['Text', 'Sentiment']]

# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Drop rows with missing values (if any)
df.dropna(inplace=True)


# Text Cleaning Functions
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply text cleaning
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Check the cleaned output
print(df.head())

# Analyze the cleaned data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB
None
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2  

In [6]:
df['Sentiment'] = df['Sentiment'].str.strip()
df['Sentiment'] = df['Sentiment'].str.lower()
replace_dict = {
    'positive': 'happy', 'joy': 'happy', 'serenity': 'happy', 'euphoria': 'happy',
    'elation': 'happy', 'happiness': 'happy', 'playful': 'happy', 'amusement': 'happy',
    'despair': 'sad', 'grief': 'sad', 'regret': 'sad', 'melancholy': 'sad',
    'negative': 'sad', 'bad': 'sad', 'loneliness': 'sad', 'desolation': 'sad',
    'excitement': 'excited', 'thrill': 'excited', 'adventure': 'excited',
    'enthusiasm': 'excited', 'inspired': 'excited', 'inspiration': 'excited', 'arousal': 'excited',
    'hate': 'angry', 'disgust': 'angry', 'bitterness': 'angry', 'betrayal': 'angry',
    'frustration': 'angry', 'frustrated': 'angry', 'anger': 'angry',
    'pride': 'proud', 'admiration': 'proud', 'awe': 'proud', 'reverence': 'proud',
    'contentment': 'content', 'acceptance': 'content', 'fulfillment': 'content',
    'calmness': 'content', 'satisfaction': 'content',
    'indifference': 'neutral', 'numbness': 'neutral', 'ambivalence': 'neutral',
    'hope': 'hopeful', 'determination': 'hopeful', 'resilience': 'hopeful', 'empowerment': 'hopeful',
    'shame': 'embarassed', 'embarassment': 'embarassed',
    'gratitude': 'grateful',
    'compassionate': 'compassion', 'tenderness': 'compassion', 'empathetic': 'compassion'
}

df['Sentiment'] = df['Sentiment'].replace(replace_dict)


In [7]:
print("Unique Sentiments:", df['Sentiment'].value_counts())
print("Sample Cleaned Text:", df['Cleaned_Text'].head())

Unique Sentiments: Sentiment
happy             146
excited            66
sad                64
content            38
neutral            36
                 ... 
emotionalstorm      1
suffering           1
marvel              1
spark               1
freedom             1
Name: count, Length: 144, dtype: int64
Sample Cleaned Text: 0          enjoying a beautiful day at the park
1             traffic was terrible this morning
2              just finished an amazing workout
3    excited about the upcoming weekend getaway
4    trying out a new recipe for dinner tonight
Name: Cleaned_Text, dtype: object


In [None]:
import pandas as pd
from sklearn.utils import resample

# Combine rare classes (less than 10 samples) into 'Other'
rare_threshold = 10
sentiment_counts = df['Sentiment'].value_counts()

# # Map rare classes to 'Other'
rare_classes = sentiment_counts[sentiment_counts < rare_threshold].index
df['Sentiment'] = df['Sentiment'].apply(lambda x: 'Other' if x in rare_classes else x)

print("Unique Sentiments:", df['Sentiment'].value_counts())
# print("Balanced dataset created and saved.")


Unique Sentiments: Sentiment
Other         228
happy         146
excited        66
sad            64
content        38
neutral        36
angry          34
proud          28
hopeful        28
grateful       22
curiosity      16
compassion     15
nostalgia      11
Name: count, dtype: int64


In [10]:
label_encoder = LabelEncoder()
df['Sentiment_encoded'] = label_encoder.fit_transform(df['Sentiment'])

# Display label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)


Label Mapping: {'Other': 0, 'angry': 1, 'compassion': 2, 'content': 3, 'curiosity': 4, 'excited': 5, 'grateful': 6, 'happy': 7, 'hopeful': 8, 'neutral': 9, 'nostalgia': 10, 'proud': 11, 'sad': 12}


In [11]:
# Drop rows with missing text or sentiment
df.dropna(subset=['Text', 'Sentiment'], inplace=True)

# Verify the dataset
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Text               732 non-null    object
 1   Sentiment          732 non-null    object
 2   Cleaned_Text       732 non-null    object
 3   Sentiment_encoded  732 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 23.0+ KB
None
                                                Text Sentiment  \
0   Enjoying a beautiful day at the park!        ...     happy   
1   Traffic was terrible this morning.           ...       sad   
2   Just finished an amazing workout! 💪          ...     happy   
3   Excited about the upcoming weekend getaway!  ...     happy   
4   Trying out a new recipe for dinner tonight.  ...   neutral   

                                 Cleaned_Text  Sentiment_encoded  
0        enjoying a beautiful day at the park                  7  
1           traffic was ter

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# Check for null values
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Text'],            # Input features (cleaned text)
    df['Sentiment_encoded'],       # Target (encoded sentiment)
    test_size=0.2,                 # 80/20 split
    random_state=42,               # Reproducibility
    stratify=df['Sentiment_encoded'] # Maintain class balance
)

print(f"Training Samples: {len(X_train)}, Testing Samples: {len(X_test)}")
#Initialize TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    max_features=50000,          # Allow up to 10k words for richer features
    stop_words='english',         # Remove common stop words
    ngram_range=(1, 2),           # Capture both unigrams and bigrams
    min_df=2,                     # Ignore words that appear in less than 2 documents
    max_df=0.85,                  # Ignore words that appear in more than 85% of documents
    sublinear_tf=True             # Apply logarithmic frequency scaling
)

# Transform the dataset
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Shape of TF-IDF (Train):", X_train_tfidf.shape)
print("Shape of TF-IDF (Test):", X_test_tfidf.shape)






Training Samples: 585, Testing Samples: 147


In [41]:
# Preview TF-IDF matrices and encoded labels
print(X_train_tfidf[:3].toarray())
print(y_train[:3])

# Inspect the TF-IDF feature names (optional)
print(tfidf_vectorizer.get_feature_names_out()[:10])


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
284    2
386    6
396    0
Name: Sentiment_encoded, dtype: int64
['abstract' 'abstract art' 'abyss' 'academic' 'acceptance' 'accidentally'
 'accomplished' 'accomplishment' 'achieve' 'achieved']


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define the improved MLP model
mlp_model = Sequential()

# Input layer with Batch Normalization
mlp_model.add(Dense(2048, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
mlp_model.add(BatchNormalization())
mlp_model.add(Dropout(0.4))

# Additional Hidden layers
mlp_model.add(Dense(1024, activation='relu'))
mlp_model.add(BatchNormalization())
mlp_model.add(Dropout(0.4))

mlp_model.add(Dense(512, activation='relu'))
mlp_model.add(BatchNormalization())
mlp_model.add(Dropout(0.3))

mlp_model.add(Dense(256, activation='relu'))
mlp_model.add(BatchNormalization())
mlp_model.add(Dropout(0.3))

# Output layer
num_classes = len(np.unique(y_train))
mlp_model.add(Dense(num_classes, activation='softmax'))

# Learning rate scheduler
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

# Compile the model
mlp_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

mlp_model.summary()

# Train the model
history = mlp_model.fit(X_train_tfidf, y_train,
                        validation_data=(X_test_tfidf, y_test),
                        epochs=200,
                        batch_size=64)


Epoch 1/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 101ms/step - accuracy: 0.0913 - loss: 3.3657 - val_accuracy: 0.3197 - val_loss: 2.4876
Epoch 2/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.3981 - loss: 1.9156 - val_accuracy: 0.3197 - val_loss: 2.4232
Epoch 3/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - accuracy: 0.6924 - loss: 1.0216 - val_accuracy: 0.3129 - val_loss: 2.3488
Epoch 4/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.8542 - loss: 0.5057 - val_accuracy: 0.3265 - val_loss: 2.2851
Epoch 5/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - accuracy: 0.9361 - loss: 0.3045 - val_accuracy: 0.3265 - val_loss: 2.2406
Epoch 6/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - accuracy: 0.9450 - loss: 0.2267 - val_accuracy: 0.3197 - val_loss: 2.1996
Epoch 7/200
[1m10/10[0m 

In [35]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = mlp_model.fit(
    X_train_tfidf, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.2066 - loss: 2.5073 - val_accuracy: 0.3846 - val_loss: 2.1914
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3096 - loss: 2.1652 - val_accuracy: 0.3761 - val_loss: 2.0058
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3307 - loss: 1.9857 - val_accuracy: 0.3846 - val_loss: 1.9574
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4027 - loss: 1.8064 - val_accuracy: 0.4701 - val_loss: 1.8581
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.5381 - loss: 1.3840 - val_accuracy: 0.4786 - val_loss: 1.7581
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6397 - loss: 1.1507 - val_accuracy: 0.5385 - val_loss: 1.6294
Epoch 7/50
[1m15/15[0m [32m━━━━

In [50]:
# Evaluate on the test set
y_pred = np.argmax(mlp_model.predict(X_test_tfidf), axis=1)

# Performance metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Accuracy: 0.5850340136054422

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.63      0.60        46
           1       0.57      0.57      0.57         7
           2       1.00      0.67      0.80         3
           3       1.00      0.50      0.67         8
           4       1.00      0.33      0.50         3
           5       0.75      0.69      0.72        13
           6       1.00      0.25      0.40         4
           7       0.41      0.62      0.49        29
           8       0.71      0.83      0.77         6
           9       0.50      0.14      0.22         7
          10       0.50      1.00      0.67         2
          11       0.75      0.50      0.60         6
          12       0.88      0.54      0.67        13

    accuracy                           0.59       147
   macro avg       0.74      0.56      0.59       147
weighted

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Define the model
mlp_model_leakyrelu = Sequential()

# Input layer
mlp_model_leakyrelu.add(Dense(2048, input_shape=(X_train_tfidf.shape[1],)))
mlp_model_leakyrelu.add(LeakyReLU(alpha=0.1))
mlp_model_leakyrelu.add(BatchNormalization())
mlp_model_leakyrelu.add(Dropout(0.2))

# Hidden layers
for units in [1024, 512, 256]:
    mlp_model_leakyrelu.add(Dense(units))
    mlp_model_leakyrelu.add(LeakyReLU(alpha=0.1))
    mlp_model_leakyrelu.add(BatchNormalization())
    mlp_model_leakyrelu.add(Dropout(0.3))

# Output layer
num_classes = len(np.unique(y_train))
mlp_model_leakyrelu.add(Dense(num_classes, activation='softmax'))

# Compile the model
mlp_model_leakyrelu.compile(optimizer=Adam(learning_rate=0.001),
                            loss='sparse_categorical_crossentropy',
                            metrics=['accuracy'])

# Learning rate scheduler
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

# Train the model
history_leakyrelu = mlp_model_leakyrelu.fit(X_train_tfidf, y_train,
                                            validation_data=(X_test_tfidf, y_test),
                                            epochs=100,
                                            batch_size=64,
                                            callbacks=[lr_scheduler])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 110ms/step - accuracy: 0.1074 - loss: 3.3793 - val_accuracy: 0.3197 - val_loss: 2.5010 - learning_rate: 0.0010
Epoch 2/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.6001 - loss: 1.2550 - val_accuracy: 0.3129 - val_loss: 2.4218 - learning_rate: 0.0010
Epoch 3/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.8162 - loss: 0.6112 - val_accuracy: 0.3197 - val_loss: 2.3368 - learning_rate: 0.0010
Epoch 4/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.9235 - loss: 0.2792 - val_accuracy: 0.3197 - val_loss: 2.2618 - learning_rate: 0.0010
Epoch 5/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.9559 - loss: 0.1921 - val_accuracy: 0.3129 - val_loss: 2.1961 - learning_rate: 0.0010
Epoch 6/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
results = {
    'LeakyReLU': mlp_model_leakyrelu.evaluate(X_test_tfidf, y_test)
}

for name, (loss, accuracy) in results.items():
    print(f"{name} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6066 - loss: 2.0767 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5556 - loss: 2.0866
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5627 - loss: 2.1701 
LeakyReLU - Loss: 2.2209, Accuracy: 0.5918
Swish - Loss: 2.1965, Accuracy: 0.5510
ELU - Loss: 2.3686, Accuracy: 0.5578
