In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk  # Import nltk before using it

nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional: Improves WordNet performance

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
/kaggle/input/social-media-sentiments-analysis-dataset/sentimentdataset.csv


In [2]:
import pandas as pd

train_df = pd.read_csv("/kaggle/input/social-media-sentiments-analysis-dataset/sentimentdataset.csv")

In [3]:
import nltk
import subprocess
from nltk.stem import WordNetLemmatizer 

nltk.download('wordnet', download_dir='/kaggle/working/')
nltk.download('omw-1.4', download_dir='/kaggle/working/')

command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
subprocess.run(command.split())

nltk.data.path.append('/kaggle/working/')

lemmatizer = WordNetLemmatizer()

train_df['Sentiment'] = train_df['Sentiment'].astype(str).str.strip().str.lower().apply(lemmatizer.lemmatize)
print(train_df['Sentiment'].value_counts())

[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data] Downloading package omw-1.4 to /kaggle/working/...
Sentiment
positive         45
joy              44
excitement       37
contentment      19
neutral          18
                 ..
joy in baking     1
elegance          1
pensive           1
melodic           1
festivejoy        1
Name: count, Length: 191, dtype: int64


In [4]:
unique_values = set()
unique_values.update(train_df['Sentiment'])
unique_values

{'acceptance',
 'accomplishment',
 'admiration',
 'adoration',
 'adrenaline',
 'adventure',
 'affection',
 'amazement',
 'ambivalence',
 'amusement',
 'anger',
 'anticipation',
 'anxiety',
 'appreciation',
 'apprehensive',
 'arousal',
 'artisticburst',
 'awe',
 'bad',
 'betrayal',
 'bitter',
 'bitterness',
 'bittersweet',
 'blessed',
 'boredom',
 'breakthrough',
 'calmness',
 'captivation',
 'celebration',
 'celestial wonder',
 'challenge',
 'charm',
 'colorful',
 'compassion',
 'compassionate',
 'confidence',
 'confident',
 'confusion',
 'connection',
 'contemplation',
 'contentment',
 'coziness',
 'creative inspiration',
 'creativity',
 'culinary adventure',
 'culinaryodyssey',
 'curiosity',
 'darkness',
 'dazzle',
 'desolation',
 'despair',
 'desperation',
 'determination',
 'devastated',
 'disappointed',
 'disappointment',
 'disgust',
 'dismissive',
 'dreamchaser',
 'ecstasy',
 'elation',
 'elegance',
 'embarrassed',
 'emotion',
 'emotionalstorm',
 'empathetic',
 'empowerment',
 'e

In [5]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
!python -m spacy download en_core_web_md

nlp = spacy.load("en_core_web_md")

ref_words = {
    "Positive": "positive",
    "Negative": "negative",
    "Neutral": "neutral"
}

ref_vectors = {category: nlp(word).vector for category, word in ref_words.items()}

def assign_sentiment_category(sentiment):
    word_vector = nlp(sentiment).vector.reshape(1, -1)
    
    similarities = {}
    for category, ref_vec in ref_vectors.items():
        ref_vec = ref_vec.reshape(1, -1)
        sim = cosine_similarity(word_vector, ref_vec)[0][0]
        similarities[category] = sim

    return max(similarities, key=similarities.get)

train_df['Sentiment'] = train_df['Sentiment'].apply(assign_sentiment_category)

print(train_df['Sentiment'].value_counts())

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m00:01[0mm0:01[0mm
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Sentiment
Positive    364
Negative    290
Neutral      78
Name: count, dtype: int64


In [6]:
!pip install nltk  

import nltk
nltk.download('stopwords')  
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import re
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F700-\U0001F77F" 
                           u"\U0001F780-\U0001F7FF"  
                           u"\U0001F800-\U0001F8FF" 
                           u"\U0001F900-\U0001F9FF" 
                           u"\U0001FA00-\U0001FA6F" 
                           u"\U0001FA70-\U0001FAFF" 
                           u"\U00002702-\U000027B0" 
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)

def split_and_remove_stopwords(text):
    text = emoji_pattern.sub(r'', text)

    tokens = re.findall(r"[\w]+|[^\w\s]", text)

    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    return filtered_tokens

train_df['Text'] = train_df['Text'].apply(split_and_remove_stopwords)

print(train_df['Text'])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0                    [Enjoying, beautiful, day, park, !]
1                        [Traffic, terrible, morning, .]
2                        [finished, amazing, workout, !]
3               [Excited, upcoming, weekend, getaway, !]
4              [Trying, new, recipe, dinner, tonight, .]
                             ...                        
727    [Collaborating, science, project, received, re...
728    [Attending, surprise, birthday, party, organiz...
729    [Successfully, fundraising, school, charity, i...
730    [Participating, multicultural, festival, ,, ce...
731    [Organizing, virtual, talent, show, challengin...
Name: Text, Lengt

In [7]:
from gensim.models import Word2Vec
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Train Word2Vec using skip-gram with increased epochs (50)
sentences = train_df['Text'].tolist()
w2v_model = Word2Vec(sentences, vector_size=300, window=7, min_count=2, workers=4, sg=1, negative=10, epochs=50)
w2v_model.save("word2vec.model")

def get_sentence_vector(sentence, model, vector_size=300):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        vectors = np.array(vectors)
        avg_vec = np.mean(vectors, axis=0)
        max_vec = np.max(vectors, axis=0)
        min_vec = np.min(vectors, axis=0)
        return np.concatenate([avg_vec, max_vec, min_vec])
    else:
        return np.zeros(vector_size * 3)

train_df['vector'] = train_df['Text'].apply(lambda x: get_sentence_vector(x, w2v_model))

label_encoder = LabelEncoder()
train_df['Sentiment'] = label_encoder.fit_transform(train_df['Sentiment'])
X = np.vstack(train_df['vector'].values)
y = train_df['Sentiment'].values.astype(np.int32)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = Sequential([
        Dense(2048, input_shape=(900,)),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(1024),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(512),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(256),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        Dropout(0.4),
        
        Dense(128),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        Dropout(0.4),
        
        Dense(3, activation='softmax')
    ])
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=1e-6)
    
    model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test), callbacks=[lr_scheduler], verbose=1)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    accuracies.append(accuracy)
    
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 827ms/step - accuracy: 0.4149 - loss: 1.6426 - val_accuracy: 0.4626 - val_loss: 1.0573 - learning_rate: 0.0010
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4777 - loss: 1.4738 - val_accuracy: 0.4830 - val_loss: 1.0295 - learning_rate: 0.0010
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5378 - loss: 1.2478 - val_accuracy: 0.4830 - val_loss: 1.0501 - learning_rate: 0.0010
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5815 - loss: 1.1801 - val_accuracy: 0.5102 - val_loss: 1.0181 - learning_rate: 0.0010
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5847 - loss: 1.0945 - val_accuracy: 0.5306 - val_loss: 0.9309 - learning_rate: 0.0010
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s