In [2]:
import pandas as pd
# Read the aggregated weather-tweet CSV into a DataFrame for a first look.
df = pd.read_csv(filepath_or_buffer="datasets/weather-agg-DFE.csv")

In [3]:
# Display the column index of the loaded frame (bare expression -> cell output).
df.columns

Index(['_unit_id', '_canary', '_unit_state', '_trusted_judgments',
       '_last_judgment_at',
       'what_emotion_does_the_author_express_specifically_about_the_weather',
       'what_emotion_does_the_author_express_specifically_about_the_weather:confidence',
       'gold_answer', 'tweet_id', 'tweet_text'],
      dtype='object')

In [13]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources used by the text cleaning step below.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# Load data
data = pd.read_csv(filepath_or_buffer='datasets/weather-agg-DFE.csv')

# Preprocess text
# Shared helpers read by clean_text: a lemmatizer instance and the English
# stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one tweet so it can be tokenized.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, tokens present in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are lemmatized
    with the module-level ``lemmatizer``.

    Args:
        text: Raw tweet text. ``None`` or float NaN (an empty CSV cell) is
            treated as an empty tweet; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # NaN is the only float that is not equal to itself; guarding here keeps
    # DataFrame.apply from failing with AttributeError on missing cells.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Raw string so \w and \s reach the regex engine instead of being parsed
    # as (invalid) Python string escapes.
    text = re.sub(r'[^\w\s]', '', str(text).lower())

    # Single pass: drop stopwords first, then lemmatize what is left.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Hyperparameters gathered in one place. VOCAB_SIZE feeds both the tokenizer
# and the embedding layer, which must agree on the number of word indices.
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 100
LSTM_UNITS = 100
EPOCHS = 50
BATCH_SIZE = 64
EARLY_STOPPING_PATIENCE = 5
LABEL_COLUMN = 'what_emotion_does_the_author_express_specifically_about_the_weather'

data['tweet_text'] = data['tweet_text'].apply(clean_text)

# Tokenization and sequence padding
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data['tweet_text'])
sequences = tokenizer.texts_to_sequences(data['tweet_text'])
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Preparing target variable using Label Encoding
# fit_transform already returns an integer ndarray, so no extra np.array()
# conversion is needed; encoder.classes_ holds the label for each id.
encoder = LabelEncoder()
y = encoder.fit_transform(data[LABEL_COLUMN])
num_classes = len(encoder.classes_)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Model architecture
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument and the layer picks up the sequence length from its input.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))  # One output unit per encoded label

# Sparse loss because y holds integer class ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
# The validation split is now used to stop training once val_loss stops
# improving, and the best weights seen are restored before evaluation.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)
model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Predictions and evaluation
predictions = model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)

# Generate confusion matrix
# Passing labels explicitly keeps the matrix num_classes x num_classes even
# when a class is missing from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(num_classes))
print("Confusion Matrix:")
print(cm)

# Legend so the integer ids in the matrix can be mapped back to labels.
print("Label ids (rows = true label, columns = predicted label):")
for class_id, class_name in enumerate(encoder.classes_):
    print(f"  {class_id}: {class_name}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/50




11/11 ━━━━━━━━━━━━━━━━━━━━ 3s 122ms/step - accuracy: 0.2733 - loss: 1.5890 - val_accuracy: 0.2933 - val_loss: 1.4577
Epoch 2/50
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 95ms/step - accuracy: 0.3118 - loss: 1.4134 - val_accuracy: 0.2400 - val_loss: 1.3799
Epoch 3/50
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 94ms/step - accuracy: 0.3577 - loss: 1.3445 - val_accuracy: 0.2933 - val_loss: 1.3624
Epoch 4/50
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 94ms/step - accuracy: 0.4238 - loss: 1.2877 - val_accuracy: 0.2533 - val_loss: 1.3349
Epoch 5/50
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 99ms/step - accuracy: 0.5158 - loss: 1.1958 - val_accuracy: 0.4933 - val_loss: 1.2691
Epoch 6/50
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 101ms/step - accuracy: 0.6655 - loss: 1.0879 - val_accuracy: 0.4667 - val_loss: 1.2273
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━