In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from joblib import dump

# Load and preprocess the dataset
data = pd.read_csv('Twitter_Data.csv')


In [3]:
data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Convert 'clean_text' column to strings
data['clean_text'] = data['selected_text'].astype(str)
data['clean_text'] = data['clean_text'].str.replace('[^a-zA-Z\s]', '').str.lower()


In [5]:
X = data['clean_text']
y = data['sentiment']

In [6]:
unique_sentiments = y.unique()
print("Unique Sentiments:", unique_sentiments)

# Ensure that your labels are numeric
y = y.replace({'negative': 0, 'neutral': 1, 'positive': 2})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Unique Sentiments: ['neutral' 'negative' 'positive']


  y = y.replace({'negative': 0, 'neutral': 1, 'positive': 2})


In [7]:
# Tokenize and pad text sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')



In [8]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One-hot encode labels
num_classes = len(unique_sentiments)
y_train_onehot = tf.keras.utils.to_categorical(y_train_encoded, num_classes=num_classes)
y_test_onehot = tf.keras.utils.to_categorical(y_test_encoded, num_classes=num_classes)

# Build a simple LSTM model with 3 output units


In [9]:
model = tf.keras.Sequential([
    Embedding(input_dim=5000, output_dim=100, input_length=100),
    LSTM(128),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])




In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with a reduced batch size
model.fit(X_train_pad, y_train_onehot, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test_onehot))

# Save the trained model
model.save('sentiment_model_old.h5')
dump(tokenizer, 'tokenizer.joblib')


Epoch 1/5
