In [1]:
!pip install tensorflow pandas numpy scikit-learn




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Dropout


In [5]:
df = pd.read_csv("Tweets.csv")   

df = df[['text', 'sentiment']]      # keep only required columns
df.dropna(inplace=True)

df.head()


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [10]:
# ---------------------------
# 1. IMPORT LIBRARIES
# ---------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ---------------------------
# 2. LOAD DATASET
# ---------------------------
df = pd.read_csv("Tweets.csv", encoding='latin1', on_bad_lines='skip')
df.columns = df.columns.str.lower().str.strip()  # clean column names

print("Columns found:", df.columns)

# ---------------------------
# 3. FIND TEXT + LABEL COLUMNS
# ---------------------------
possible_text_cols = ["text", "tweet", "content", "sentimenttext"]
possible_label_cols = ["target", "label", "sentiment"]

text_col = next((c for c in possible_text_cols if c in df.columns), None)
label_col = next((c for c in possible_label_cols if c in df.columns), None)

print("Using text col:", text_col)
print("Using label col:", label_col)

# ---------------------------
# 4. CLEAN DATA
# ---------------------------
df = df[[text_col, label_col]]
df.dropna(inplace=True)

# ---------------------------
# 5. LABEL ENCODING (FIX FOR YOUR ERROR)
# ---------------------------
df[label_col] = df[label_col].astype(str).str.lower().str.strip()

label_map = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
    "0": 0,
    "1": 1,
    "2": 2
}

df[label_col] = df[label_col].map(label_map)

# Remove rows where label is not mapped
df = df.dropna(subset=[label_col])

texts = df[text_col].astype(str).values
labels = df[label_col].astype(int).values

print("Unique labels after encoding:", np.unique(labels))

# ---------------------------
# 6. TOKENIZATION + PADDING
# ---------------------------
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# ---------------------------
# 7. TRAIN-TEST SPLIT
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, random_state=42
)

print("Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

print("✅ Preprocessing complete!")


Columns found: Index(['textid', 'text', 'selected_text', 'sentiment'], dtype='object')
Using text col: text
Using label col: sentiment
Unique labels after encoding: [0 1 2]
Shapes:
X_train: (21984, 100)
X_test: (5496, 100)
y_train: (21984,)
y_test: (5496,)
✅ Preprocessing complete!


In [11]:
cnn_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cnn_history = cnn_model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2
)




Epoch 1/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 248ms/step - accuracy: 0.4029 - loss: -4328.9565 - val_accuracy: 0.4121 - val_loss: -30107.7070
Epoch 2/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 181ms/step - accuracy: 0.4188 - loss: -292852.9688 - val_accuracy: 0.4191 - val_loss: -831243.8125
Epoch 3/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 170ms/step - accuracy: 0.4209 - loss: -2199320.2500 - val_accuracy: 0.4260 - val_loss: -4227048.5000
Epoch 4/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 178ms/step - accuracy: 0.4246 - loss: -7761534.5000 - val_accuracy: 0.4182 - val_loss: -12384933.0000
Epoch 5/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 184ms/step - accuracy: 0.4221 - loss: -18968666.0000 - val_accuracy: 0.4203 - val_loss: -27211530.0000


In [17]:
cnn_pred = (cnn_model.predict(X_test) > 0.5).astype("int32")

print("CNN Accuracy:", accuracy_score(y_test, cnn_pred))
print(classification_report(y_test, cnn_pred))


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 59ms/step
CNN Accuracy: 0.42358078602620086
              precision    recall  f1-score   support

           0       0.93      0.06      0.12      1572
           1       0.41      1.00      0.58      2236
           2       0.00      0.00      0.00      1688

    accuracy                           0.42      5496
   macro avg       0.45      0.35      0.23      5496
weighted avg       0.44      0.42      0.27      5496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
rnn_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

rnn_history = rnn_model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2
)




Epoch 1/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 154ms/step - accuracy: 0.4027 - loss: -0.4933 - val_accuracy: 0.4094 - val_loss: -2.0559
Epoch 2/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 140ms/step - accuracy: 0.4026 - loss: -1.6459 - val_accuracy: 0.4094 - val_loss: -4.9339
Epoch 3/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 144ms/step - accuracy: 0.4026 - loss: -3.4752 - val_accuracy: 0.4094 - val_loss: -9.0362
Epoch 4/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 150ms/step - accuracy: 0.4026 - loss: -5.6422 - val_accuracy: 0.4094 - val_loss: -14.3330
Epoch 5/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 172ms/step - accuracy: 0.4026 - loss: -8.5835 - val_accuracy: 0.4094 - val_loss: -20.4578


In [19]:
rnn_pred = (rnn_model.predict(X_test) > 0.5).astype("int32")

print("RNN Accuracy:", accuracy_score(y_test, rnn_pred))
print(classification_report(y_test, rnn_pred))


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step
RNN Accuracy: 0.40684133915574966
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1572
           1       0.41      1.00      0.58      2236
           2       0.00      0.00      0.00      1688

    accuracy                           0.41      5496
   macro avg       0.14      0.33      0.19      5496
weighted avg       0.17      0.41      0.24      5496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
