<a href="https://colab.research.google.com/github/Salma-Kassem/DeepLearning/blob/main/LSTM_RNN_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import os
import zipfile

# Archive location on the mounted Drive, and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/Classroom/Twitter_Sentiment_Analysis.zip'
extract_path = '/content/extracted_data'

# exist_ok=True means an already-present folder is not an error.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the target folder.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction complete.")


Extraction complete.


In [None]:
import pandas as pd

# Column labels are supplied explicitly instead of being read from the file.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# Because `names` is given, pandas treats the first row as data, not as a header.
df = pd.read_csv(
    '/content/extracted_data/training.csv',
    names=column_names,
    encoding='cp1252',
)

# Show the first rows as a quick sanity check.
df.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf

# Relabel every 4 in the target column as 1.
df['target'] = df['target'].replace({4: 1})

# Stage 1: keep 80% for training and hold out 20%, stratified on the label.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

# Stage 2: cut the held-out 20% in half -> 10% validation and 10% test overall.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['target'],
    random_state=42,
)


In [None]:
# Row counts of the training and test splits.
print(train_df.shape[0], test_df.shape[0])

1280000 160000


In [None]:
import tensorflow as tf

# Note: pandas DataFrames have no `.shuffle(...)` method; that method exists only
# on tf.data.Dataset objects, so shuffling is applied after conversion.

# Input-pipeline hyperparameters
BUFFER_SIZE = 10_000  # shuffle buffer size, in elements
BATCH_SIZE = 128      # examples per batch

# Convert a DataFrame into a batched, prefetched tf.data.Dataset.
def df_to_dataset(dataframe, shuffle=True, batch_size=BATCH_SIZE, buffer_size=None):
    """Build a tf.data.Dataset of (text, target) pairs from a DataFrame.

    Args:
        dataframe: DataFrame providing a 'text' column and a 'target' column.
        shuffle: When True, shuffle the individual examples before batching.
        batch_size: Number of examples per batch.
        buffer_size: Size of the shuffle buffer. ``None`` (the default) uses the
            module-level ``BUFFER_SIZE`` as it is at call time, which is what
            the function did before this parameter existed. Ignored when
            ``shuffle`` is False.

    Returns:
        A tf.data.Dataset yielding ``(text_batch, target_batch)`` tuples,
        batched and prefetched with ``tf.data.AUTOTUNE``.
    """
    if buffer_size is None:
        # Resolve lazily so a later change to BUFFER_SIZE is still honoured.
        buffer_size = BUFFER_SIZE

    texts = dataframe['text'].values
    targets = dataframe['target'].values
    ds = tf.data.Dataset.from_tensor_slices((texts, targets))
    if shuffle:
        # Shuffle before batching so examples, not whole batches, are mixed.
        ds = ds.shuffle(buffer_size=buffer_size)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build the three input pipelines; only the training split is shuffled.
train_dataset, val_dataset, test_dataset = (
    df_to_dataset(split_df, shuffle=do_shuffle)
    for split_df, do_shuffle in (
        (train_df, True),
        (val_df, False),
        (test_df, False),
    )
)


In [None]:
# Peek at the first three examples of a single training batch.
# `text` and `target` stay bound after the loop ends.
for text, target in train_dataset.take(1):
    print(f"texts:  {text.numpy()[:3]}")
    print()
    print(f"targets:  {target.numpy()[:3]}")

texts:  [b'250km to go. Torture. And iPhone almost dead   http://tinyurl.com/crtkd8'
 b'@Hammerette_ Very similar to babybel on toast '
 b"yay!!! my cousin gabby's coming tomorrow!! i've missed her. oh and i miss my other favorite cousin lizzie and eric!!!! i miss you guys!!! "]

targets:  [0 1 0]


In [None]:
VOCAB_SIZE = 1000

# Text-to-token-id layer whose vocabulary is capped at VOCAB_SIZE entries.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the training text only; the labels are dropped.
encoder.adapt(
    train_dataset.map(lambda tweet, _label: tweet)
)

In [None]:
# `text` is the batch of tweets left bound by the preview loop over
# train_dataset.take(1). The whole batch is encoded and then sliced, so the
# three rows keep the padded width of the full batch.
encoded_example = encoder(text)[:3].numpy()

# Print only the three tweets that match the encoded rows, rather than the
# entire batch, so the text and its encoding can be compared side by side.
print(text[:3])
encoded_example

tf.Tensor(
[b'250km to go. Torture. And iPhone almost dead   http://tinyurl.com/crtkd8'
 b'@Hammerette_ Very similar to babybel on toast '
 b"yay!!! my cousin gabby's coming tomorrow!! i've missed her. oh and i miss my other favorite cousin lizzie and eric!!!! i miss you guys!!! "
 b'its raining, the washing is out n im in my pjs, wat do i do '
 b'My hands are all stained from card making '
 b"I just heard that Jon &amp; Kate are getting a divorce. How could they do that?! What about the kids?! All 8 of them! I hope they'll be ok "
 b'@tommcfly http://twitpic.com/4ed3x - cant wait for Brazil!   Harry and Dougie look lost AND hot!'
 b'Its so quiet around here now ' b'White hot chocolate... Yum. '
 b"@here4cheer oh no!!!!     nobody can take the ice from you.....  it's like your baby."
 b'is relaxing. Have a migraine again. All well. 2 months till i turn 19!woot '
 b"@bigbrightbulb, that's good to hear.  And I agree in general on No Bullshit policies. Using them in business too? "
 b'@su

array([[  1,   3,  39,   1,   7, 357, 346, 613,   1,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  1, 118,   1,   3,   1,  15,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [207,   6,   1,   1, 263,  98, 132, 244, 102,  82,   7,   2,  85,
          6, 255, 557,   1,   1,   7,   1,   2,  85,   8, 198,   0,   0,
          0,   0,   0]])

In [None]:
# Lengths of the first two encoded rows, one per line.
print(len(encoded_example[0]), len(encoded_example[1]), sep='\n')

29
29


# **Simple RNN**

In [None]:
import tensorflow as tf

# SimpleRNN classifier: raw text -> token ids -> embeddings -> RNN -> dense head.
model = tf.keras.Sequential([
    # Turns the raw tweet strings into padded integer sequences.
    encoder,
    # One 64-dimensional vector per vocabulary entry; mask_zero=True masks
    # the zero padding ids.
    tf.keras.layers.Embedding(
        len(encoder.get_vocabulary()),
        64,
        mask_zero=True,
    ),
    # Recurrent layer with 32 units (SimpleRNN variant of the architecture).
    tf.keras.layers.SimpleRNN(units=32),
    tf.keras.layers.Dense(units=64, activation='relu'),
    # A single sigmoid unit as the model output.
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])


In [None]:
# Binary cross-entropy matches the 0/1 labels; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
import time

# Time the training run. perf_counter() is a monotonic, high-resolution clock
# intended for measuring elapsed intervals; time.time() follows the system
# wall clock, which can be adjusted while training is in progress.
start_time = time.perf_counter()
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
)
end_time = time.perf_counter()

# Elapsed training duration in seconds.
training_time = end_time - start_time
print(f"Training Time: {training_time:.2f} seconds")


Epoch 1/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 52ms/step - accuracy: 0.7532 - loss: 0.4971 - val_accuracy: 0.7783 - val_loss: 0.4608
Epoch 2/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 53ms/step - accuracy: 0.7809 - loss: 0.4577 - val_accuracy: 0.7865 - val_loss: 0.4474
Epoch 3/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m566s[0m 53ms/step - accuracy: 0.7893 - loss: 0.4445 - val_accuracy: 0.7894 - val_loss: 0.4465
Epoch 4/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m567s[0m 53ms/step - accuracy: 0.7935 - loss: 0.4377 - val_accuracy: 0.7922 - val_loss: 0.4415
Epoch 5/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 54ms/step - accuracy: 0.7959 - loss: 0.4336 - val_accuracy: 0.7917 - val_loss: 0.4411
Training Time: 2748.74 seconds


In [None]:
# Score the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(test_dataset)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.7939 - loss: 0.4380
Test Loss: 0.44003427028656006
Test Accuracy: 0.7923374772071838




# **LSTM**



In [None]:
import tensorflow as tf

# LSTM classifier: raw text -> token ids -> embeddings -> LSTM -> dense head.
model = tf.keras.Sequential([
    # Turns the raw tweet strings into padded integer sequences.
    encoder,
    # mask_zero=True enables masking to handle the variable sequence lengths.
    tf.keras.layers.Embedding(
        len(encoder.get_vocabulary()),
        64,
        mask_zero=True,
    ),
    # Unidirectional LSTM with 32 units.
    # (Alternative left disabled: tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)).)
    tf.keras.layers.LSTM(units=32),
    tf.keras.layers.Dense(units=64, activation='relu'),
    # A single sigmoid unit as the model output.
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy matches the 0/1 labels; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
import time

# Time the training run. perf_counter() is a monotonic, high-resolution clock
# intended for measuring elapsed intervals; time.time() follows the system
# wall clock, which can be adjusted while training is in progress.
start_time = time.perf_counter()
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
)
end_time = time.perf_counter()

# Elapsed training duration in seconds.
training_time = end_time - start_time
print(f"Training Time: {training_time:.2f} seconds")


Epoch 1/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 8ms/step - accuracy: 0.7632 - loss: 0.4842 - val_accuracy: 0.7876 - val_loss: 0.4451
Epoch 2/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 8ms/step - accuracy: 0.7896 - loss: 0.4418 - val_accuracy: 0.7930 - val_loss: 0.4358
Epoch 3/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 8ms/step - accuracy: 0.7970 - loss: 0.4300 - val_accuracy: 0.7980 - val_loss: 0.4275
Epoch 4/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 8ms/step - accuracy: 0.8019 - loss: 0.4223 - val_accuracy: 0.7999 - val_loss: 0.4263
Epoch 5/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 9ms/step - accuracy: 0.8046 - loss: 0.4169 - val_accuracy: 0.8011 - val_loss: 0.4229
Training Time: 416.21 seconds


In [None]:
# Score the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(test_dataset)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8044 - loss: 0.4193
Test Loss: 0.4220362901687622
Test Accuracy: 0.8024062514305115
