In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [2]:
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 80% 65.0M/80.9M [00:00<00:00, 207MB/s]
100% 80.9M/80.9M [00:00<00:00, 213MB/s]


In [3]:
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'
with ZipFile(dataset, 'r') as zip:
  zip.extractall()
  print('the dataset is extracted')

the dataset is extracted


In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

# Inspect the dataset
print(df.head())


   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....           

In [7]:
df.shape

(1599999, 6)

In [8]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names = column_names, encoding = 'ISO-8859-1')

In [9]:
twitter_data.head()
twitter_data.shape

(1600000, 6)

In [10]:
twitter_data.replace({'target':{'positive':1}}, inplace=True)
twitter_data.replace({'target':{'negative':0}}, inplace=True)

In [13]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [18]:
def clean_tweet(text):
    # Remove URLs, mentions, hashtags, special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

# Apply cleaning
# Clean the tweets (now referencing the correct column 'text' and DataFrame 'twitter_data')
twitter_data['cleaned_tweet'] = twitter_data['text'].apply(clean_tweet)


In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize tokenizer
vocab_size = 10000  # Adjust vocabulary size based on your dataset
oov_token = "<OOV>"  # Token for out-of-vocabulary words
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# Fit tokenizer on the cleaned text
tokenizer.fit_on_texts(twitter_data['cleaned_tweet'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(twitter_data['cleaned_tweet'])


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define maximum length for padding
max_length = 50  # You can adjust this based on your text length distribution
padding_type = 'post'  # Add padding at the end of sequences
truncating_type = 'post'  # Truncate sequences at the end if they exceed max_length

# Pad the sequences
padded_tweets = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)


In [21]:
# Check a few examples
print("Original text:", twitter_data['cleaned_tweet'][0])
print("Tokenized sequence:", sequences[0])
print("Padded sequence:", padded_tweets[0])


Original text:    a thats a bummer  you shoulda got david carr of third day to do it d
Tokenized sequence: [5, 102, 5, 1207, 8, 3427, 49, 863, 9710, 13, 1842, 32, 3, 41, 10, 384]
Padded sequence: [   5  102    5 1207    8 3427   49  863 9710   13 1842   32    3   41
   10  384    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [25]:
# Map the target column: 4 -> 1 for positive sentiment
twitter_data['encoded_sentiment'] = twitter_data['target'].map({0: 0, 4: 1})

# Check the unique values in the new column to confirm
print(twitter_data['encoded_sentiment'].unique())



[0 1]


In [26]:
from sklearn.model_selection import train_test_split

# Features and labels
X = padded_tweets
y = twitter_data['encoded_sentiment'].values  # Use 'encoded_sentiment' for labels

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
print(X_train.shape, X_test.shape)  # Check the shape of input data
print(y_train.shape, y_test.shape)  # Check the shape of labels


(1280000, 50) (320000, 50)
(1280000,) (320000,)


In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the model
model = Sequential()

# Embedding layer for word embeddings
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=50))  # You can adjust output_dim as needed

# LSTM layer
model.add(LSTM(units=128, return_sequences=False))

# Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Dense output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Summary of the model
model.summary()




In [29]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2344s[0m 117ms/step - accuracy: 0.5871 - loss: 0.6451 - val_accuracy: 0.8109 - val_loss: 0.4107
Epoch 2/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2354s[0m 118ms/step - accuracy: 0.8182 - loss: 0.3990 - val_accuracy: 0.8226 - val_loss: 0.3904
Epoch 3/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2401s[0m 120ms/step - accuracy: 0.8320 - loss: 0.3729 - val_accuracy: 0.8254 - val_loss: 0.3862
Epoch 4/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2402s[0m 119ms/step - accuracy: 0.8422 - loss: 0.3547 - val_accuracy: 0.8260 - val_loss: 0.3842
Epoch 5/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2439s[0m 120ms/step - accuracy: 0.8506 - loss: 0.3380 - val_accuracy: 0.8261 - val_loss: 0.3884


In [30]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 29ms/step - accuracy: 0.8251 - loss: 0.3878
Test Loss: 0.3883807957172394
Test Accuracy: 0.8260906338691711
