In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tensorflow import keras
import os
import matplotlib.pyplot as plt
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv("Tweets.csv")

In [3]:
df.shape

(27481, 4)

In [4]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
print((df.sentiment == "neutral").sum())
print((df.sentiment == "negative").sum())
print((df.sentiment == "positive").sum())

11118
7781
8582


In [6]:
import re
import string 

def remove_URL(text):
    if isinstance(text, str):
        url = re.compile(r"https?://\S+|www\.\S+")
        return url.sub(r"", text)
    else:
        return text

def remove_punct(text):
    if isinstance(text, str):
        translator = str.maketrans("", "", string.punctuation)
        return text.translate(translator)
    else: 
        return text

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth
www.dothebouncy
 - some shameless plugging for the best Rangers forum on earth


In [8]:
df["text"] = df.text.map(remove_URL)
df["text"] = df.text.map(remove_punct)

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    if isinstance(text, str):
        filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
        return " ".join(filtered_words)
    else:
        return text

df["text"] = df.text.map(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
df["text"] = df.text.map(remove_stopwords)

In [12]:
df.text

0                                       id responded going
1                                  sooo sad miss san diego
2                                            boss bullying
3                                    interview leave alone
4                 sons couldnt put releases already bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ive wondered rake client made clear net dont f...
27478    yay good enjoy break probably need hectic week...
27479                                                worth
27480                   flirting going atg smiles yay hugs
Name: text, Length: 27481, dtype: object

In [13]:
from collections import Counter

def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        if isinstance(text, str):
            for word in text.split():
                count[word] +=1
    return count

counter = counter_word(df.text) 

In [14]:
len(counter)

27773

In [15]:
counter

Counter({'im': 3024,
         'day': 2044,
         'good': 1549,
         'get': 1426,
         'like': 1346,
         'go': 1267,
         'dont': 1200,
         'love': 1122,
         'work': 1112,
         'going': 1096,
         'today': 1096,
         'got': 1073,
         'cant': 1021,
         'happy': 976,
         'one': 971,
         'lol': 949,
         'time': 943,
         'know': 932,
         'u': 923,
         'really': 908,
         'back': 891,
         'see': 797,
         'well': 744,
         'new': 740,
         'night': 737,
         'home': 717,
         'mothers': 717,
         'want': 714,
         'think': 705,
         'still': 679,
         '2': 665,
         'oh': 663,
         'much': 659,
         'thanks': 657,
         'miss': 614,
         'great': 611,
         'hope': 589,
         'last': 581,
         'need': 570,
         'morning': 556,
         'haha': 555,
         'thats': 551,
         'ill': 534,
         'fun': 527,
         'feel': 519,


In [16]:
counter.most_common(5)

[('im', 3024), ('day', 2044), ('good', 1549), ('get', 1426), ('like', 1346)]

In [17]:
num_unique_words = len(counter)

In [18]:
train_size = int(df.shape[0] * 0.8)
train_df = df[:train_size]
val_df = df[train_size:]

In [19]:
train_sentences = train_df.text.to_numpy()
train_labels = train_df.sentiment.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.sentiment.to_numpy()

In [20]:
train_sentences.shape, val_sentences.shape

((21984,), (5497,))

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
train_sentences = [text for text in train_sentences if isinstance(text, str)]
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) 
word_index = tokenizer.word_index

In [22]:
word_index = tokenizer.word_index

In [23]:
word_index

{'im': 1,
 'day': 2,
 'good': 3,
 'get': 4,
 'like': 5,
 'go': 6,
 'dont': 7,
 'love': 8,
 'going': 9,
 'today': 10,
 'work': 11,
 'got': 12,
 'cant': 13,
 'one': 14,
 'happy': 15,
 'lol': 16,
 'know': 17,
 'u': 18,
 'time': 19,
 'back': 20,
 'really': 21,
 'see': 22,
 'well': 23,
 'new': 24,
 'home': 25,
 'think': 26,
 'night': 27,
 'want': 28,
 'mothers': 29,
 'still': 30,
 'thanks': 31,
 '2': 32,
 'oh': 33,
 'much': 34,
 'great': 35,
 'hope': 36,
 'miss': 37,
 'need': 38,
 'last': 39,
 'morning': 40,
 'thats': 41,
 'haha': 42,
 'ill': 43,
 'fun': 44,
 'feel': 45,
 'tomorrow': 46,
 'twitter': 47,
 'wish': 48,
 'would': 49,
 'didnt': 50,
 'sad': 51,
 'sorry': 52,
 'bad': 53,
 'tonight': 54,
 'right': 55,
 'week': 56,
 'nice': 57,
 'yeah': 58,
 'gonna': 59,
 'better': 60,
 'make': 61,
 'though': 62,
 'way': 63,
 '3': 64,
 'ive': 65,
 'sleep': 66,
 'come': 67,
 'could': 68,
 'weekend': 69,
 'getting': 70,
 'bed': 71,
 'watching': 72,
 'next': 73,
 'people': 74,
 'youre': 75,
 'awesome':

In [24]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [25]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['much love hopeful reckon chances minimal p im never gonna get cake stuff', 'really really like song love story taylor swift', 'sharpie running dangerously low ink', 'want go music tonight lost voice', 'test test lg env2']
[[34, 8, 2472, 2774, 3174, 8867, 495, 1, 88, 59, 4, 580, 218], [21, 21, 5, 190, 8, 659, 956, 2058], [8868, 415, 8869, 746, 5844], [28, 6, 206, 54, 185, 925], [460, 460, 4488, 8870]]


In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 20
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post")
train_padded.shape, val_padded.shape

((21983, 20), (5497, 20))

In [28]:
train_padded[10]

array([  34,    8, 2472, 2774, 3174, 8867,  495,    1,   88,   59,    4,
        580,  218,    0,    0,    0,    0,    0,    0,    0])

In [29]:
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [30]:
reverse_word_index

{1: 'im',
 2: 'day',
 3: 'good',
 4: 'get',
 5: 'like',
 6: 'go',
 7: 'dont',
 8: 'love',
 9: 'going',
 10: 'today',
 11: 'work',
 12: 'got',
 13: 'cant',
 14: 'one',
 15: 'happy',
 16: 'lol',
 17: 'know',
 18: 'u',
 19: 'time',
 20: 'back',
 21: 'really',
 22: 'see',
 23: 'well',
 24: 'new',
 25: 'home',
 26: 'think',
 27: 'night',
 28: 'want',
 29: 'mothers',
 30: 'still',
 31: 'thanks',
 32: '2',
 33: 'oh',
 34: 'much',
 35: 'great',
 36: 'hope',
 37: 'miss',
 38: 'need',
 39: 'last',
 40: 'morning',
 41: 'thats',
 42: 'haha',
 43: 'ill',
 44: 'fun',
 45: 'feel',
 46: 'tomorrow',
 47: 'twitter',
 48: 'wish',
 49: 'would',
 50: 'didnt',
 51: 'sad',
 52: 'sorry',
 53: 'bad',
 54: 'tonight',
 55: 'right',
 56: 'week',
 57: 'nice',
 58: 'yeah',
 59: 'gonna',
 60: 'better',
 61: 'make',
 62: 'though',
 63: 'way',
 64: '3',
 65: 'ive',
 66: 'sleep',
 67: 'come',
 68: 'could',
 69: 'weekend',
 70: 'getting',
 71: 'bed',
 72: 'watching',
 73: 'next',
 74: 'people',
 75: 'youre',
 76: 'aweso

In [31]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [32]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[34, 8, 2472, 2774, 3174, 8867, 495, 1, 88, 59, 4, 580, 218]
much love hopeful reckon chances minimal p im never gonna get cake stuff


In [33]:
from tensorflow.keras import layers
model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(3, activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            888736    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 3)                 195       
                                                                 
Total params: 913763 (3.49 MB)
Trainable params: 913763 (3.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
loss = keras.losses.SparseCategoricalCrossentropy()
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]
model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [35]:
sentiment_mapping = {"neutral": 0, "positive": 1, "negative": 2}
train_labels = np.array([sentiment_mapping[label] for label in train_labels])
val_labels = np.array([sentiment_mapping[label] for label in val_labels])

In [37]:
train_labels = train_labels[:-1]

In [38]:
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/20
687/687 - 25s - loss: 1.0883 - accuracy: 0.4022 - val_loss: 1.0861 - val_accuracy: 0.4128 - 25s/epoch - 36ms/step
Epoch 2/20
687/687 - 15s - loss: 1.0713 - accuracy: 0.4203 - val_loss: 1.1035 - val_accuracy: 0.3882 - 15s/epoch - 22ms/step
Epoch 3/20
687/687 - 21s - loss: 0.9292 - accuracy: 0.5502 - val_loss: 1.2283 - val_accuracy: 0.3593 - 21s/epoch - 31ms/step
Epoch 4/20
687/687 - 18s - loss: 0.7034 - accuracy: 0.6974 - val_loss: 1.4056 - val_accuracy: 0.3682 - 18s/epoch - 26ms/step
Epoch 5/20
687/687 - 20s - loss: 0.5443 - accuracy: 0.7648 - val_loss: 1.5928 - val_accuracy: 0.3673 - 20s/epoch - 29ms/step
Epoch 6/20
687/687 - 20s - loss: 0.4476 - accuracy: 0.8038 - val_loss: 1.8225 - val_accuracy: 0.3391 - 20s/epoch - 28ms/step
Epoch 7/20
687/687 - 18s - loss: 0.3855 - accuracy: 0.8259 - val_loss: 1.8534 - val_accuracy: 0.3551 - 18s/epoch - 26ms/step
Epoch 8/20
687/687 - 19s - loss: 0.3420 - accuracy: 0.8426 - val_loss: 2.3056 - val_accuracy: 0.3589 - 19s/epoch - 28ms/step


<keras.src.callbacks.History at 0x194ebcbda90>

In [39]:
predictions = model.predict(train_padded)
predicted_labels = [np.argmax(pred) for pred in predictions]



In [40]:
everse_sentiment_mapping = {v: k for k, v in sentiment_mapping.items()}
predicted_labels = [reverse_sentiment_mapping[label] for label in predicted_labels]

In [41]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(predicted_labels[10:20])

['much love hopeful reckon chances minimal p im never gonna get cake stuff', 'really really like song love story taylor swift', 'sharpie running dangerously low ink', 'want go music tonight lost voice', 'test test lg env2', 'uh oh sunburned', 'sok trying plot alternatives speak sigh', 'ive sick past days thus hair looks wierd didnt hat would look', 'back home gonna miss every one', 'hes']
[0 1 2 2 0 2 2 2 2 0]
['neutral', 'positive', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'negative', 'neutral', 'neutral']
