In [1]:
#!pip install streamlit

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pickle

In [3]:
def load_data(filepath):
    columns = ['target', 'id', 'date', 'flag', 'user', 'text']
    data = pd.read_csv(filepath, encoding = "latin-1", names=columns)
    return data

In [4]:
df = load_data("/content/training.1600000.processed.noemoticon.csv")

  data = pd.read_csv(filepath, encoding = "latin-1", names=columns)


In [5]:
df

Unnamed: 0,target,id,date,flag,user,text
0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1048568,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum
1048569,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...
1048570,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...
1048571,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?


In [6]:
df.describe()

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,target,id,date,flag,user,text
count,1048573,1048573,1048573,1048573,1048573,1048573
unique,4,1048462,662451,2,511365,1036133
top,0,1957692870,Fri May 22 05:10:17 PDT 2009,NO_QUERY,lost_dog,isPlayer Has Died! Sorry
freq,668925,2,17,1048572,549,210


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048573 entries, 0 to 1048572
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1048573 non-null  object
 1   id      1048573 non-null  object
 2   date    1048573 non-null  object
 3   flag    1048573 non-null  object
 4   user    1048573 non-null  object
 5   text    1048573 non-null  object
dtypes: object(6)
memory usage: 48.0+ MB


In [8]:
df['text'][5]

'@Kwesidei not the whole crew '

In [9]:
df['text'][7]

"@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?"

In [10]:
df['tweet'] = df['text'].str.lower()

In [11]:
df['tweet'][7]

"@loltrish hey  long time no see! yes.. rains a bit ,only a bit  lol , i'm fine thanks , how's you ?"

In [12]:
df['tweet'] = df['tweet'].apply(lambda x:re.sub(r'[^a-zA-Z\s]',"",x))

In [13]:
df['tweet'][5]

'kwesidei not the whole crew '

In [14]:
df['tweet'][7]

'loltrish hey  long time no see yes rains a bit only a bit  lol  im fine thanks  hows you '

In [15]:
df["tweet_tokens"] = df['tweet'].apply(lambda x:x.split())

In [16]:
df["tweet_tokens"][7]

['loltrish',
 'hey',
 'long',
 'time',
 'no',
 'see',
 'yes',
 'rains',
 'a',
 'bit',
 'only',
 'a',
 'bit',
 'lol',
 'im',
 'fine',
 'thanks',
 'hows',
 'you']

In [17]:
lemma = WordNetLemmatizer()

In [18]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
len(stop_words)

179

In [20]:
df['tweet_refine'] = df['tweet_tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [21]:
df['tweet_refine']

Unnamed: 0,tweet_refine
0,"[text, tweet]"
1,"[upset, cant, update, facebook, texting, might..."
2,"[kenichan, dived, many, times, ball, managed, ..."
3,"[whole, body, feels, itchy, like, fire]"
4,"[nationwideclass, behaving, im, mad, cant, see]"
...,...
1048568,"[grandma, making, dinenr, mum]"
1048569,"[midmorning, snack, time, bowl, cheese, noodle..."
1048570,"[shadela, say, like, terminiator, movies, come..."
1048571,"[destinyhope, im, great, thaanks, wbuu]"


In [22]:
nltk.download('wordnet')
df['tweet_refine'] = df['tweet_refine'].apply(lambda x: [lemma.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [23]:
df['tweet_refine'][5634]

['kinsey',
 'whearty',
 'put',
 'much',
 'milk',
 'coffee',
 'chicory',
 'oh',
 'sadness',
 'ala']

In [24]:
X = df["tweet_refine"]
y = df['target']

In [25]:
len(X[2452])

5

In [26]:
tokenizer = Tokenizer(num_words=100000,oov_token="<OOV>")

In [27]:
tokenizer.fit_on_texts(X)

In [28]:
X_tokenized = tokenizer.texts_to_sequences(X)

In [29]:
len(X_tokenized[454])

8

In [30]:
X[2452]

['nasty', 'scraping', 'noise', 'back', 'car']

In [31]:
X_padded = pad_sequences(X_tokenized, maxlen=50)

In [32]:
len(X_padded[534])

50

In [33]:
(X_padded)

array([[    0,     0,     0, ...,     0,   429,   106],
       [    0,     0,     0, ...,    11,   191,   902],
       [    0,     0,     0, ...,   385,     5,  3115],
       ...,
       [    0,     0,     0, ...,    53,     6,   412],
       [    0,     0,     0, ...,    61, 38316, 93523],
       [    0,     0,     0, ...,   398,   609,    81]], dtype=int32)

In [34]:
y.unique()

array(['polarity of tweet\xa0', '0', 0, 4], dtype=object)

In [35]:
# Build LSTM model
def build_lstm_model(vocab_size, embedding_dim=100, max_len=50):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [36]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=4, batch_size=64):
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1
    )
    return history

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [38]:
vocab_size = len(tokenizer.word_index) + 1

In [39]:
vocab_size

545988

In [40]:
model = build_lstm_model(vocab_size)



In [41]:
import numpy as np

# Assuming X_train, X_val, y_train, y_val are your NumPy arrays

# Convert the input features to float32
X_train = X_train.astype(np.float32)
X_val = X_val.astype(np.float32)

# Convert the target variables to float32 if they are numerical
y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)

In [42]:
history = train_model(model, X_train, y_train, X_val, y_val)

Epoch 1/4
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 39ms/step - accuracy: 0.3732 - loss: -8820.8916 - val_accuracy: 0.4967 - val_loss: -71352.5547
Epoch 2/4
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 39ms/step - accuracy: 0.4093 - loss: -118011.9297 - val_accuracy: 0.4976 - val_loss: -256965.8750
Epoch 3/4
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 39ms/step - accuracy: 0.4181 - loss: -368519.9062 - val_accuracy: 0.5457 - val_loss: -577443.5000
Epoch 4/4
[1m10486/10486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 39ms/step - accuracy: 0.4343 - loss: -761701.6875 - val_accuracy: 0.5538 - val_loss: -992803.5000


In [43]:
 # Save the model and tokenizer
model.save('sentiment_model.h5')
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [44]:
# Evaluate the model on validation data
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")



[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.5528 - loss: -987664.5000
Validation Loss: -992804.75
Validation Accuracy: 0.5538111329078674


In [48]:
model.summary()

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the saved model and tokenizer
from tensorflow.keras.models import load_model
model = load_model('sentiment_model.h5')

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Function to predict sentiment for a single input
def SentimentAnalysis(text):
    sentence = [text]
    tokenized_sentence = tokenizer.texts_to_sequences(sentence)
    input_sequence = pad_sequences(tokenized_sentence, maxlen=50, padding='pre')

    prediction_ = model.predict(input_sequence)
    print(prediction_)

    # Binary classification, threshold of 0.5 to classify as Positive or Negative
    prediction = (prediction_ > 0.5).astype(int)[0][0]
    print(f"Predicted class: {prediction}")

    sentiment_classes = ['Negative', 'Positive']

    if prediction in range(2):
        confidence = prediction_[0][0] if prediction == 1 else 1 - prediction_[0][0]
        print(f"Sentiment: {sentiment_classes[prediction]} [confidence - {confidence * 100:.2f}%]")

    # Plot confidence for binary classification
    plot = pd.DataFrame([confidence], columns=['Confidence'])
    plot.plot(kind='barh')

# Now you can test this function with your data.

# Test the model on test data
def test_on_data(test_data):
    # Assuming 'test_data' is a list of text entries for testing
    for text in test_data:
        print(f"Testing on: {text}")
        SentimentAnalysis(text)
        print('-' * 50)

# User input to get predictions
def predict_user_input():
    while True:
        user_input = input("Enter a sentence to analyze sentiment (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        SentimentAnalysis(user_input)

# Example test cases
test_data = [
    "I love this product, it is amazing!",
    "This is the worst service I have ever experienced.",
    "It's okay, not bad but not great either.",
    "Absolutely wonderful experience!",
    "I am so disappointed, it's terrible!"
]

# Test on the sample test data
test_on_data(test_data)

# Allow user input for custom predictions
predict_user_input()




Testing on: I love this product, it is amazing!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step
[[1.]]
Predicted class: 1
Sentiment: Positive [confidence - 100.00%]
--------------------------------------------------
Testing on: This is the worst service I have ever experienced.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[[0.08586126]]
Predicted class: 0
Sentiment: Negative [confidence - 91.41%]
--------------------------------------------------
Testing on: It's okay, not bad but not great either.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[[0.04076001]]
Predicted class: 0
Sentiment: Negative [confidence - 95.92%]
--------------------------------------------------
Testing on: Absolutely wonderful experience!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[[1.]]
Predicted class: 1
Sentiment: Positive [confidence - 100.00%]
--------------------------------------------------
Testing 