# **Sentiment Analysis**  

## 1-Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN
from sklearn.model_selection import train_test_split
import re

## 2-Reading Data

In [2]:
# Load the tweet dataset from the notebook's working directory.
# NOTE(review): the displayed column names (2401, Borderlands, Positive) match
# the values of row 0, so the file's first record may be getting consumed as
# the header - confirm against the CSV before renaming anything.
df = pd.read_csv(filepath_or_buffer="twitter_training.csv")

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,2401,Borderlands,Positive,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


## 3-Data Cleaning

In [4]:
# Normalise the tweet column to plain strings before text cleaning.
# Missing tweets are replaced with '' first: a bare astype(str) turns NaN into
# the literal text "nan", which would survive cleaning and be tokenized as if
# it were a real word.
if 'tweet' in df.columns:
    df['tweet'] = df['tweet'].fillna('').astype(str)

In [5]:
def clean_text(text):
    """Normalise a raw tweet so it can be tokenized.

    The text is lower-cased, then hashtags/mentions, http(s) links and every
    character that is not an ASCII letter, a digit or whitespace are removed,
    in that order. Runs of whitespace left behind by the removals are not
    collapsed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # Drop '#tag' / '@user' tokens (the marker plus any following word chars).
    text = re.sub(r'(#|@)\w*', '', text)
    # Drop http/https links. A raw string is used so the regex escapes are not
    # parsed as (invalid) Python string escapes.
    text = re.sub(r'https?://\S+', '', text)
    # Drop everything except ASCII letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [6]:
# Run every tweet through clean_text and write the result back to the column.
cleaned_tweets = df['tweet'].apply(clean_text)
df['tweet'] = cleaned_tweets

In [7]:
# Keep only the rows whose label (stored in the column named 'Positive') is
# either 'Positive' or 'Negative'; every other label is dropped.
# A boolean mask replaces the row-by-row iterrows() loop: it is much faster on
# a large frame and keeps the original column dtypes and index intact.
# .copy() makes the selection an independent frame rather than a view of df.
training_data = df[df['Positive'].isin(['Positive', 'Negative'])].copy()

In [8]:
# Make sure the selected rows are held in a DataFrame for column access below.
training_data = pd.DataFrame(data=training_data)

## 4-Tokenizing

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer; the embedding layer reuses it later.
maxfeatures = 3000

# Words are split on single spaces and lower-cased; 'UNK' is the
# out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=maxfeatures,
    split=' ',
    lower=True,
    oov_token='UNK',
)

# Fit the vocabulary on the cleaned tweets, encode them as integer sequences,
# then pad them (pad_sequences defaults) into a single 2-D array.
tweet_texts = training_data['tweet'].values
tokenizer.fit_on_texts(tweet_texts)
x = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))


In [12]:
# One-hot encode the sentiment labels: one indicator column per distinct label.
label_dummies = pd.get_dummies(training_data['Positive'])
y = label_dummies.values

## 5-Building Model (RNN)

In [13]:
# Hyper-parameters of the network.
Embedding_dimintion = 256   # size of each learned word vector
rnn_units = 196             # hidden units in each SimpleRNN layer

# Embedding -> two stacked SimpleRNN layers -> 2-way softmax classifier.
# The first RNN returns the full sequence so the second RNN can consume it.
model = Sequential([
    Embedding(
        input_dim=maxfeatures,
        output_dim=Embedding_dimintion,
        input_length=x.shape[1],
    ),
    SimpleRNN(rnn_units, return_sequences=True),
    SimpleRNN(rnn_units),
    Dense(2, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [43]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Sanity-check the training split: features first, then targets, one per line.
print(x_train.shape, y_train.shape, sep='\n')

(34698, 166)
(34698, 2)


array([[ True, False],
       [False,  True],
       [ True, False],
       ...,
       [ True, False],
       [False,  True],
       [False,  True]])

## 6- Training 

In [18]:
# Train on the training split for 10 epochs with mini-batches of 35 samples.
model.fit(x=x_train, y=y_train, batch_size=35, epochs=10)

Epoch 1/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 56ms/step - accuracy: 0.6242 - loss: 0.6308
Epoch 2/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 56ms/step - accuracy: 0.7690 - loss: 0.5071
Epoch 3/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 55ms/step - accuracy: 0.8059 - loss: 0.4541
Epoch 4/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 55ms/step - accuracy: 0.8378 - loss: 0.3931
Epoch 5/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 56ms/step - accuracy: 0.8365 - loss: 0.3954
Epoch 6/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 57ms/step - accuracy: 0.8573 - loss: 0.3498
Epoch 7/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 56ms/step - accuracy: 0.8190 - loss: 0.4073
Epoch 8/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 56ms/step - accuracy: 0.8820 - loss: 0.2932
Epoch 9/10
[1m992/992[

<keras.src.callbacks.history.History at 0x283d17b9420>

## 7-Testing

In [19]:
# Score the held-out samples; each row holds the model's two softmax outputs.
y_pred = model.predict(x=x_test)

[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step


In [62]:
# Show the raw per-class outputs returned by model.predict for the test split.
print(y_pred)

[[0.0353691  0.9646309 ]
 [0.02029717 0.9797029 ]
 [0.98212975 0.01787028]
 ...
 [0.08279402 0.917206  ]
 [0.76343215 0.2365678 ]
 [0.44393617 0.5560638 ]]


## 8- Evaluating Model

In [69]:

# Ground-truth labels as 0/1 integers: take the last one-hot column of y_test
# and wrap it in a single-column DataFrame.
# NOTE(review): the last column is presumably the 'Positive' indicator - verify
# against the column order produced by get_dummies.
y_testing = pd.DataFrame(y_test[:, -1].astype(int))
y_testing

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1
...,...
8670,0
8671,1
8672,1
8673,1


In [75]:
# Keep only the last output column of the predictions (as a Series), i.e. the
# score that lines up with the column used for y_testing.
y_predict = pd.DataFrame(y_pred).iloc[:, -1]
y_predict

0       0.964631
1       0.979703
2       0.017870
3       0.489267
4       0.944685
          ...   
8670    0.029721
8671    0.989850
8672    0.917206
8673    0.236568
8674    0.556064
Name: 1, Length: 8675, dtype: float32

In [76]:
# Turn the predicted scores into hard 0/1 labels with a 0.5 cut-off.
# np.where does the thresholding in one vectorized pass instead of a Python
# loop over every element; scores below 0.5 become 0, everything else 1.
# The result is cast back to the input dtype so the values stay identical to
# what the element-wise assignment produced.
scores = np.array(y_predict)
hard_labels = np.where(scores < 0.5, 0, 1).astype(scores.dtype)

y_predict = pd.DataFrame(hard_labels)

In [77]:
# Store the hard labels as integers so they compare cleanly with y_testing.
y_predict = y_predict.astype(dtype=int)
y_predict

Unnamed: 0,0
0,1
1,1
2,0
3,0
4,1
...,...
8670,0
8671,1
8672,1
8673,0


In [78]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score

# Compare the hard predictions with the ground truth on the test split.
cm = confusion_matrix(y_true=y_testing, y_pred=y_predict)
recall = recall_score(y_true=y_testing, y_pred=y_predict)
precision = precision_score(y_true=y_testing, y_pred=y_predict)
f1 = f1_score(y_true=y_testing, y_pred=y_predict)

# Report the confusion matrix followed by the three summary scores.
print(f"Confusion Matrix:\n{cm}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Confusion Matrix:
[[4027  490]
 [ 690 3468]]
Recall: 0.834054834054834
Precision: 0.876200101061142
F1 Score: 0.8546081813701331


## 9- Saving Model, Tokenizer, Cleaning func

In [83]:
import pickle

# Persist the trained model together with the fitted tokenizer as one pickled
# (model, tokenizer) tuple, so both can be restored from a single file.
with open('model_tokenizer.pkl', mode='wb') as file:
    pickle.dump((model, tokenizer), file)