In [1]:
import pandas as pd

## Get data 

In [40]:
dfcsv = pd.read_csv("../Tweets.csv")
#dfcsv means pandas dataframe from tweets csv

In [3]:
dfcsv.head(10)

Unnamed: 0,textID,text,usable_text,sentiment
0,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
1,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
2,50e14c0bb8,Soooo high,Soooo high,neutral
3,e050245fbd,Both of you,Both of you,neutral
4,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive
5,2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral
6,16fab9f95b,I really really like the song Love Story by Ta...,like,positive
7,74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative
8,04dd1d2e34,i want to go to music tonight but i lost my vo...,lost,negative
9,bbe3cbf620,test test from the LG enV2,test test from the LG enV2,neutral


In [4]:
dfcsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27476 entries, 0 to 27475
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   textID       27476 non-null  object
 1   text         27475 non-null  object
 2   usable_text  27475 non-null  object
 3   sentiment    27476 non-null  object
dtypes: object(4)
memory usage: 858.8+ KB


In [5]:
dfcsv.isnull().sum()

textID         0
text           1
usable_text    1
sentiment      0
dtype: int64

# Data cleansing

### Remove null values and irrelevant columns

In [6]:
dfcsv.drop(["text","textID"],axis=1,inplace=True) # remove text and text ID columns

In [7]:
dfcsv.dropna(inplace=True) # removes rows with null values

In [8]:
dfcsv.isnull().sum()

usable_text    0
sentiment      0
dtype: int64

In [9]:
dfcsv.head(10)

Unnamed: 0,usable_text,sentiment
0,http://www.dothebouncy.com/smf - some shameles...,neutral
1,fun,positive
2,Soooo high,neutral
3,Both of you,neutral
4,Wow... u just became cooler.,positive
5,"as much as i love to be hopeful, i reckon the ...",neutral
6,like,positive
7,DANGERously,negative
8,lost,negative
9,test test from the LG enV2,neutral


In [10]:
import tensorflow as tf
import numpy as np
import re

2022-10-25 16:10:39.176470: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-25 16:10:39.438990: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-25 16:10:39.499322: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-25 16:10:39.499352: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [11]:
import re
def clean_data(text):
    # Removing mentions
    text = re.sub(r"/b@","",text)
        
    # Remove links
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    
    # Remove digits & strings of digits
    text = re.sub(r"([0-9])","",text)
    
    # Remove new lines
    text = re.sub(r"\n","",text)
    
    # Removes all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    
    return text

In [12]:
print(clean_data("Magic man @Adam"))
print(clean_data("#coding"))
print(clean_data("adam 0743"))
print(clean_data("Here is a link :http://localhost:8888/notebooks/Model/Keras%20Model%20using%20pandas.ipynb#Evaluate-model"))

Magic man 
coding
adam 
Here is a link 


In [13]:
dfcsv["usable_text"] = dfcsv["usable_text"].apply(clean_data)

In [14]:
dfcsv.head(10)

Unnamed: 0,usable_text,sentiment
0,some shameless plugging for the best Rangers...,neutral
1,fun,positive
2,Soooo high,neutral
3,Both of you,neutral
4,Wow u just became cooler,positive
5,as much as i love to be hopeful i reckon the c...,neutral
6,like,positive
7,DANGERously,negative
8,lost,negative
9,test test from the LG enV,neutral


### Converting sentiment string into integer

In [15]:
def sentiment_encoder(data):
    if data == "positive":
        data = 1
    elif data == "negative":
        data = 0
    elif data == "neutral":
        data = 0.5
    else:
        data = None
        
    return data

In [16]:
dfcsv.isnull().sum()

usable_text    0
sentiment      0
dtype: int64

In [17]:
dfcsv["sentiment"] = dfcsv["sentiment"].apply(sentiment_encoder)

In [18]:
dfcsv.head(10)

Unnamed: 0,usable_text,sentiment
0,some shameless plugging for the best Rangers...,0.5
1,fun,1.0
2,Soooo high,0.5
3,Both of you,0.5
4,Wow u just became cooler,1.0
5,as much as i love to be hopeful i reckon the c...,0.5
6,like,1.0
7,DANGERously,0.0
8,lost,0.0
9,test test from the LG enV,0.5


## Split data into training and testing data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Test size is 30% of total data from data set
x_train, x_test, y_train, y_test = train_test_split(dfcsv["usable_text"].values, dfcsv["sentiment"].values,
                                                    test_size=0.3)

# Checking the sentiment has been split up appropriately 
print('sentiment Text: ', x_train[2])
print('sentiment: ', y_train[2])

sentiment Text:  bad
sentiment:  0.5


## Tokenizing data

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
tokenizer = Tokenizer(num_words=100000000)
totalWords = [*x_train, *x_test]
tokenizer.fit_on_texts(totalWords)


In [23]:
wordindex = tokenizer.word_index
VocabSize= len(wordindex)
print("The number of unique words is: ", VocabSize)

The number of unique words is:  17881


In [24]:
train_seq = tokenizer.texts_to_sequences(x_train) # Compares all the data to vocab and assigns a integer reference for the vocab 
test_seq = tokenizer.texts_to_sequences(x_test)
print(test_seq)

[[212], [516, 1560, 389, 197], [475, 166, 207, 5607, 206, 648, 2, 1207, 308, 3474], [1312], [5, 3534, 94, 428, 25, 139, 23, 4, 578, 14475], [166, 261, 1, 58, 100, 12, 76, 472, 319, 3, 1893, 1, 42, 3, 2115, 140, 415, 225, 18, 109, 98, 418, 1105], [14476], [8, 364, 23, 665, 48, 19, 1568, 83, 3393, 378, 122, 8, 2, 4069], [1800, 28, 66, 136, 3, 726, 62, 2, 3, 260, 28, 14477, 53], [421], [1401, 679, 15, 1085], [12, 15, 5, 126], [46, 2, 23, 176, 1150], [1, 29, 5, 14478, 65, 354, 4137, 3339, 2835, 333, 74, 16, 134], [158, 7, 156, 1077], [4, 63, 916], [14479], [355], [1, 39, 60, 88, 2, 493, 8, 2, 6, 3219, 247, 14, 81, 11, 4, 209, 7, 1, 59, 185, 72, 126, 2, 42, 8], [458], [831, 13], [160, 243, 61, 193, 14480], [416, 144], [220, 92, 1790, 14481, 3058], [12, 240, 1939, 1083, 48], [786], [409, 3, 83, 670, 763, 28, 108], [1, 109, 664, 3, 670, 769, 531, 14482, 8, 226, 1113, 100, 1, 6580, 10, 2, 5, 531, 634], [30], [9, 71, 632, 2, 72, 4472, 107, 1621, 80, 6, 42, 8, 544, 1, 150, 49, 2, 292, 6, 19, 447

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [26]:
# PAD sequences so they are all the same length as the inputs must all be of same length

train_padded = pad_sequences(train_seq)

print(train_padded[0])

# Check size of each padded sequence

TrainingPadShape = train_padded.shape[1] # for use when defining the model shape 
print(TrainingPadShape)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 124 143 150 395  18  19   1 246 143 150  55 867   8]
31


In [27]:
test_padded = pad_sequences(test_seq,maxlen=TrainingPadShape)
print(test_padded.shape[1])

31


## Constructing the model

In [28]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model

In [29]:
DimentionEmbeddingLayer = 10
DimentionLSTMLayer = 30

inputLayer = Input(shape=(TrainingPadShape, ))
modelLayers = Embedding(VocabSize +1, DimentionEmbeddingLayer)(inputLayer) # Vocabsize + 1 as indexing starts from 0
modelLayers = LSTM(DimentionLSTMLayer, return_sequences=True)(modelLayers)
modelLayers = GlobalMaxPooling1D()(modelLayers)
modelLayers = Dense(32, activation="relu")(modelLayers)
modelLayers = Dense(1, activation="sigmoid")(modelLayers)

model = Model(inputLayer, modelLayers)

2022-10-25 16:12:14.974763: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-25 16:12:14.974989: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-25 16:12:14.975023: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (adamo-Surface-Pro-7): /proc/driver/nvidia/version does not exist
2022-10-25 16:12:14.975570: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Compiling the model 

In [30]:
model.compile(optimizer="RMSprop",loss="binary_crossentropy",metrics=["accuracy"])

## Fitting data

In [31]:
from keras.callbacks import ModelCheckpoint

### Fitting data once

In [32]:
fittingdata= model.fithistory = model.fit(train_padded, y_train, epochs=1,validation_data=(test_padded, y_test))



### Fitting data when finalised everything else

In [None]:
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model.fit(train_padded, y_train, epochs=70,validation_data=(test_padded, y_test),callbacks=[checkpoint1])

## Evaluate Model

In [33]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 31)]              0         
                                                                 
 embedding (Embedding)       (None, 31, 10)            178820    
                                                                 
 lstm (LSTM)                 (None, 31, 30)            4920      
                                                                 
 global_max_pooling1d (Globa  (None, 30)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                992       
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                             

In [34]:
score = best_model.evaluate(test_padded, y_test)
print("Model Loss: ", score[0])
print("Model accuracy", score[1])

NameError: name 'best_model' is not defined

## Predicting data

In [37]:
def predict_sentiment(text):
    text_sequence = tokenizer.texts_to_sequences(text)

    text_padded = pad_sequences(text_sequence, maxlen=TrainingPadShape)
    
    predicted_sentiment = model.predict(text_padded)
    if predicted_sentiment < 0.3:
        print("Negative")
    elif predicted_sentiment > 0.6:
        print("Positive")
    else:
        print("Neutral")
    print(predicted_sentiment)

In [38]:
text = ['Drone war 1']
predict_sentiment(text)

Neutral
[[0.4655835]]
