# 2) Project B: Sentiment Analysis using ANN architecture

In [42]:
#Import required Libraries

import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

A) Perform text processing - preprocessing and Word Embeddings

In [43]:
#A1) Import your dataset (sentiment_analysis.csv) 

# Import dataset
ds = pd.read_csv('sentiment_analysis.csv', encoding="utf8")

# Display your dataset, show the full column to view all attributes
pd.set_option('display.max_colwidth', None)
ds.head()

Unnamed: 0,Text,Sentiment,Source,Date/Time,User ID,Location,Confidence Score
0,I love this product!,Positive,Twitter,2023-06-15 09:23:14,@user123,New York,0.85
1,The service was terrible.,Negative,Yelp Reviews,2023-06-15 11:45:32,user456,Los Angeles,0.65
2,This movie is amazing!,Positive,IMDb,2023-06-15 14:10:22,moviefan789,London,0.92
3,I'm so disappointed with their customer support.,Negative,Online Forum,2023-06-15 17:35:11,forumuser1,Toronto,0.78
4,Just had the best meal of my life!,Positive,TripAdvisor,2023-06-16 08:50:59,foodie22,Paris,0.88


In [44]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Text              96 non-null     object 
 1   Sentiment         96 non-null     object 
 2   Source            96 non-null     object 
 3   Date/Time         96 non-null     object 
 4   User ID           96 non-null     object 
 5   Location          96 non-null     object 
 6   Confidence Score  96 non-null     float64
dtypes: float64(1), object(6)
memory usage: 5.4+ KB


In [45]:
#A We need only 2 columns for this works which are Text and Sentiment
ds = ds.drop(['Source', 'Date/Time', 'User ID', 'Location', 'Confidence Score'], axis = 1)

In [46]:
ds.head()

Unnamed: 0,Text,Sentiment
0,I love this product!,Positive
1,The service was terrible.,Negative
2,This movie is amazing!,Positive
3,I'm so disappointed with their customer support.,Negative
4,Just had the best meal of my life!,Positive


In [47]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       96 non-null     object
 1   Sentiment  96 non-null     object
dtypes: object(2)
memory usage: 1.6+ KB


In [48]:
ds['Text']

0                                               I love this product!
1                                          The service was terrible.
2                                             This movie is amazing!
3                   I'm so disappointed with their customer support.
4                                 Just had the best meal of my life!
                                   ...                              
91      Just had the most amazing vacation! I can't wait to go back.
92    The food at this restaurant was awful. Never going back again!
93        I can't stop listening to this song. It's my new favorite!
94                Their website is so confusing and poorly designed.
95    I had an incredible experience at the theme park. So much fun!
Name: Text, Length: 96, dtype: object

In [49]:
#Perform necessary pre-processing activities on textual data in the Text column

#preprocess text
# Load stopwords list

from nltk.corpus import stopwords
import re
import string

stopwords_list = set(stopwords.words('english'))

# Word tokenize
ds['Text'] = ds['Text'].apply(word_tokenize)

# Remove certain stopwords and lowercase each word
ds['Text'] = ds['Text'].apply(lambda x: [word.lower() for word in x if word.lower() not in stopwords_list])

# Convert the list of tokens back to sentences
ds['Text'] = ds['Text'].apply(lambda x: " ".join(x))

# Remove punctuations
ds['Text'] = [re.sub(r'[!@#$%^&*()_+<>?:"{}|[];~`|]', " ", text) for text in ds['Text']]
ds['Text'] = [re.sub(r'[^a-zA-Z]',' ', text) for text in ds['Text']]

# Remove numbers
ds['Text'] = [re.sub(r'[0-9]', " ", text) for text in ds['Text']]

# Remove non-ASCII characters
ds['Text'] = [re.sub(r'[^\x00-\x7f]', " ", text) for text in ds['Text']]


In [50]:
pd.set_option('display.max_colwidth', None)
ds.head()

Unnamed: 0,Text,Sentiment
0,love product,Positive
1,service terrible,Negative
2,movie amazing,Positive
3,m disappointed customer support,Negative
4,best meal life,Positive


In [51]:
ds.sample(5)

Unnamed: 0,Text,Sentiment
15,website confusing poorly designed,Negative
53,terrible experience delivery service late unprofessional,Negative
73,customer service store outstanding truly care customers,Positive
64,roller coaster theme park thrilling experience must try,Positive
89,book made feel inspired highly recommended,Positive


In [95]:
#AConvert categorical data in the Sentiment column into numerical representation

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
ds['Sentiment'] = le.fit_transform(df['Sentiment'])
ds.head()

Unnamed: 0,Text,Sentiment
0,love product,1
1,service terrible,0
2,movie amazing,1
3,disappointed customer support,0
4,best meal life,1


In [96]:
#define input and output
x = ds['Text']
y = ds['Sentiment']
y

0     1
1     0
2     1
3     0
4     1
     ..
91    1
92    0
93    1
94    0
95    1
Name: Sentiment, Length: 96, dtype: int32

In [97]:
x

0                                  love product
1                              service terrible
2                                 movie amazing
3                 disappointed customer support
4                                best meal life
                        ...                    
91           amazing vacation ca n wait go back
92       food restaurant awful never going back
93        ca n stop listening song new favorite
94            website confusing poorly designed
95    incredible experience theme park much fun
Name: Text, Length: 96, dtype: object

In [98]:
#A5) Use tf-idf technique to perform word embedding on textual data in the Text column

# Vectorize the text data
vect = TfidfVectorizer().fit(x)
x = vect.transform(x)

# Convert the sparse matrix to a dense array
x_array = x.toarray()
x_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [101]:
#A7) Split the ready data into Training and Test sets

# Split dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_array, y, test_size=0.2)

In [102]:
x_train.shape

(76, 229)

In [103]:
x_test.shape

(20, 229)

B) Build a classification prediction model using ANN model

In [104]:
#B3) Build an ANN classification model
# Build the model - ANN

model = Sequential() #call in the keras
model.add(Dense(128 , input_dim = 229, activation='relu')) #hidden layer one | dense=fully connected
model.add(Dense(64, activation='relu')) #hidden layer two
model.add(Dense(32, activation='relu')) #hidden layer three
model.add(Dense(2, activation = 'sigmoid')) #output 

# Prints out the model details
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               29440     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 2)                 66        
                                                                 
Total params: 39842 (155.63 KB)
Trainable params: 39842 (155.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [105]:
#specify optimezer, loss and metrics
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['acc'])

In [106]:
#train model
history = model.fit(x_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [108]:
#Evaluate train set
_, train_accuracy = model.evaluate(x_train, y_train)
print('Train Accuracy: ', train_accuracy)

Train Accuracy:  1.0


In [107]:
#Evaluate test set
_, accuracy = model.evaluate(x_test, y_test)
print('Test Accuracy: ', accuracy)

Test Accuracy:  0.8500000238418579


B4) Compare and discuss the training accuracy and testing accuracy. What hyperparameter can be used to tune the model? Give at least three. 

The accuracy score for train data set = 1.0
The accuracy score for test data set = 0.85

For this dataframe, an accuracy score of 85% gained by the test data can be considered as acceptable. However, there is a room for improvement, where we can reduce this by 
-adding more layer
-adjusting the number of nodes in every layer
-adding batch size and learning rate
-and adjusting the epochs

However, we have to take our machine source into account, as more dense layer added will consume a lot of computational power. This might/could be prevented by intorducing partially dense layer to the model. 

In [117]:
#B5) Use the model to predict the sentiment of the text :
#"The quality of customer service was exceptionally poor"

#predict for new data
prediction = model.predict(vect.transform(['The quality of customer service was exceptionally poor']).toarray())
print('Prediction: ', prediction)

#index of large value
idx = np.argmax(prediction)

#print index
print('Index: ', idx)
print('Class: ', le.inverse_transform([idx]))

Prediction:  [[0.48935848 0.4843618 ]]
Index:  0
Class:  ['Negative']


In [118]:
#B5) Use the model to predict the sentiment of the text :
#"The quality of customer service wasexceptionally poor"

#predict for new data
prediction = model.predict(vect.transform(['This book made me feel inspired. Highly recommended!']).toarray())
print('Prediction: ', prediction)

#index of large value
idx = np.argmax(prediction)

#print index
print('Index: ', idx)
print('Class: ', le.inverse_transform([idx]))

Prediction:  [[0.32663974 0.6438481 ]]
Index:  1
Class:  ['Positive']
