<a href="https://colab.research.google.com/github/Turkinass/Sentiment_Analyzer/blob/main/Sentiment_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SIC Capstone Project - The Unit (Group 4)
# **Sentiment Analyzer**

### Importing required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from google.colab import drive

In [None]:
# Load the airline tweets dataset straight from the project's GitHub repository.
df_master = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Turkinass/Sentiment_Analyzer/main/Tweets.csv'
)


In [None]:
# Peek at five randomly chosen rows.
df_master.sample(n=5)

In [None]:
# Class balance of the raw sentiment column.
df_master['airline_sentiment'].value_counts()

### Data Preprocessing

In [None]:
# Count fully duplicated rows before cleaning.
n_duplicates = df_master.duplicated().sum()
print('Number of duplicated values: ', n_duplicates)

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df_master.drop_duplicates(keep='first', inplace=True)

In [None]:
# Re-count duplicated rows to confirm the clean-up worked.
n_duplicates = df_master.duplicated().sum()
print('Number of duplicated values: ', n_duplicates)

In [None]:
# First five rows of the de-duplicated frame.
df_master.head(n=5)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df_master.shape

In [None]:
# Encode the sentiment strings as integer class ids:
#   negative -> 0, neutral -> 1, positive -> 2
sentiment_conditions = [
    (df_master['airline_sentiment'] == 'negative'),
    (df_master['airline_sentiment'] == 'neutral'),
    (df_master['airline_sentiment'] == 'positive')
    ]

values = [0, 1, 2]

# np.select assigns `default` to rows that match none of the conditions. Its
# implicit default is 0, which is also the 'negative' class id, so unexpected or
# missing sentiment values would be silently labelled negative. Use a sentinel
# instead and fail loudly if any row is left unlabelled.
UNLABELLED = -1
df_master['label'] = np.select(sentiment_conditions, values, default=UNLABELLED)

unlabelled_rows = df_master['label'] == UNLABELLED
if unlabelled_rows.any():
    unexpected = df_master.loc[unlabelled_rows, 'airline_sentiment'].unique().tolist()
    raise ValueError(f'Unexpected airline_sentiment values: {unexpected}')

In [None]:
# Class balance of the encoded labels.
df_master['label'].value_counts()

In [None]:
# Keep only the columns used downstream and drop everything else.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`): passing
# the axis positionally was deprecated in pandas 1.3 and raises TypeError on
# pandas >= 2.0.
columns_to_keep = ['airline_sentiment', 'text', 'label']
df_master.drop(columns=df_master.columns.difference(columns_to_keep), inplace=True)

In [None]:
# Inspect the first 200 rows of the trimmed frame.
df_master.head(n=200)

In [None]:
# Label distribution after removing the unused columns.
df_master['label'].value_counts()

### Data Modelling

In [None]:
# Features are the raw tweet texts; the target is the integer sentiment label.
X, y = df_master['text'], df_master['label']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)



In [None]:
# Hyperparameters shared by the tokenisation, model and training cells.
epochs = 30                   # passes over the training data in model.fit
vocab_size = 10000            # input dimension of the Embedding layer
max_length = 100              # length every sequence is padded/truncated to
n_dim = 16                    # size of each embedding vector
training_size = len(X_train)  # number of training examples

In [None]:
# Build the vocabulary from the training texts only.
# `num_words=vocab_size` caps the ids the tokenizer emits so they always fit the
# Embedding(vocab_size, ...) table used by the model; without it, ids grow with
# the corpus vocabulary and can fall outside the embedding's input range.
# Words outside the kept vocabulary are replaced by <OOV>, which preserves the
# text length.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Represent each training text as a sequence of token ids.
train_sequence = tokenizer.texts_to_sequences(X_train)

# Pad with trailing zeros (and truncate at the end) so every sequence has
# exactly `max_length` ids.
train_pad = pad_sequences(train_sequence, maxlen=max_length, padding='post', truncating='post')

# Encode the test texts with the vocabulary learned on the training texts.
test_sequence = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_sequence, maxlen=max_length, padding='post', truncating='post')

In [None]:
# The target has three integer classes (0 = negative, 1 = neutral, 2 = positive),
# so the output layer needs one unit per class with a softmax, trained with
# sparse categorical cross-entropy. A single ReLU unit with binary cross-entropy
# cannot represent three classes and does not output a probability.
n_classes = 3

model = tf.keras.Sequential([
    # Map each token id to a trainable n_dim-dimensional vector.
    tf.keras.layers.Embedding(vocab_size, n_dim, input_length=max_length),
    # Drop whole embedding channels for regularisation.
    tf.keras.layers.SpatialDropout1D(0.2),
    # Collapse the sequence axis by taking the max of each channel.
    tf.keras.layers.GlobalMaxPooling1D(),
    # Per-class probabilities.
    tf.keras.layers.Dense(n_classes, activation='softmax')
])
model.compile(
    loss='sparse_categorical_crossentropy',  # labels are integer class ids, not one-hot
    optimizer='nadam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train the model; the held-out split is evaluated after every epoch and the
# per-epoch metrics are kept in `history`.
history = model.fit(
    x=train_pad,
    y=y_train,
    epochs=epochs,
    validation_data=(test_pad, y_test),
    verbose=2,
)

### The results

In [None]:
# One figure per metric: training curve first, validation curve second.
curve_specs = (
    ('accuracy', 'val_accuracy', 'Accuracy', ['Accuracy', 'Validation accuracy']),
    ('loss', 'val_loss', 'Loss', ['Loss','Validation loss']),
)

for train_key, val_key, y_label, legend_labels in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.legend(legend_labels)
    plt.show()