<a href="https://colab.research.google.com/github/SilahicAmil/NLP-NLTK/blob/main/More_Sentiment_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# More Sentiment Practice

In [29]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Tensorflow
import tensorflow as tf

# Keras
import keras
from keras.layers import Embedding, TextVectorization
from keras import models

In [None]:
# Import Datasets
train_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/test.csv")

# Visualize Datasets

In [None]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Preprocess (train and test)

In [None]:
train_df = train_df.drop("selected_text", 1)


KeyError: ignored

In [None]:
train_df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [8]:
train_df = train_df[train_df.sentiment != 'neutral']


In [9]:
train_df.head()

Unnamed: 0,textID,text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,positive


In [11]:
mapping = {"positive": 1, "negative": 0}

processed_train_df = train_df.replace({'sentiment': mapping})

In [13]:
processed_train_df.head()

Unnamed: 0,textID,text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0
2,088c60f138,my boss is bullying me...,0
3,9642c003ef,what interview! leave me alone,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,1


# Shuffle Training Data

In [14]:
shuffled_train_df = processed_train_df.sample(frac=1, random_state=42)

In [15]:
shuffled_train_df.head()

Unnamed: 0,textID,text,sentiment
14813,508435b32d,thanks will try to behave,1
18134,de3833a81d,Today is lame because I am not in Orlando I a...,0
9074,85a65f0656,hahahaha! i laughed my **** off just now. thanks,1
23126,d70ed4038d,Gahh ! This weather sucksss !,0
16935,1b4d9119e9,Fallen in love with enter shikari again. Might...,1


# Creating Validation Set (train_test_split)

In [17]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(shuffled_train_df["text"].to_numpy(),
                                                                            shuffled_train_df["sentiment"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [18]:
len(train_sentences), len(val_sentences)

(14726, 1637)

# Convert Text > Numbers (vectorization)

In [21]:
# Average len
round(sum([len(i.split()) for i in train_sentences]))/len(train_sentences)

13.318823848974603

In [22]:
MAX_VOCAB = 10000
MAX_LEN = 13

txt_vect = TextVectorization(max_tokens=MAX_VOCAB,
                             output_mode="int",
                             output_sequence_length=MAX_LEN)

In [25]:
txt_vect.adapt(train_sentences)

# Embedding Layer

In [26]:
embedding = tf.keras.layers.Embedding(input_dim=MAX_VOCAB,
                                      output_dim=128,
                                      input_length=MAX_LEN)

In [27]:
random_sentence = random.choice(train_sentences)
random_sentence

'wow its follow friday and i havent tweeted... Fail. And nobody has followed me today  dble fail. *suicide*'

In [28]:
embedded_sentence = embedding(txt_vect([random_sentence])) # Needs a list of the sentence
embedded_sentence

<tf.Tensor: shape=(1, 13, 128), dtype=float32, numpy=
array([[[ 0.03068854,  0.04627648, -0.00137649, ..., -0.0451445 ,
          0.03069801,  0.04735099],
        [ 0.02906448,  0.01446975,  0.01299318, ..., -0.01092215,
         -0.02635449, -0.00631744],
        [-0.00657355,  0.04809314, -0.00089008, ..., -0.03809202,
          0.04431906, -0.04825329],
        ...,
        [-0.0343985 , -0.01915753,  0.02527075, ...,  0.04830853,
          0.02712614,  0.02993912],
        [ 0.04143072, -0.03321661,  0.0093466 , ..., -0.00721288,
         -0.01814216,  0.04733082],
        [ 0.04422525,  0.0150501 ,  0.04147685, ...,  0.04871105,
         -0.02299867, -0.0399917 ]]], dtype=float32)>

# Baseline Model (SkLearn Pipeline)

In [30]:
# Creating model
model_1 = Pipeline([
  ("Tfidf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

In [32]:
# Compile model
model_1.fit(train_sentences,
            train_labels)

Pipeline(steps=[('Tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

# Evaluate/ Predictions

In [34]:
model_1.score(val_sentences, val_labels)

0.8643860720830788

# Simple Dense Model (Functional API)

In [35]:
# Inputs
inputs = keras.layers.Input(shape=(1, ), dtype="string")

# Vectorization 
x = txt_vect(inputs)

# Embedding
x = embedding(x)

# Average Pooling
x= keras.layers.GlobalAveragePooling1D()(x)

# Dense
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

# Model Creation
model_2 = keras.Model(inputs, outputs)

# Compile and Fit Model

In [36]:
model_2.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [39]:
hist_2 = model_2.fit(train_sentences,
            train_labels,
            epochs=5,
            validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate/ Predictions

In [40]:
loss, accuracy = model_2.evaluate(val_sentences, val_labels)
print(f"Loss: {loss}\nAccuracy: {accuracy}")

Loss: 0.4888102412223816
Accuracy: 0.8259010314941406


In [42]:
model_2_preds = model_2.predict(val_sentences)
model_2_preds

array([[9.7490573e-01],
       [9.9837399e-01],
       [7.4700356e-02],
       ...,
       [5.0345097e-05],
       [9.4456422e-01],
       [1.6344756e-02]], dtype=float32)