<a href="https://colab.research.google.com/github/SilahicAmil/NLP-NLTK/blob/main/More_Sentiment_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# More Sentiment Practice

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Tensorflow
import tensorflow as tf

# Keras
import keras
from keras.layers import Embedding, TextVectorization
from keras import models

# SpaCy
import spacy as spacy

In [2]:
# Import Datasets
train_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/test.csv")

# Visualize Datasets

In [3]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Preprocess (train and test)

In [5]:
train_df = train_df.drop("selected_text", 1)


  """Entry point for launching an IPython kernel.


In [6]:
train_df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [7]:
train_df = train_df[train_df.sentiment != 'neutral']


In [8]:
train_df.head()

Unnamed: 0,textID,text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,positive


In [9]:
mapping = {"positive": 1, "negative": 0}

processed_train_df = train_df.replace({'sentiment': mapping})

In [10]:
processed_train_df.head()

Unnamed: 0,textID,text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0
2,088c60f138,my boss is bullying me...,0
3,9642c003ef,what interview! leave me alone,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,1


# Shuffle Training Data

In [11]:
shuffled_train_df = processed_train_df.sample(frac=1, random_state=42)

In [12]:
shuffled_train_df.head()

Unnamed: 0,textID,text,sentiment
14813,508435b32d,thanks will try to behave,1
18134,de3833a81d,Today is lame because I am not in Orlando I a...,0
9074,85a65f0656,hahahaha! i laughed my **** off just now. thanks,1
23126,d70ed4038d,Gahh ! This weather sucksss !,0
16935,1b4d9119e9,Fallen in love with enter shikari again. Might...,1


# Creating Validation Set (train_test_split)

In [13]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(shuffled_train_df["text"].to_numpy(),
                                                                            shuffled_train_df["sentiment"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [14]:
len(train_sentences), len(val_sentences)

(14726, 1637)

# Convert Text > Numbers (vectorization)

In [15]:
# Average len
round(sum([len(i.split()) for i in train_sentences]))/len(train_sentences)

13.318823848974603

In [16]:
MAX_VOCAB = 10000
MAX_LEN = 13

txt_vect = TextVectorization(max_tokens=MAX_VOCAB,
                             output_mode="int",
                             output_sequence_length=MAX_LEN)

In [17]:
txt_vect.adapt(train_sentences)

# Embedding Layer

In [18]:
embedding = tf.keras.layers.Embedding(input_dim=MAX_VOCAB,
                                      output_dim=128,
                                      input_length=MAX_LEN)

In [19]:
random_sentence = random.choice(train_sentences)
random_sentence

'I hate it when my sweetie has a bad day  http://tinyurl.com/lr22dj'

In [20]:
embedded_sentence = embedding(txt_vect([random_sentence])) # Needs a list of the sentence
embedded_sentence

<tf.Tensor: shape=(1, 13, 128), dtype=float32, numpy=
array([[[ 0.03180913, -0.04834057, -0.00864853, ...,  0.03412869,
         -0.00952818, -0.03271765],
        [ 0.03941114, -0.03083943, -0.04216595, ..., -0.03190871,
          0.00053363,  0.02262882],
        [ 0.03658542, -0.04219136,  0.01568412, ..., -0.04828456,
          0.03594631, -0.03655627],
        ...,
        [-0.01807171,  0.00894552,  0.02736989, ..., -0.00441159,
         -0.0047582 , -0.02836152],
        [-0.00515107,  0.04362699,  0.0119793 , ...,  0.03096518,
         -0.01954148,  0.00785927],
        [-0.00515107,  0.04362699,  0.0119793 , ...,  0.03096518,
         -0.01954148,  0.00785927]]], dtype=float32)>

# Baseline Model (SkLearn Pipeline)

In [21]:
# Creating model
model_1 = Pipeline([
  ("Tfidf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

In [22]:
# Compile model
model_1.fit(train_sentences,
            train_labels)

Pipeline(steps=[('Tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

# Evaluate/ Predictions

In [23]:
model_1.score(val_sentences, val_labels)

0.8643860720830788

# Simple Dense Model (Functional API)

In [24]:
# Inputs
inputs = keras.layers.Input(shape=(1, ), dtype="string")

# Vectorization 
x = txt_vect(inputs)

# Embedding
x = embedding(x)

# Average Pooling
x= keras.layers.GlobalAveragePooling1D()(x)

# Dense
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

# Model Creation
model_2 = keras.Model(inputs, outputs)

# Compile and Fit Model

In [25]:
model_2.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [26]:
hist_2 = model_2.fit(train_sentences,
            train_labels,
            epochs=5,
            validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate/ Predictions

In [27]:
loss, accuracy = model_2.evaluate(val_sentences, val_labels)
print(f"Loss: {loss}\nAccuracy: {accuracy}")

Loss: 0.3411526679992676
Accuracy: 0.8448381423950195


In [28]:
model_2_preds = model_2.predict(val_sentences)
model_2_preds

array([[0.9673082 ],
       [0.96872306],
       [0.10746039],
       ...,
       [0.00221068],
       [0.7766442 ],
       [0.11417459]], dtype=float32)

# SpaCy Testing



In [29]:
import spacy

In [30]:
nlp_md = spacy.load("en_core_web_md")



In [54]:
text = shuffled_train_df["text"]
text = text.to_string()

In [58]:
doc1 = list(nlp_md(text))


In [59]:
for token in doc1:
  print(token)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


14575
   
wow
tiz
almost
midnite
o_O
bedtime
for
me
!
!
ha
...


3043
            
wants
to
go
out
tonight
but
ca
nt
get
home


2866
         
Wishing
all
MOMs
a
very
Happy
Mother`s
Day
!
!


4897
    
Justin`s
blanket
shed
black
lint
all
over
my
w
...


19113
   
1
more
goal
,
Gila
!
!
But
his
last
game
is
again
...


13978
   
Hmm
,
$
25
to
see
the
Decemberists
,
but
I
have
t
...


23149
   
HAPPY
MOTHER
DAY
TO
ALL
THE
MOTHER`S
IN
THE
EN
...


3284
     
ScREW
MY
PHONE
.
ITS
BROKEN
.
DONT
BOTHER
TEXTING
.


16103
    
Rest
is
important
,
but
like
everything
else
d
...


10963
    
Kennedy
was
re
-
injured
at
RAW
on
Monday
.
He`s
...


23834
                
Wearing
glasses
gives
me
a
headache
.


11871
    
wtf
kinda
best
friend
am
I
?
I
*
still
*
haven`t
...


4715
                 
de
wereld
need
more
ppl
like
you
!
;)


1082
                                       
has
heart
burn


16803
    
ze
Franz
has
not
friend