<a href="https://colab.research.google.com/github/SilahicAmil/NLP-NLTK/blob/main/Sentiment_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Practice


In [46]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Tensorflow
import tensorflow as tf

# Keras
import keras
from keras.layers import Embedding, TextVectorization


In [2]:
# Import Dataset
train_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Tweet_Sentiment/tweet-sentiment-extraction/test.csv")

## Visualize Datasets


In [3]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Preprocess (train and test data)

In [5]:
train_df = train_df.drop("selected_text", 1)
train_df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [9]:
# Removing neutral
train_df = train_df[train_df.sentiment != "neutral"]

In [10]:
# Mapping the pos and neg to 0 and 1
mapping = {"positive": 1, "negative": 0}

train_df_copy = train_df.replace({'sentiment': mapping})
train_df_copy.head()

Unnamed: 0,textID,text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0
2,088c60f138,my boss is bullying me...,0
3,9642c003ef,what interview! leave me alone,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,1


## Shuffle Data (training set)

In [11]:
shuffled_train_df = train_df_copy.sample(frac=1, random_state=42)

In [12]:
shuffled_train_df[:5]

Unnamed: 0,textID,text,sentiment
14813,508435b32d,thanks will try to behave,1
18134,de3833a81d,Today is lame because I am not in Orlando I a...,0
9074,85a65f0656,hahahaha! i laughed my **** off just now. thanks,1
23126,d70ed4038d,Gahh ! This weather sucksss !,0
16935,1b4d9119e9,Fallen in love with enter shikari again. Might...,1


In [13]:
train_df_copy.sentiment.value_counts()

1    8582
0    7781
Name: sentiment, dtype: int64

## Create validation set (train test split)

In [15]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(shuffled_train_df["text"].to_numpy(),
                                                                            shuffled_train_df["sentiment"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [17]:
len(train_sentences), len(val_sentences)

(14726, 1637)

In [18]:
train_sentences[:5]

array(['Great find   http://amanita-design.net/samorost-1/',
       ' everyone loves u sarah not just the tweeters! the today show couldnt stop raving about your beautifulness...',
       'Hello dark hair! Well, my plans for today just got cancelled  make some better ones?!',
       'just watched devil wears prada, and i want to live in new york city more than ever. why must i be so poor',
       'Is feeling really bad about goofin` on  not knowing she really wasn`t at the meeting! Dang I`m sorry! Me & my big `ol mouth'],
      dtype=object)

In [19]:
val_sentences[:5]

array([' good question. Nepal PM declaring to resign and actually resigning is two very different things .',
       '  i just saw your performance on the ellen show. we`re so behind in australia  you were AMAZING and have such a wonderful voice!',
       ' nooo... as a font connoisseur i can totally relate dude, my heart goes out to you',
       ' i really am going to miss you',
       'Ahhh I`m engulfed in shooooes. Who told me to buy so maany **** shoes'],
      dtype=object)

## Convert Text > Numbers (Vectorization) (Train Sentences)

Standardize each example (usually lowercasing + punctuation stripping)

Split each example into substrings (usually words)

Recombine substrings into tokens (usually ngrams)

Index tokens (associate a unique int value with each token)

Transform each example using this index, either into a vector of ints or a dense float vector.

In [28]:
round(sum([len(i.split()) for i in train_sentences]))/len(train_sentences)


13.318823848974603

In [29]:
MAX_VOCAB_LEN = 10000
MAX_LEN = 13

txt_vect = TextVectorization(max_tokens=MAX_VOCAB_LEN,
                             output_mode="int",
                             output_sequence_length=MAX_LEN)

In [30]:
# Fitting txt_vect on training set

txt_vect.adapt(train_sentences)

In [34]:
# Unique words
word_in_vocab = txt_vect.get_vocabulary()
top_5 = word_in_vocab[:5]
bottom_5 = word_in_vocab[-5:]

top_5

['', '[UNK]', 'i', 'to', 'the']

In [35]:
bottom_5

['reschedule', 'rerun', 'reripped', 'rereading', 'reread']

In [38]:
# Create an Embedding layer (TF)

embedding = tf.keras.layers.Embedding(input_dim=MAX_VOCAB_LEN,
                                      output_dim=128,
                                      input_length=MAX_LEN)

In [41]:
rand_sent = random.choice(train_sentences)
print(f"{rand_sent}")

hi guys just doing da usael notmuch really! <3 sad me


In [43]:
sample_embed = embedding(txt_vect([rand_sent]))
sample_embed

<tf.Tensor: shape=(1, 13, 128), dtype=float32, numpy=
array([[[-0.01315792,  0.03495939, -0.01433147, ..., -0.00439936,
          0.02953706,  0.04016193],
        [-0.00876824,  0.01592309, -0.02381196, ..., -0.00058044,
         -0.03793532,  0.00801433],
        [-0.03511863, -0.04569632, -0.00376624, ..., -0.01324267,
          0.03347857,  0.00536916],
        ...,
        [ 0.00697124, -0.04660774, -0.03552817, ...,  0.02475018,
         -0.01090373,  0.04079973],
        [-0.00243551, -0.01333498,  0.02051303, ...,  0.024986  ,
         -0.01131428,  0.03434305],
        [-0.00243551, -0.01333498,  0.02051303, ...,  0.024986  ,
         -0.01131428,  0.03434305]]], dtype=float32)>

In [47]:
# Baseline model with Sklearn (Tfidf and Multinomial)

model_1 = Pipeline([
  ("tfidf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

In [48]:
# Fit model
model_1.fit(train_sentences,
            train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

## Evaluate and make preds


In [49]:
model_1.score(val_sentences, val_labels)

0.8643860720830788

In [50]:
model_1.predict(val_sentences)

array([1, 1, 0, ..., 0, 0, 0])