<a href="https://colab.research.google.com/github/RihabTsi/Project-Data-Science/blob/main/IMBD_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import re


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from textblob import TextBlob
from textblob import Word
import nltk

# Setup
!pip install -q wordcloud
import wordcloud

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Read file with labels

In [4]:
df = pd.read_csv("/content/drive/MyDrive/ESG_BIG_DATA/Machine Learning/TP1 Critique IMBS/IMBD.csv")
df.head()

Unnamed: 0,Critiques,Label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [5]:
df.rename(columns = {'Critiques':'text'}, inplace = True)


In [6]:
df

Unnamed: 0,text,Label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [7]:
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
train_ds=df[:17500]
train_ds.text[0]

'A delightful story about two evacuees, has been turned into a nice little film, by the BBC. Most children who like a good story will enjoy this. The characters are played really well by a very good cast. Not sure whether our American friends will appreciate it, but they do get a mention, as Aunty Lou runs off with a gorgeous American soldier.'

In [9]:
test_ds = df[17500:]
test_ds = test_ds.reset_index(drop=True)
test_ds.head()

Unnamed: 0,text,Label
0,This movie isn't worth going to the theaters t...,0
1,This movie was not very good in my opinion. Wh...,0
2,"""Lights of New York"" originally started out as...",1
3,Admirable but weak James Bond film mainly beca...,0
4,Quentin in my opinion has written and directed...,0


# Embedding **layer**

In [10]:
# Embed a 1,000 words/tokens vocabulary into 5 dimensions.
embedding_layer = tf.keras.layers.Embedding(1000, 5)
# We create a random list of three integers and use it as input for the embedding
# layer and take a look at the output.
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

# The result is a collection of three 5-dimensional vectors. Each element in the 
# original list has been replaced by a vector of 5 floating points.

array([[-0.02673417, -0.00708727,  0.01658465, -0.04912126,  0.01368606],
       [ 0.01869703,  0.04622107, -0.00688421,  0.02454608,  0.01546058],
       [ 0.03208982, -0.03843373,  0.03708104,  0.02728727,  0.00488452]],
      dtype=float32)

In [11]:
# The shape of the trainable_variables attribute confirms what we just explained.
embedding_layer.trainable_variables[0].shape

TensorShape([1000, 5])

# **Pre-Processing** 

In [12]:
# We start by downloading spacy for the english language
!python -m spacy download en_core_web_sm -q

2022-12-03 20:36:02.332386: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 12.8 MB 5.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [14]:
# Import Stop words 
from spacy.lang.en.stop_words import STOP_WORDS

In [15]:
#Cleaning TEXT
def sentence_rge(data):
    data=re.sub('<[^>]*>','',data) # Html taglerini kaldırma
    emoji=re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',data) #Emojileri bulma
    data=re.sub('[\W]+',' ',data.lower()) +\
                ' '.join(emoji).replace('-','')
    return data

In [16]:
print("Text before cleaning:: ",train_ds.text[0])
print("Text after cleaning:: ", train_ds.text.apply(sentence_rge)[0])


Text before cleaning::  A delightful story about two evacuees, has been turned into a nice little film, by the BBC. Most children who like a good story will enjoy this. The characters are played really well by a very good cast. Not sure whether our American friends will appreciate it, but they do get a mention, as Aunty Lou runs off with a gorgeous American soldier.
Text after cleaning::  a delightful story about two evacuees has been turned into a nice little film by the bbc most children who like a good story will enjoy this the characters are played really well by a very good cast not sure whether our american friends will appreciate it but they do get a mention as aunty lou runs off with a gorgeous american soldier 


In [17]:
#Clean the columns text
train_ds['text_clean']=train_ds["text"].apply(sentence_rge)
# Remove all non alphanumeric characters except whitespaces
train_ds["text_clean"] = train_ds['text_clean'].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
# remove double spaces and spaces at the beginning and end of strings
train_ds["text_clean"] = train_ds['text_clean'].apply(lambda x: x.replace(" +"," ").lower().strip())
# remove stop words and replace everyword with their lemma
train_ds["text_clean"] = train_ds["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

In [18]:
train_ds

Unnamed: 0,text,Label,text_clean
0,"A delightful story about two evacuees, has bee...",1,delightful story evacuee turn nice little film...
1,"Having already seen the original ""Jack Frost"",...",0,having original jack frost think jack frost 2 ...
2,I have nothing but praise for this movie and c...,1,praise movie cast especially ann margaret impo...
3,"Changi has a delightfully fresh script, acted ...",1,changi delightfully fresh script act superbly ...
4,"I say Ben Johnson and my fellow Canadians say,...",0,ben johnson fellow canadian ben johnson goddam...
...,...,...,...
17495,Usual awful movie... I'll not bother you about...,0,usual awful movie ll bother synopsis core arma...
17496,I've just revisited this fondly remembered bit...,1,ve revisit fondly remember bit cinematic madne...
17497,I want very much to believe that the above quo...,0,want believe quote specifically english subtit...
17498,Everyone's already commented on the obvious fa...,0,s comment obvious fact comment obviously peopl...


In [19]:
#Train_TEST
test_ds["text_clean"] = test_ds["text"].apply(sentence_rge)
test_ds["text_clean"] = test_ds["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))

In [20]:
test_ds

Unnamed: 0,text,Label,text_clean
0,This movie isn't worth going to the theaters t...,0,movie isn t worth theater watch didn t like ef...
1,This movie was not very good in my opinion. Wh...,0,movie good opinion complete waste hour half lu...
2,"""Lights of New York"" originally started out as...",1,light new york originally start experimental...
3,Admirable but weak James Bond film mainly beca...,0,admirable weak james bond film mainly hero bon...
4,Quentin in my opinion has written and directed...,0,quentin opinion write direct good movie multip...
...,...,...,...
7495,My wife and I saw every episode in this series...,0,wife episode series love series cut short fina...
7496,"Target is the story of a special agent who, af...",0,target story special agent carry order assassi...
7497,One of the most underrated comedies. Dan Akroy...,1,underrated comedy dan akroyd hilarious role ch...
7498,This is the sort of unknown and forgotten film...,1,sort unknown forget film dream discover watch ...


# **Tokenizer Text**

In [21]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000) # instanciate the tokenizer
# num_words indicates the number of words to keep in the tokenization
# keeps only the most common words

tokenizer.fit_on_texts(train_ds.text_clean) # fit the tokenizer on the texts
# in this step the tokenizer will list all unique tokens in the text
# and associate them with a specific integer.

# This step will effectively transform the texts into sequences of indices
train_ds["text_encoded"] = tokenizer.texts_to_sequences(train_ds.text_clean)

# Sometimes the preprocessing removes all the words in a string (because they contain
# only stopwords for example) so we calculate the length in order to filter out
# those records
train_ds["len_text"] = train_ds["text_encoded"].apply(lambda x: len(x))
train_ds = train_ds[train_ds["len_text"]!=0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [22]:
train_ds

Unnamed: 0,text,Label,text_clean,text_encoded,len_text
0,"A delightful story about two evacuees, has bee...",1,delightful story evacuee turn nice little film...,"[1955, 11, 70, 195, 34, 3, 1550, 110, 5, 6, 11...",26
1,"Having already seen the original ""Jack Frost"",...",0,having original jack frost think jack frost 2 ...,"[530, 100, 449, 2305, 12, 449, 2305, 99, 1551,...",80
2,I have nothing but praise for this movie and c...,1,praise movie cast especially ann margaret impo...,"[1495, 2, 58, 123, 2093, 4068, 2613, 1495, 796...",105
3,"Changi has a delightfully fresh script, acted ...",1,changi delightfully fresh script act superbly ...,"[5362, 4687, 1123, 88, 36, 3026, 59, 41, 28, 2...",87
4,"I say Ben Johnson and my fellow Canadians say,...",0,ben johnson fellow canadian ben johnson goddam...,"[1151, 1956, 1405, 1999, 1151, 1956, 2, 57, 39...",96
...,...,...,...,...,...
17495,Usual awful movie... I'll not bother you about...,0,usual awful movie ll bother synopsis core arma...,"[499, 235, 2, 102, 626, 3236, 1338, 4957, 325,...",65
17496,I've just revisited this fondly remembered bit...,1,ve revisit fondly remember bit cinematic madne...,"[38, 4971, 8631, 183, 84, 1019, 2380, 151, 62,...",112
17497,I want very much to believe that the above quo...,0,want believe quote specifically english subtit...,"[31, 96, 1536, 2952, 423, 1319, 3262, 49, 81, ...",213
17498,Everyone's already commented on the obvious fa...,0,s comment obvious fact comment obviously peopl...,"[1, 261, 439, 69, 261, 357, 17, 3351, 3, 79, 1...",153


In [23]:
# We do the same on the test set, except we do not fit the tokenizer this time
test_ds["text_encoded"] = tokenizer.texts_to_sequences(test_ds.text_clean)
test_ds["len_text"] = test_ds["text_encoded"].apply(lambda x: len(x))
test_ds = test_ds[test_ds["len_text"]!=0]

In [24]:
test_ds

Unnamed: 0,text,Label,text_clean,text_encoded,len_text
0,This movie isn't worth going to the theaters t...,0,movie isn t worth theater watch didn t like ef...,"[2, 87, 4, 134, 434, 9, 44, 4, 5, 97, 2, 85, 1...",40
1,This movie was not very good in my opinion. Wh...,0,movie good opinion complete waste hour half lu...,"[2, 6, 418, 363, 153, 161, 147, 3109, 44, 4, 3...",122
2,"""Lights of New York"" originally started out as...",1,light new york originally start experimental...,"[320, 60, 736, 1698, 56, 3848, 2776, 135, 680,...",192
3,Admirable but weak James Bond film mainly beca...,0,admirable weak james bond film mainly hero bon...,"[5061, 561, 433, 862, 3, 1154, 294, 862, 560, ...",94
4,Quentin in my opinion has written and directed...,0,quentin opinion write direct good movie multip...,"[4879, 418, 81, 196, 6, 2, 2139, 744, 129, 247...",65
...,...,...,...,...,...
7495,My wife and I saw every episode in this series...,0,wife episode series love series cut short fina...,"[176, 124, 82, 19, 82, 277, 135, 285, 124, 435...",25
7496,"Target is the story of a special agent who, af...",0,target story special agent carry order assassi...,"[1389, 11, 149, 875, 550, 352, 6411, 3459, 173...",177
7497,One of the most underrated comedies. Dan Akroy...,1,underrated comedy dan akroyd hilarious role ch...,"[2080, 65, 1917, 376, 52, 1140, 46, 622, 6, 32...",22
7498,This is the sort of unknown and forgotten film...,1,sort unknown forget film dream discover watch ...,"[238, 1347, 312, 3, 488, 549, 9, 41, 206, 721,...",88


In [25]:
tokenizer.index_word

{1: 's',
 2: 'movie',
 3: 'film',
 4: 't',
 5: 'like',
 6: 'good',
 7: 'time',
 8: 'character',
 9: 'watch',
 10: 'bad',
 11: 'story',
 12: 'think',
 13: 'scene',
 14: 'great',
 15: 'know',
 16: 'look',
 17: 'people',
 18: 'don',
 19: 'love',
 20: 'way',
 21: 'play',
 22: 'thing',
 23: 'come',
 24: 'find',
 25: 'man',
 26: 'end',
 27: 'life',
 28: 'actor',
 29: 'plot',
 30: 'work',
 31: 'want',
 32: 'year',
 33: 'try',
 34: 'little',
 35: 'feel',
 36: 'act',
 37: 'm',
 38: 've',
 39: 'guy',
 40: 'lot',
 41: 'old',
 42: 'director',
 43: 'real',
 44: 'didn',
 45: 'funny',
 46: 'performance',
 47: 'doesn',
 48: 'woman',
 49: 'actually',
 50: 'big',
 51: '10',
 52: 'role',
 53: 'long',
 54: 'leave',
 55: 'tell',
 56: 'start',
 57: 'star',
 58: 'cast',
 59: 'young',
 60: 'new',
 61: 'horror',
 62: 'day',
 63: 'world',
 64: 'point',
 65: 'comedy',
 66: 'girl',
 67: 'minute',
 68: 'pretty',
 69: 'fact',
 70: 'turn',
 71: 'acting',
 72: 'music',
 73: 'happen',
 74: 'action',
 75: 'line',
 76: 

In [26]:
train_pad = tf.keras.preprocessing.sequence.pad_sequences(train_ds.text_encoded, padding="post")
test_pad = tf.keras.preprocessing.sequence.pad_sequences(test_ds.text_encoded, padding="post")

In [27]:
# We'll use this to form a tensorflow dataset containing on the one hand
# the encoded texts and the labels.
train_ds = tf.data.Dataset.from_tensor_slices((train_pad, train_ds.Label))
test_ds = tf.data.Dataset.from_tensor_slices((test_pad, test_ds.Label))

# We then organize the dataste per batch
train_ds = train_ds.shuffle(len(train_ds)).batch(1024)

test_ds = test_ds.shuffle(len(test_ds)).batch(1024)

# **Model Prediction**

In [28]:
embedding_dim=16 # the dimensionality of the representation space

vocab_size = tokenizer.num_words # the number of words in the vocabulary
model = Sequential([
  Embedding(vocab_size, embedding_dim, name="embedding"), # the embedding layer
  # the input dim needs to be equal to the size of the vocabulary + 1 (because of
  # the zero padding)
  GlobalAveragePooling1D(), # this will pick the average for every word in the sentence
  # along each dimension of the representation space.
  Dense(16, activation='relu'), # a dense layer
  Dense(1, activation="sigmoid") # the prediction layer
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [52]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs2")


In [53]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [54]:
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f7da926a3d0>

In [48]:
#writer = tf.summary.create_file_writer("/content/logs2")

In [55]:
#docs_infra: no_execute
#%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [57]:
#tensorboard --log_dir logs2

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
Error: A logdir or db must be specified. For example `tensorboard --logdir mylogdir` or `tensorboard --db sqlite:~/.tensorboard.db`. Run `tensorboard --helpfull` for details and examples.

In [61]:
import os
import io
vocab = [value for value in tokenizer.index_word.values()][:1000]
weights = model.get_layer('embedding').get_weights()[0]

log_dir = "/content/logs2"
os.makedirs(log_dir, exist_ok=True)
out_v = io.open(log_dir+"/vectors.tsv", 'w', encoding='utf-8')
out_m = io.open(log_dir+"/metadata.tsv", 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()