In [1]:
import tensorflow as tf

In [2]:
import nltk
import numpy as np
import pandas as pd
import io
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from google.colab import files
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import regularizers
# Fetch the NLTK resources the preprocessing step needs: the stop-word lists
# read through stopwords.words(...) and the 'punkt' tokenizer data
# (presumably what word_tokenize relies on).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the NLTK version installed in the runtime.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **Collecting Data**

In [None]:
# Ask the user to upload the CSV through the Colab file picker; `uploaded`
# is indexed below by file name to get the raw bytes of that file.
uploaded = files.upload()
# The file must be named exactly "IMDB Dataset.csv", otherwise the lookup
# below raises KeyError.
data = pd.read_csv(
    io.BytesIO(
        uploaded["IMDB Dataset.csv"]
    )
)

In [4]:
# Keep only the first 8000 rows so the later (dense) bag-of-words matrix
# stays small enough to train on.
data = data.iloc[:8000]

In [5]:
# Quick look at the loaded frame: first rows, dimensions, and how many
# reviews fall in each sentiment class.
print(
    data.head(),
    data.shape,
    data['sentiment'].value_counts(),
    sep='\n',
)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(8000, 2)
positive    4003
negative    3997
Name: sentiment, dtype: int64


The dataset is balanced: 4,003 positive and 3,997 negative reviews.



In [6]:
# Pull the text column and the label column out of the frame as two Series,
# then show a preview and the length of each.
reviews, sentiment = data['review'], data['sentiment']
print(
    reviews.head(),
    reviews.shape,
    sentiment.head(),
    sentiment.shape,
    sep='\n',
)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object
(8000,)
0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object
(8000,)


Preprocessing the Data for ML

In [7]:
# Lazily-built (punctuation set, stop-word set, stemmer) shared by every call
# to preprocess_text. Building these once instead of once per review avoids
# re-reading the stop-word list and re-creating the stemmer for each row.
_PREPROCESS_TOOLS = None


def _get_preprocess_tools():
    """Return the cached (punctuation, stop words, stemmer) triple, building it on first use."""
    global _PREPROCESS_TOOLS
    if _PREPROCESS_TOOLS is None:
        _PREPROCESS_TOOLS = (
            frozenset(string.punctuation),
            frozenset(stopwords.words("english")),
            SnowballStemmer(language='english'),
        )
    return _PREPROCESS_TOOLS


def preprocess_text(review):
    """Clean one raw review and return it as a space-separated string of stems.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Tokenize the text with ``word_tokenize``.
      3. Drop tokens that are a single punctuation character, tokens whose
         lower-cased form is an English stop word, and tokens that are not
         purely alphabetic.
      4. Stem each remaining token with the English Snowball stemmer.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no token
        survives the filters).
    """
    punctuation, stop_words, stemmer = _get_preprocess_tools()

    # Remove HTML tags.
    text_without_html = BeautifulSoup(review, 'html.parser').get_text()

    # One pass applying the three filters in their original order:
    # punctuation, stop words (case-insensitive), non-alphabetic tokens.
    kept_tokens = [
        token
        for token in word_tokenize(text_without_html)
        if token not in punctuation
        and token.lower() not in stop_words
        and token.isalpha()
    ]

    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [8]:
# Run the cleaning function over every review. The result is bound back to
# `reviews`, so re-running this cell feeds already-cleaned text through the
# function again.
reviews = reviews.apply(preprocess_text)
print(reviews.head(), reviews.shape, sep='\n')

  soup = BeautifulSoup(review, 'html.parser')


0    one review mention watch oz episod hook right ...
1    wonder littl product film techniqu fashion giv...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
Name: review, dtype: object
(8000,)


Prepare Data For Neural Network

In [9]:
# Bag-of-words encoding of the cleaned reviews. max_df=0.8 makes the
# vectorizer ignore terms that occur in more than 80% of the documents.
bag_of_words_tool = CountVectorizer(max_df=0.8)
reviews_with_bagOfWords = bag_of_words_tool.fit_transform(reviews)
print(
    reviews_with_bagOfWords.shape,
    type(reviews_with_bagOfWords),
    sep='\n',
)

(8000, 30955)
<class 'scipy.sparse._csr.csr_matrix'>


In [10]:
# Wrap the SciPy sparse count matrix in a pandas DataFrame backed by sparse
# columns; this is the feature table passed to train_test_split.
temp = pd.DataFrame.sparse.from_spmatrix(reviews_with_bagOfWords)
print(temp.shape, type(temp), sep='\n')

(8000, 30955)
<class 'pandas.core.frame.DataFrame'>


In [11]:
# Turn the string labels into integer class ids (the printed output shows
# values 0 and 1) for the sparse-categorical loss.
label_encoder = LabelEncoder()
encoded_sentiment = label_encoder.fit_transform(sentiment)
print(encoded_sentiment.shape, encoded_sentiment, sep='\n')

(8000,)
[1 1 1 ... 1 0 0]


In [12]:
# 70/30 train/test split; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    temp,
    encoded_sentiment,
    train_size=0.70,
    random_state=3,
)

# Report the shape of each of the four pieces.
for split_label, split_part in (
    ('training_set x size: ', x_train),
    ('training_set y size: ', y_train),
    ('testing_set x size: ', x_test),
    ('testing_set y size: ', y_test),
):
    print(split_label, split_part.shape)

training_set x size:  (5600, 30955)
training_set y size:  (5600,)
testing_set x size:  (2400, 30955)
testing_set y size:  (2400,)


Time To Apply Neural Network

In [31]:
# Two-layer feed-forward classifier over the bag-of-words features:
#   * a 32-unit ReLU hidden layer with L2 weight regularization (factor 0.30),
#     sized to the number of feature columns in x_train;
#   * a 2-unit softmax output layer, one unit per sentiment class.
hidden_layer = tf.keras.layers.Dense(
    32,
    activation='relu',
    input_shape=(x_train.shape[1],),
    kernel_regularizer=regularizers.l2(0.30),
)
output_layer = tf.keras.layers.Dense(2, activation='softmax')

model = tf.keras.Sequential([hidden_layer, output_layer])

In [32]:
# Configure training: Adam with a small learning rate, sparse categorical
# cross-entropy (labels are integer class ids), and accuracy as the metric.
#
# The model's last layer is Dense(2, activation='softmax'), so it outputs
# class probabilities, not raw logits. The loss must therefore be built with
# from_logits=False; with from_logits=True the loss would treat those
# probabilities as logits.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [33]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 32)                990592    
                                                                 
 dense_13 (Dense)            (None, 2)                 66        
                                                                 
Total params: 990,658
Trainable params: 990,658
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the training split for two passes over the data.
model.fit(
    x_train,
    y_train,
    epochs=2,
)

In [None]:
# Score the trained model on the held-out test split (loss and accuracy).
model.evaluate(
    x_test,
    y_test,
)