In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN
import warnings
# Blanket filter: hides every warning raised for the rest of the session,
# so deprecation notices from the libraries imported above will not be shown.
warnings.filterwarnings('ignore')

In [18]:
# Load the IMDB reviews, capping the vocabulary via `num_words`.
max_features = 10000  # vocabulary size passed to load_data
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
# Show the shape of each of the four arrays as the cell output.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((25000,), (25000,), (25000,), (25000,))

In [19]:
# Inspect the first training example. As the printed output shows, a review is
# stored as a list of integer word indices (not as text, embeddings or one-hot
# vectors); the label is the sentiment attached to that review.
sample_review = x_train[0]
sample_label = y_train[0]

divider = '-'*50
report_lines = (
    f'Sample Review ===> {sample_review}',
    f'Length of the Sample Review ===> {len(sample_review)}',
    f'Sentiment ===> {sample_label}',
)
# Each report line is preceded by a divider, and one final divider closes the block.
for report_line in report_lines:
    print(divider)
    print(report_line)
print(divider)

--------------------------------------------------
Sample Review ===> [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19,

In [42]:
# Number of reviews in the train split, in the test split, and in both together.
len(x_train), len(x_test), len(x_train) + len(x_test)

(25000, 25000, 50000)

In [47]:
# Stack the train and test splits end to end (concatenate joins along the
# first axis by default) so the whole corpus lives in one pair of arrays.
x = np.concatenate([x_train, x_test])
y = np.concatenate([y_train, y_test])

In [49]:
# Sanity check: the combined reviews and labels should have the same length.
len(x), len(y)

(50000, 50000)

In [51]:
# Fetch the word -> index vocabulary and flip it into index -> word so that
# encoded reviews can be mapped back to text.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

In [53]:
# Number of distinct indices in the inverted vocabulary.
len(reverse_word_index)

88584

In [60]:
# Turn every encoded review back into a space-separated string.
# Each stored index is shifted down by 3 before the lookup (Keras' load_data
# offsets word indices by 3 by default, reserving the low values for special
# markers); any index without a vocabulary entry is rendered as '?'.
lookup = reverse_word_index.get
decoded_reviews = []
for rev in x:
    decode_review = ' '.join(lookup(i - 3, '?') for i in rev)
    decoded_reviews.append(decode_review)

In [65]:
# Column name -> values mapping for the review table.
# Note: despite its name, `df` is a plain dict at this point, not a DataFrame.
df = dict(Reviews=decoded_reviews, Sentiments=y)

In [66]:
# Build the two-column table (Reviews, Sentiments) from the dict held in `df`.
data = pd.DataFrame(df)

In [68]:
# Preview the first rows of the decoded reviews with their labels.
data.head()

Unnamed: 0,Reviews,Sentiments
0,? this film was just brilliant casting locatio...,1
1,? big hair big boobs bad music and a giant saf...,0
2,? this has to be one of the worst films of the...,0
3,? the ? ? at storytelling the traditional sort...,1
4,? worst mistake of my life br br i picked this...,0


In [69]:
# Column dtypes, non-null counts and memory footprint of the table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Reviews     50000 non-null  object
 1   Sentiments  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [70]:
# Persist the decoded reviews and labels to a CSV file in the working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed column unless
# index=False is passed — confirm whether downstream readers expect that column.
# NOTE(review): the file name is spelled "imbd" (not "imdb"); left unchanged
# because other code may read this exact path — confirm before renaming.
data.to_csv("imbd_sentiments.csv")