In [1]:
import numpy as np 
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras



import seaborn as sns
import random
import matplotlib.pyplot as plt

from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the IMDB reviews CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [4]:
# Class balance check: number of reviews per sentiment label.
df.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
from sklearn import preprocessing

# Replace the string sentiment labels with integer codes, overwriting the
# 'sentiment' column of df. In the preview printed by this cell, reviews that
# were 'positive' show 1 and the one that was 'negative' shows 0.
label_encoder = preprocessing.LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Vectorize text with TF-IDF (TF-IDFでテキストをベクトル化)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Passed to TfidfVectorizer as ngram_range.
NGRAM_RANGE = (1, 2)
TOP_K = 20000         # Upper bound on the number of features kept by SelectKBest (top 20,000)
# Passed to TfidfVectorizer as analyzer.
TOKEN_MODE = 'word'
# Passed to TfidfVectorizer as min_df.
MIN_DOC_FREQ = 2

def ngram_vectorize(texts, labels):
    """Convert raw texts into a sparse TF-IDF matrix of the best-scoring n-grams.

    The texts are vectorized with ``TfidfVectorizer`` using the module-level
    settings ``NGRAM_RANGE``, ``TOKEN_MODE`` and ``MIN_DOC_FREQ``. The
    resulting features are then scored against ``labels`` with ``f_classif``
    and only the ``TOP_K`` highest-scoring ones are kept (or all of them when
    the vocabulary has fewer than ``TOP_K`` entries).

    Both the vectorizer and the selector are fitted on the data passed in.

    Args:
        texts: Iterable of raw document strings.
        labels: Array-like of class labels aligned with ``texts``.

    Returns:
        A sparse float32 matrix with one row per text and at most ``TOP_K``
        columns.
    """
    vectorizer_params = {
        'ngram_range': NGRAM_RANGE,
        # TfidfVectorizer only supports floating-point dtypes. The previous
        # value ('int32') made scikit-learn emit a UserWarning and fall back
        # to float64, so float64 is requested explicitly here.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,
        'min_df': MIN_DOC_FREQ,
    }

    # Vectorization: learn the vocabulary / IDF weights and transform the texts.
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)
    tfidf_features = tfidf_vectorizer.fit_transform(texts)

    # Keep the TOP_K features ranked by f_classif; k is capped at the
    # vocabulary size so SelectKBest is never asked for more columns than exist.
    n_keep = min(TOP_K, tfidf_features.shape[1])
    selector = SelectKBest(f_classif, k=n_keep)
    selected_features = selector.fit_transform(tfidf_features, labels)

    return selected_features.astype('float32')
# Vectorize the data
# NOTE(review): this fits the TF-IDF vocabulary and the label-driven feature
# selection on every row of df, including rows that train_test_split later
# assigns to the test set, so test labels influence which features are kept.
# Consider fitting on the training rows only.
vect_df = ngram_vectorize(df['review'], df['sentiment'])



In [7]:
# (documents, selected features) of the vectorized matrix.
vect_df.shape

(50000, 20000)

In [8]:
# Fit a TfidfVectorizer with default settings on the same reviews and display
# the resulting matrix shape, i.e. the vocabulary size without the n-gram,
# min_df and top-k settings used in ngram_vectorize.
# NOTE(review): tfidf / tr_texts do not appear to be used by the following
# cells — confirm whether this is exploratory only.
tfidf = TfidfVectorizer()

tr_texts = tfidf.fit_transform(df['review'])
tr_texts.shape

(50000, 101895)

In [9]:
from sklearn.model_selection import train_test_split

# Split data to target (y) and features (X)
# toarray() converts the sparse feature matrix into a dense NumPy array; at the
# (50000, 20000) float32 shape reported for vect_df this is roughly 4 GB.
X = vect_df.toarray()
y = (np.array(df['sentiment']))

# Here we split data to training and testing parts
# test_size=0.5 puts half of the rows in each part; random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=13)
print("Train dataset shape: {0}, \nTest dataset shape: {1}".format(X_train.shape, X_test.shape))

Train dataset shape: (25000, 20000), 
Test dataset shape: (25000, 20000)


In [10]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(25000,)

In [11]:
# Import from the public tensorflow.keras namespace. tensorflow.python.* is a
# private implementation path that is not part of TensorFlow's supported API,
# and objects from it should not be mixed with the tf.keras objects used to
# build the model in this notebook.
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [12]:
# (training samples, features) of the dense training matrix.
X_train.shape

(25000, 20000)

In [13]:
# Shape of a single sample (everything after the leading sample axis).
X_train.shape[1:]

(20000,)

# Modeling

In [14]:
# Per-sample feature shape, passed as input_shape to the model's first Dense layer.
input_shape = X_train.shape[1:]

In [15]:
# Feed-forward binary classifier over the TF-IDF features: three ReLU hidden
# layers (1000 -> 300 -> 50 units) with dropout after the first one, followed
# by a single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1000, input_shape=input_shape, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dense(units=50, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Print the layer-by-layer overview and parameter counts.
model.summary()




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              20001000  
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               300300    
_________________________________________________________________
dense_2 (Dense)              (None, 50)                15050     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 20,316,401
Trainable params: 20,316,401
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Configure training: Adam optimizer at learning rate 0.001, binary
# cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train for 10 epochs in mini-batches of 128; the returned History object is
# kept in `history`.
# NOTE(review): the split named "test" is passed as validation_data, so it is
# evaluated after every epoch. If these metrics are used to tune the model,
# no untouched hold-out set remains — consider a separate validation split.
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), 
                    verbose=1, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
