# Model training

In this notebook we are going to apply a couple of binary classification models. We will try, if my machine allows it, to create a sentiment model using machine learning techniques and another using a neural network.

###  Logistic Regression
The first model we are going to use is a classical logistic regression model. It is usually a model that is implemented quickly and often gives positive results.

For this model, we need to create a feature set on which to deploy the model. We can use Word2Vec or a TF-IDF. In this case we are going to apply a TF-IDF since, in the first notebook, we saw how a Word2Vec would be implemented. In addition, this one exceeded the memory of the computer.

In [2]:
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve

import matplotlib.pyplot as plt

In [3]:
#load dataframe
%run Two_Preprocessing.ipynb

words_process

Read preprocessed data from cache file: preprocessed_data.pkl


[['installing',
  'struggle',
  'games',
  'windows',
  'live',
  'bugs',
  'championship',
  'races',
  'cars',
  'unlocked',
  'buying',
  'addon',
  'paid',
  'nearly',
  '30',
  'dollars',
  'new',
  'like',
  'idea',
  'keep',
  'paying',
  'keep',
  'playing',
  'noticed',
  'improvement',
  'physics',
  'graphics',
  'compared',
  'dirt',
  '2',
  'tossed',
  'garbage',
  'vowed',
  'never',
  'buy',
  'another',
  'codemasters',
  'really',
  'tired',
  'arcade',
  'style',
  'rally',
  'racing',
  'games',
  'anyway',
  'continue',
  'get',
  'fix',
  'richard',
  'burns',
  'rally',
  'http',
  'www',
  'amazon',
  'com',
  'richard',
  'burns',
  'rally',
  'pc',
  'dp',
  'b000c97156',
  'ref',
  'sr',
  '1',
  '1',
  'ie',
  'utf8',
  'qid',
  '1341886844',
  'sr',
  '8',
  '1',
  'keywords',
  'richard',
  'burns',
  'rallythank',
  'reading',
  'review',
  'enjoyed',
  'sure',
  'rate',
  'helpful'],
 ['1st',
  'shipment',
  'received',
  'book',
  'instead',
  '2nd',
  

In [4]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    words_process,
    labels,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True
)

In [5]:
print(len(X_train))
print(len(X_test))

42774
14258


In [5]:
#Now we can extract the features for our TF-IDF
cv = TfidfVectorizer(
    ngram_range=(2, 3),
    strip_accents='ascii',
    max_df=0.95,
    min_df=3,
    max_features=5000
)
cv.fit(X_train)

TfidfVectorizer(max_df=0.95, max_features=5000, min_df=3, ngram_range=(2, 3),
                strip_accents='ascii')

In [6]:
X_train_ = cv.transform(X_train)
X_test_ = cv.transform(X_test)

In [7]:
#Now, we can create our Logistic Regression Model
c_params = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 1000]

train_acc = list()
test_acc = list()
for c in c_params:
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=10000)
    lr.fit(X_train_, y_train)
    
    train_predict = lr.predict(X_train_)
    test_predict = lr.predict(X_test_)
    
    print ("Accuracy for C={}: {}".format(c, accuracy_score(y_test, test_predict)))
    
    train_acc.append(accuracy_score(y_train, train_predict))
    test_acc.append(accuracy_score(y_test, test_predict))

Accuracy for C=0.01: 0.7937298358816103
Accuracy for C=0.05: 0.8069855519708234
Accuracy for C=0.25: 0.8233973909384206
Accuracy for C=0.5: 0.8273250105204096
Accuracy for C=1: 0.8294290924393323
Accuracy for C=10: 0.8177163697573292
Accuracy for C=100: 0.8085285453780334
Accuracy for C=1000: 0.8066348716510029


In [8]:
#Best c_param aprox = 1
c_params = [0.8, 0.9, 0.95, 1, 1.05, 1.10, 1.2, 2]

train_acc = list()
test_acc = list()
for c in c_params:
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=10000)
    lr.fit(X_train_, y_train)
    
    train_predict = lr.predict(X_train_)
    test_predict = lr.predict(X_test_)
    
    print ("Accuracy for C={}: {}".format(c, accuracy_score(y_test, test_predict)))
    
    train_acc.append(accuracy_score(y_train, train_predict))
    test_acc.append(accuracy_score(y_test, test_predict))

Accuracy for C=0.8: 0.82921868424744
Accuracy for C=0.9: 0.8289381399915837
Accuracy for C=0.95: 0.8287978678636555
Accuracy for C=1: 0.8294290924393323
Accuracy for C=1.05: 0.8299200448870809
Accuracy for C=1.1: 0.8300603170150092
Accuracy for C=1.2: 0.8300603170150092
Accuracy for C=2: 0.8281666432879787


In [9]:
#Best c_param = 1.2
train_acc = list()
test_acc = list()

lr = LogisticRegression(C=1.2, solver='lbfgs', max_iter=10000)
lr.fit(X_train_, y_train)
    
train_predict = lr.predict(X_train_)
test_predict = lr.predict(X_test_)
    
print ("Accuracy = {}".format(accuracy_score(y_test, test_predict)))
    
train_acc.append(accuracy_score(y_train, train_predict))
test_acc.append(accuracy_score(y_test, test_predict))

Accuracy = 0.8300603170150092


Let's create some metrics to analyze in the next module.

In [10]:
print('Confussion matrix:\n{}'.format(confusion_matrix(y_test, test_predict)))
print('\nClassification report:\n{}'.format(classification_report(y_test, test_predict)))
print('Accuracy score:{}'.format(accuracy_score(y_test, test_predict)))

Confussion matrix:
[[5953 1098]
 [1325 5882]]

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      7051
           1       0.84      0.82      0.83      7207

    accuracy                           0.83     14258
   macro avg       0.83      0.83      0.83     14258
weighted avg       0.83      0.83      0.83     14258

Accuracy score:0.8300603170150092


### LSTM model
We are going to bild a LSTM. I have made this decision because LSTMs should, in theory, remember longer sequences than GRUs and outperform them in tasks requiring modeling long-distance relations.This can be useful for some reviews that contain a several mount of tokens.

In [6]:
import gensim
import multiprocessing as mp

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
)
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [7]:
# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 10
# 32
W2V_EPOCH = 5
W2V_MIN_COUNT = 1

# KERAS
#SEQUENCE_LENGTH = 500

In [8]:
def generate_tokenizer(train_df):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_df)
    vocab_size = len(tokenizer.word_index) + 1
    print(f"Total words: {vocab_size}")
    return tokenizer, vocab_size

In [9]:
def generate_word2vec(train_df):
    
    w2v_model = gensim.models.word2vec.Word2Vec(
        vector_size=W2V_SIZE,
        window=W2V_WINDOW,
        min_count=W2V_MIN_COUNT,
        workers=mp.cpu_count(),
    )
    w2v_model.build_vocab(train_df)

    word2id = w2v_model.wv.key_to_index
    id2word = {i: word for word, i in word2id.items()}
    vocab_size = len(word2id)
    print(f"Vocab size: {vocab_size}")
    w2v_model.train(train_df, total_examples=len(train_df), epochs=W2V_EPOCH)

    return w2v_model

In [36]:
def generate_embedding(word2vec_model, vocab_size, tokenizer):
    embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
    for word, i in tokenizer.word_index.items():
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]
    return Embedding(
        vocab_size,
        W2V_SIZE,
        weights=[embedding_matrix],
        input_length=SEQUENCE_LENGTH,
        trainable=False,
    )

In [10]:
tokenizer, vocab = generate_tokenizer(X_train)

Total words: 83269


In [11]:
word2vec_model = generate_word2vec(X_train)

Vocab size: 83268


In [12]:
from keras.preprocessing import sequence

word2id = word2vec_model.wv.key_to_index
max_words = 300
id2word = {i: word for word, i in word2id.items()}

X_train_vec = [[word2id.get(word) for word in review] for review in X_train]
X_train_vectorized = sequence.pad_sequences(X_train_vec, maxlen=max_words)

word2vec_model.build_vocab(X_test, update=True)
word2id = word2vec_model.wv.key_to_index
id2word = {i: word for word, i in word2id.items()}
X_test_vect = [[word2id.get(word) for word in review] for review in X_test]
X_test_vectorized = sequence.pad_sequences(X_test_vect, maxlen=max_words)

In [12]:
X_test_vectorized

array([[    0,     0,     0, ...,  6000,  3421,   308],
       [    0,     0,     0, ...,  1542,  2193,   457],
       [    0,     0,     0, ...,    72,    67,  1809],
       ...,
       [    0,     0,     0, ...,   531,   913,   147],
       [    0,     0,     0, ...,  1252,   712, 11676],
       [ 1276,  4841,   831, ...,   238,   390,  3470]])

In [13]:
word2vec_model_weights  = word2vec_model.wv.vectors

#embedding_layer = generate_embedding(word2vec_model, vocab, tokenizer)

In [56]:
X_train_vectorized.shape

(42774, 300)

In [45]:
word2vec_model_weights.shape

(97255, 300)

In [46]:
type(word2vec_model_weights)

numpy.ndarray

In [53]:
word2vec_model_weights.shape

(97255, 300)

In [54]:
type(word2vec_model_weights[0][0])

numpy.float32

In [1]:
import tensorflow as tf
tf.version.VERSION

'2.8.0'

In [19]:
with tf.device('gpu:0'):
    model_custom = Sequential()
    model_custom.add(Embedding(input_dim=word2vec_model_weights.shape[0], output_dim=word2vec_model_weights.shape[1], 
                            weights=[word2vec_model_weights]))
    model_custom.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model_custom.add(Dense(1, activation="sigmoid"))
    model_custom.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [20]:
batch_size = 64
num_epochs = 3

X_train_words_valid, y_valid =X_train_vectorized[:batch_size], y_train[:batch_size]  # first batch_size samples
X_train_words2, y_train2 = X_train_vectorized[batch_size:], y_train[batch_size:]  # rest for training

model_custom.fit(X_train_words2, y_train2,
          validation_data=(X_train_words_valid, y_valid),
          batch_size=batch_size, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1e29abffdf0>