In [1]:
import sys
sys.path.append('../data_cleaning/lem_stem_functions')

from text_functions_new_vocabs_ac import new_column_lemmatizer, new_column_stemmatizer, new_count_vectorize_data, new_tfidf_vectorize_data


from matplotlib import pyplot as plt

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, r2_score, mean_squared_error, classification_report
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, GridSearchCV, train_test_split

from scipy.stats import linregress

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.datasets import mnist, cifar10 
from tensorflow.keras.utils import to_categorical

import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# Load the pre-split train/test CSVs, vectorise the review text over the
# combined frame, then cut the result back into the original train/test rows.
train = pd.read_csv('../../../data/processed/train.csv')
test = pd.read_csv('../../../data/processed/test.csv')

# Train rows come first in the concatenation, so the first len(train) rows of
# every derived array belong to the training split.
full_df = pd.concat([train, test], axis = 0)
n_train = len(train)

print(f'train has {len(train)} records.')
print(f'test has {len(test)} records.')

print(f'datasets are joined to make one dataset again of length {len(full_df)}')

features = full_df['reviewText']
target = full_df['overall']

# Project helpers from text_functions_new_vocabs_ac.
# NOTE(review): both helpers receive train AND test rows, so whatever vocabulary
# they build also sees the test text -- consider fitting on train only.
feature_tokens = new_column_lemmatizer(features)

feature_vectors = new_count_vectorize_data(feature_tokens)

# Element 0 exposes .toarray(); presumably it is the sparse document-term
# matrix -- TODO confirm against the helper. The dense copy is large
# (rows x vocabulary), so this step is memory-hungry.
feature_vector_array = feature_vectors[0].toarray()

# Scale by the single global maximum (computed over train and test together).
normalised_array = feature_vector_array / np.max(feature_vector_array)

# to_categorical allocates columns 0..max(target); if ratings start at 1,
# column 0 is an always-empty extra class.
target_one_hot = to_categorical(target)

# Features and labels must stay row-aligned for the positional split below.
assert normalised_array.shape[0] == target_one_hot.shape[0] == len(full_df), \
    'vectorised features and labels are no longer row-aligned with full_df'

# Split at the real train length instead of a hard-coded row count, so the
# boundary stays correct when the CSVs are regenerated.
X_train = normalised_array[:n_train, :]
X_test = normalised_array[n_train:, :]
y_train = target_one_hot[:n_train]
y_test = target_one_hot[n_train:]

print(f'X_train shape {X_train.shape}')
print(f'X_test shape {X_test.shape}')
print(f'y_train shape {y_train.shape}')
print(f'y_test shape {y_test.shape}')

train has 43043 records.
test has 18447 records.
datasets are joined to make one dataset again of length 61490
X_train shape (43043, 44300)
X_test shape (18447, 44300)
y_train shape (43043, 6)
y_test shape (18447, 6)


In [3]:
# Tally how often each distinct one-hot row occurs in the test labels;
# every distinct row corresponds to one target class.
df = pd.DataFrame(data=y_test)

df.value_counts(sort=True)

0    1    2    3    4    5  
0.0  0.0  0.0  0.0  0.0  1.0    8340
     1.0  0.0  0.0  0.0  0.0    4774
     0.0  0.0  0.0  1.0  0.0    2580
               1.0  0.0  0.0    1477
          1.0  0.0  0.0  0.0    1276
Name: count, dtype: int64

In [4]:
# Fully connected classifier: one input vector per review, seven ReLU layers
# that halve in width (512 -> 8), and a softmax over the one-hot target columns.

# Take the input width from the data rather than hard-coding the vocabulary
# size. `shape` has to be a tuple: `(44300)` without a trailing comma is just
# the int 44300, which newer Keras releases refuse to treat as a shape.
n_features = X_train.shape[1]
inputs = Input(shape=(n_features,), name = 'Input')

# The number of output units follows the width of the one-hot labels.
output_length, output_units = y_train.shape

dense1 = Dense(units=512, activation='relu', name='Dense_1')
dense2 = Dense(units=256, activation='relu', name='Dense_2')
dense3 = Dense(units=128, activation='relu', name='Dense_3')
dense4 = Dense(units=64, activation='relu', name='Dense_4')
dense5 = Dense(units=32, activation='relu', name='Dense_5')
dense6 = Dense(units=16, activation='relu', name='Dense_6')
dense7 = Dense(units=8, activation='relu', name='Dense_7')

output = Dense(units=output_units, activation='softmax', name='Output')

# Wire the layers together with the functional API.
x = dense1(inputs)
x = dense2(x)
x = dense3(x)
x = dense4(x)
x = dense5(x)
x = dense6(x)
x = dense7(x)
outputs = output(x)

model = Model(inputs=inputs, outputs=outputs)

# summary() prints the table itself and returns None, so wrapping it in
# display() only adds a stray "None" under the table.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 44300)]           0         
                                                                 
 Dense_1 (Dense)             (None, 512)               22682112  
                                                                 
 Dense_2 (Dense)             (None, 256)               131328    
                                                                 
 Dense_3 (Dense)             (None, 128)               32896     
                                                                 
 Dense_4 (Dense)             (None, 64)                8256      
                                                                 
 Dense_5 (Dense)             (None, 32)                2080      
                                                                 
 Dense_6 (Dense)             (None, 16)                528   

None

In [5]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# labels, accuracy reported after every epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs in mini-batches of 32.
# NOTE(review): the test split doubles as the validation set here, so the
# per-epoch validation numbers are not an independent hold-out estimate.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x212800b1310>

In [12]:
# Evaluate the 10-epoch model on the test split: build a confusion matrix and
# list every (real, predicted) pair of distinct classes that is mixed up often.
y_pred_10 = model.predict(X_test)

# Collapse one-hot labels / softmax rows back to a single class index per review.
y_test_class = np.argmax(y_test, axis=1)
y_pred_10_class = np.argmax(y_pred_10, axis=1)

cnf_matrix_10 = pd.crosstab(y_test_class, y_pred_10_class, rownames=['Real'], colnames=['Predicted'])

display(cnf_matrix_10)

cnf_array_10 = np.asarray(cnf_matrix_10)

# Minimum off-diagonal count for a pair of classes to be reported.
CONFUSION_THRESHOLD = 150

# Read the class labels from the crosstab itself. crosstab only creates rows
# and columns for classes that actually occur, so deriving labels from array
# positions (i+1 / j+1) mislabels every pair as soon as a class is never
# predicted or the spare class 0 is predicted even once.
for real_label, row in cnf_matrix_10.iterrows():
    for pred_label, count in row.items():
        if count > CONFUSION_THRESHOLD and real_label != pred_label:
            print(f'{real_label} and {pred_label} are often confused')



Predicted,1,2,3,4,5
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2921,870,479,157,347
2,395,322,273,135,151
3,200,209,477,335,256
4,119,104,386,1047,924
5,290,146,389,1590,5925


1 and 2 are often confused
1 and 3 are often confused
1 and 4 are often confused
1 and 5 are often confused
2 and 1 are often confused
2 and 3 are often confused
2 and 5 are often confused
3 and 1 are often confused
3 and 2 are often confused
3 and 4 are often confused
3 and 5 are often confused
4 and 3 are often confused
4 and 5 are often confused
5 and 1 are often confused
5 and 3 are often confused
5 and 4 are often confused


In [13]:
# Per-class precision / recall / F1 for the 10-epoch model on the test split.
report_10 = classification_report(y_true=y_test_class, y_pred=y_pred_10_class)
print(report_10)

              precision    recall  f1-score   support

           1       0.74      0.61      0.67      4774
           2       0.20      0.25      0.22      1276
           3       0.24      0.32      0.27      1477
           4       0.32      0.41      0.36      2580
           5       0.78      0.71      0.74      8340

    accuracy                           0.58     18447
   macro avg       0.46      0.46      0.45     18447
weighted avg       0.62      0.58      0.60     18447

