### Import basic library

In [24]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Logistic Classification libs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix

#Neuron Network libs
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
uploaded = files.upload()

Saving cmt-dataset.csv to cmt-dataset (2).csv


In [10]:
df = pd.read_csv('cmt-dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,user,playtime,user_url,post_date,helpfulness,review,recommend
0,1,Rodrigo,2627.2,https://steamcommunity.com/id/RodrigoBosch27/,"December 10, 2020",1824,after years playing it i didn t improve my ski...,Recommended
1,2,Erudite Skald,8.3,https://steamcommunity.com/id/EruditeRocker/,"March 14, 2023",3081,see a guy shoot him miss every shot he turns a...,Recommended
2,3,Nymeria_,20.9,https://steamcommunity.com/profiles/7656119815...,"June 28, 2022",1450,this community is so nice i got a lot of tips ...,Recommended
3,4,mint,8112.5,https://steamcommunity.com/id/waIdek/,"January 22, 2023",2706,your team in every random competitive game you...,Recommended
4,7,weevil,214.6,https://steamcommunity.com/id/pikiru/,"October 19, 2022",1063,this community is so nice i got a lot of tips ...,Recommended


In [11]:
df['recommend'].value_counts()

Unnamed: 0_level_0,count
recommend,Unnamed: 1_level_1
Not Recommended,8427
Recommended,6417


In [12]:
df['recommend'] = df['recommend'].map({'Recommended':'1', 'Not Recommended':'0'})
df.head()

Unnamed: 0.1,Unnamed: 0,user,playtime,user_url,post_date,helpfulness,review,recommend
0,1,Rodrigo,2627.2,https://steamcommunity.com/id/RodrigoBosch27/,"December 10, 2020",1824,after years playing it i didn t improve my ski...,1
1,2,Erudite Skald,8.3,https://steamcommunity.com/id/EruditeRocker/,"March 14, 2023",3081,see a guy shoot him miss every shot he turns a...,1
2,3,Nymeria_,20.9,https://steamcommunity.com/profiles/7656119815...,"June 28, 2022",1450,this community is so nice i got a lot of tips ...,1
3,4,mint,8112.5,https://steamcommunity.com/id/waIdek/,"January 22, 2023",2706,your team in every random competitive game you...,1
4,7,weevil,214.6,https://steamcommunity.com/id/pikiru/,"October 19, 2022",1063,this community is so nice i got a lot of tips ...,1


In [14]:
data_columns =['playtime', 'helpfulness','recommend']
df[data_columns].head()

Unnamed: 0,playtime,helpfulness,recommend
0,2627.2,1824,1
1,8.3,3081,1
2,20.9,1450,1
3,8112.5,2706,1
4,214.6,1063,1


### Start with Logistic Classification

In [23]:
X_text = df['review']
X_numberic = df[['playtime', 'helpfulness']]
y = df['recommend']
# Text Preprocessing
tfidf = TfidfVectorizer(stop_words='english', max_df = .7)
X_text_tfidf = tfidf.fit_transform(X_text)

# Numberical Feature Preprocessing
scaler = StandardScaler()

# Impute NaN values with the mean
imputer = SimpleImputer(strategy='mean') # Create an imputer instance
X_numberic = imputer.fit_transform(X_numberic) # Fit and transform the data
X_numberic_scaled = scaler.fit_transform(X_numberic) # Now scale the imputed data

# Check for NaN in X_text_tfidf and X_numberic_scaled
print(f"NaN in X_text_tfidf: {np.isnan(X_text_tfidf.data).any()}")
print(f"NaN in X_numberic_scaled: {np.isnan(X_numberic_scaled).any()}")

X = hstack([X_text_tfidf, X_numberic_scaled])
X = csr_matrix(X)

# Check for NaN in X
print(f"NaN in X: {np.isnan(X.data).any()}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

NaN in X_text_tfidf: False
NaN in X_numberic_scaled: False
NaN in X: False
Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1679
           1       0.83      0.74      0.78      1290

    accuracy                           0.82      2969
   macro avg       0.82      0.81      0.81      2969
weighted avg       0.82      0.82      0.82      2969



### Start with Neuron Network

In [26]:
# Text preprocessing: Tokenize and pad sequences
max_words = 10000
max_len = 100  # Maximum length of each comment

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['review'])

X_text = tokenizer.texts_to_sequences(df['review'])
X_text_padded = pad_sequences(X_text, maxlen=max_len)

# Numerical feature preprocessing: Standardize numerical features
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(df[['playtime', 'helpfulness']])
df['recommend'] = pd.to_numeric(df['recommend'], errors='coerce')
# Split the data into train and test sets
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    X_text_padded, X_numeric_scaled, df['recommend'], test_size=0.2, random_state=42)

# Define the neural network architecture
# Text branch (LSTM)
input_text = Input(shape=(max_len,))
embedding = Embedding(max_words, 128)(input_text)
lstm = LSTM(64)(embedding)

# Numerical branch (Dense Layer for numerical features)
input_numeric = Input(shape=(X_numeric_scaled.shape[1],))
dense_numeric = Dense(32, activation='relu')(input_numeric)

# Concatenate both branches
concatenated = Concatenate()([lstm, dense_numeric])
output = Dense(1, activation='sigmoid')(concatenated)

# Build and compile the model
model = Model(inputs=[input_text, input_numeric], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([X_text_train, X_numeric_train], y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate([X_text_test, X_numeric_test], y_test)
print(f'Accuracy: {accuracy:.2f}')


Epoch 1/5
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 81ms/step - accuracy: 0.7020 - loss: nan - val_accuracy: 0.5802 - val_loss: nan
Epoch 2/5
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.5677 - loss: nan - val_accuracy: 0.5802 - val_loss: nan
Epoch 3/5
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 105ms/step - accuracy: 0.5652 - loss: nan - val_accuracy: 0.5802 - val_loss: nan
Epoch 4/5
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 77ms/step - accuracy: 0.5666 - loss: nan - val_accuracy: 0.5802 - val_loss: nan
Epoch 5/5
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 79ms/step - accuracy: 0.5628 - loss: nan - val_accuracy: 0.5802 - val_loss: nan
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5668 - loss: nan
Accuracy: 0.57


._.
