In [2]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

### Load the dataset

In [None]:
df = pd.read_csv('../Datasets/train/steam_ds.csv')
df.head()

### Loading the preprocessed dataset for faster computation
#### Because loading raw text data and preprocessing takes alot of time

In [39]:
df = pd.read_csv('../Datasets/train/steam_ds_preprocessed.csv')
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,scared hear creepy voice pause moment writ...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,good game Sam Pepper YouTube account 10/10what...,Positive
2,3,Spooky's Jump Scare Mansion,2016.0,littly iffy control know play easy master ' ve...,Positive
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun colorful note windowed mode com...,Positive
4,5,Spooky's Jump Scare Mansion,2015.0,game cute tag right horror tag steam play game...,Positive


In [42]:
df = df[df.user_review.isna() == False]

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17489 entries, 0 to 17493
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review_id        17489 non-null  int64  
 1   title            17489 non-null  object 
 2   year             17311 non-null  float64
 3   user_review      17489 non-null  object 
 4   user_suggestion  17489 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 819.8+ KB


### Load the spacy library

In [None]:
nlp = spacy.load('en_core_web_lg')

### Initialize functions

In [None]:
def lemma(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [None]:
def stopword(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

In [None]:
def remove_punctuation(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

In [5]:
def vectorize(data,tfidf_vect_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [6]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

### Apply the functions to our dataset

In [None]:
df['user_review'] = df['user_review'].apply(remove_punctuation)
df['user_review'] = df['user_review'].apply(stopword)
df['user_review'] = df['user_review'].apply(lemma)

In [None]:
#df.to_csv('../Datasets/train/steam_ds_preprocessed.csv', index=False)

### Check the data

In [None]:
df.head()

### Selecting the necessary attributes for our task

In [44]:
features = df[['user_review']]
labels = df[['user_suggestion']]

### Split the dataset into Train and Test

In [45]:
X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size = 0.15, random_state=42)

### Check data distribution

In [46]:
print("Data distribution:\n- Train: {} \n- Test: {}".format(len(y_train),len(y_test)))

Data distribution:
- Train: 14865 
- Test: 2624


### Convert the text data into Tfidf embeddings

In [47]:
tfidf_vect = TfidfVectorizer()
tfidf_vect_fit=tfidf_vect.fit(X_train['user_review'])
X_train=vectorize(X_train['user_review'],tfidf_vect_fit)

### Initialize Random Forrest Classifier with CV

In [48]:
rf_cv = RandomForestClassifier()
scores = cross_val_score(rf_cv,X_train,y_train.values.ravel(),cv=10, verbose = 1, n_jobs=4)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 45.4min finished


In [49]:
print(scores)
scores.mean()

[0.79623403 0.80094149 0.82784129 0.82044385 0.83254876 0.83310902
 0.81695828 0.79542396 0.82503365 0.8115747 ]


0.8160109011160882

In [50]:
rf_cv.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
X_test=vectorize(X_test['user_review'],tfidf_vect_fit)


In [52]:
y_pred = rf_cv.predict(X_test)

In [53]:
accuracy_cv = round(accuracy_score(y_test,y_pred), 3) *100
print(accuracy_cv)
recall_cv = round(recall_score(y_test,y_pred, pos_label="Positive"), 3) *100
print(recall_cv)
precision_cv = round(precision_score(y_test,y_pred, pos_label="Positive"), 3) *100
print(precision_cv)
F1_cv = round(f1_score(y_test,y_pred, pos_label="Positive"), 3) *100
print(F1_cv)
print(classification_report(y_test,y_pred))

83.6
91.10000000000001
82.6
86.6
              precision    recall  f1-score   support

    Negative       0.85      0.73      0.79      1091
    Positive       0.83      0.91      0.87      1533

    accuracy                           0.84      2624
   macro avg       0.84      0.82      0.83      2624
weighted avg       0.84      0.84      0.83      2624



### Applying Grid Search CV

In [None]:
rf_GCV = RandomForestClassifier()

param_grid = {'max_depth': [10,20,40,50],
              'min_samples_leaf': [1,2,3],
              'max_features': [1,2,5,10],
              'criterion': ['gini', 'entropy'],
              'n_estimators': [1,200,500,1000],
              'bootstrap': [True, False]}

cv = GridSearchCV(rf_GCV,param_grid,verbose = 2, n_jobs=4, cv=10)
cv.fit(X_train,y_train.values.ravel())
print_results(cv)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 13.3min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 47.6min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 109.3min


In [None]:
cv.best_estimator_

In [None]:
rf2 = RandomForestClassifier(n_estimators=100,max_depth=None)
rf2.fit(X_train, y_train.values.ravel())

In [None]:
X_test=vectorize(X_test['lemmaText'],tfidf_vect_fit)

y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(y_test,y_pred), 3)
accuracy