In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from cleantext import clean

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn import svm
from sklearn.metrics import classification_report, f1_score, accuracy_score, recall_score, confusion_matrix, precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import random

In [2]:
path = "game_reviews/"
data = pd.read_csv(path + "labelled_all.csv")
data = data.drop('Unnamed: 0',1)
data['content'] = data['content'].astype(str)

In [3]:
data

Unnamed: 0,userName,date,content,score,if_apple,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,customer service,crush,data,system upgrad,connection,other-tech
0,Cre8tiv99,2019-03-11 17:34:05,"Hey guys, this is Bryce “Cre8tiv” Demby. Love ...",3,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,eclaitse25,2022-04-05 22:28:14,I don't want to bore you with a long review lo...,4,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jackieee003,2020-11-19 03:12:21,Fav game!! Love the challenges and Disney sett...,5,1,1.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tab the great,2021-11-13 03:55:52,This game is great for all ages and fun to play!,5,1,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Roberts232,2018-03-08 06:26:19,Special Disney emoji earned through play.,3,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,A Google user,2018-04-12 06:40:42,I love this game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,A Google user,2019-01-04 13:55:12,I have been playing Emoji Blitz for awhile now...,3,0,1.0,0.0,0.0,-1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,A Google user,2017-09-20 22:58:51,Love this game!!!,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,Chandresh Panchal,2020-10-30 02:53:09,How to chat with Disney emojis,5,0,1.0,0.0,0.0,0.0,0.0,0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
STOPWORDS = set(stopwords.words('english'))
en_words = set(nltk.corpus.words.words())

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    
    # Remove the XXXX values
    text = text.replace('x', '') 
    
    # Remove white space
    text = REMOVE_NUM.sub('', text)

    #  delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text) 

    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    # Stemming the words
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    # removing non-English words
    text = ' '.join(word for word in text.split() if word in en_words)
    
    return text

In [5]:
data['content'] = data['content'].apply(clean_text)
data

Unnamed: 0,userName,date,content,score,if_apple,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,customer service,crush,data,system upgrad,connection,other-tech
0,Cre8tiv99,2019-03-11 17:34:05,hey guy love game yet one minor got high score...,3,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,eclaitse25,2022-04-05 22:28:14,dont want bore long review hard get new,4,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jackieee003,2020-11-19 03:12:21,game love set,5,1,1.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tab the great,2021-11-13 03:55:52,game great age fun play,5,1,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Roberts232,2018-03-08 06:26:19,special earn play,3,1,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,A Google user,2018-04-12 06:40:42,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,A Google user,2019-01-04 13:55:12,play blitz complaint option use gem power want,3,0,1.0,0.0,0.0,-1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,A Google user,2017-09-20 22:58:51,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,Chandresh Panchal,2020-10-30 02:53:09,chat,5,0,1.0,0.0,0.0,0.0,0.0,0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# get training, validation, and test set
X_train, X_test, y_train, y_test = train_test_split(data[["content","if_apple"]],data["if_useful"],test_size = 0.2,shuffle = True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train ,test_size = 0.25,shuffle = True)

X_train = X_train.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train['content'])
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val['content'])
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test['content'])

df_X_train = pd.DataFrame(X_train_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
df_X_train['if_apple'] = X_train['if_apple']
df_X_val = pd.DataFrame(X_val_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
df_X_val['if_apple'] = X_val['if_apple']
df_X_test = pd.DataFrame(X_test_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
df_X_test['if_apple'] = X_test['if_apple']

In [6]:
# get training, validation, and test set
X_train, X_test, y_train, y_test = train_test_split(data[["content"]],data["if_useful"],test_size = 0.2,shuffle = True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train ,test_size = 0.25,shuffle = True)

X_train = X_train.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train['content'])
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val['content'])
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test['content'])

df_X_train = pd.DataFrame(X_train_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
#df_X_train[["if_useful","if_apple"]] = X_train[["if_useful","if_apple"]]
df_X_val = pd.DataFrame(X_val_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
#df_X_val[["if_useful"]] = X_val[["if_useful"]]
df_X_test = pd.DataFrame(X_test_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
#df_X_test[["if_useful"]] = X_test[["if_useful"]]

In [7]:
def train_models(X_train, y_train, model):
    if model == "random_forest":
        clf = RandomForestClassifier()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == "SVM":
        clf = svm.SVC()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == "logistic_regression":
        clf = LogisticRegression()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == "SGDClassifier":
        clf = SGDClassifier()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == "naive_bayes":
        clf = MultinomialNB()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == "DecisionTree":
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(X_train, y_train)
        return clf

In [8]:
def baseline(X_train, y_train, X_test, y_test):
    seq = y_train.unique()
    prob = []
    for i in range(len(seq)):
        curr_value = seq[i]
        prob.append(len([i for i in y_train if i == curr_value]) / len(y_train))
    acc_scores = []
    for i in range(1,10000):
        random_choice = random.choices(seq, weights = prob, k = len(y_test))
        acc_scores.append(accuracy_score(y_test, random_choice))
        
    return np.mean(acc_scores)

In [9]:
def n_trails(X_train, y_train, X_test, y_test, model, n = 10):
    trail = 0
    accuracy_scores = []
    macro_precision = []
    micro_precision = []
    macro_recall = []
    micro_recall = []
    macro_f = []
    micro_f = []
    while trail < n:
        clf = train_models(X_train, y_train, model)
        y_predict = clf.predict(X_test)
        accuracy_scores.append(accuracy_score(y_test, y_predict))
        macro_precision.append(precision_score(y_test, y_predict, average = 'macro'))
        micro_precision.append(precision_score(y_test, y_predict, average = "micro"))
        macro_f.append(f1_score(y_test, y_predict, average = "macro"))
        micro_f.append(f1_score(y_test, y_predict, average = "micro"))
        macro_recall.append(recall_score(y_test, y_predict, average = 'macro'))
        micro_recall.append(recall_score(y_test, y_predict, average = "micro"))
        trail += 1
    scores = [np.mean(accuracy_scores), np.mean(macro_precision), np.mean(micro_precision),
             np.mean(macro_recall), np.mean(micro_recall), np.mean(macro_f), np.mean(micro_f)]
    
    return scores

In [10]:
def get_score_df(X_train, y_train, X_test, y_test):
    baseline_score = baseline(X_train, y_train, X_test, y_test)
    print(f"Random choice accuracy score = {baseline_score}")
    
    model_dict = dict()
    models = ['logistic_regression', 'random_forest', 'SVM', 'SGDClassifier', 'naive_bayes']
    scores_name = ['accuracy_scores','macro_precision', 'micro_precision', 'macro_recall',
                  'micro_recall', 'macro_f', 'micro_f']
    
    for i in models:
        model_dict[i] = n_trails(df_X_train, y_train, df_X_val, y_val, model = i)
    
    df = pd.DataFrame(model_dict)
    df.index = scores_name
    
    return df.T

In [11]:
df = get_score_df(df_X_train, y_train, df_X_val, y_val)
df

Random choice accuracy score = 0.5001979197919793


Unnamed: 0,accuracy_scores,macro_precision,micro_precision,macro_recall,micro_recall,macro_f,micro_f
logistic_regression,0.849,0.84916,0.849,0.849021,0.849,0.848988,0.849
random_forest,0.8439,0.844042,0.8439,0.843889,0.8439,0.84388,0.8439
SVM,0.85,0.855164,0.85,0.849879,0.85,0.849421,0.85
SGDClassifier,0.8403,0.840983,0.8403,0.840334,0.8403,0.840227,0.8403
naive_bayes,0.792,0.796583,0.792,0.791875,0.792,0.791145,0.792


In [12]:
model = "random_forest"
clf = train_models(df_X_train, y_train, model)
y_predict = clf.predict(df_X_test)
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
scores = n_trails(df_X_train, y_train, df_X_test, y_test, model)
scores

Confusion Matrix: [[419  83]
 [ 65 433]]


[0.8474,
 0.8476592015876477,
 0.8474,
 0.8474519592313478,
 0.8474,
 0.8473831133369079,
 0.8474]

In [13]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

         0.0       0.87      0.83      0.85       502
         1.0       0.84      0.87      0.85       498

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



In [129]:
ood = pd.read_excel("ood.xlsx")
ood = ood.drop(['Unnamed: 0'],1)
ood

Unnamed: 0,if_apple,score,content,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,crush,data,system upgrad,connection
0,apple,5,I love this game.It is so much fun to get Disn...,1,0,0,0,0,0,0,0,0,0,0,0,0
1,apple,5,It incredible,0,0,0,0,0,0,0,0,0,0,0,0,0
2,apple,5,I enjoy almost anything Disney and getting to ...,1,0,0,-1,0,0,0,0,0,0,0,0,0
3,apple,5,I play this every single day ♥️,0,0,0,0,0,0,0,0,0,0,0,0,0
4,apple,5,Love that you can earn emojis to use in your t...,1,0,0,0,0,0,0,0,0,0,0,0,0
5,apple,3,I love this game but I wish you would stop req...,1,0,0,0,0,0,0,0,0,0,0,0,0
6,google,1,"I've noticed how cheap, not inexpensive, this ...",1,0,0,0,0,0,0,0,0,0,0,0,0
7,google,3,fun but greatly flawed theres two main issues ...,1,0,0,0,0,0,0,0,0,0,0,0,0
8,google,1,"Was a great game, not so much anymore. When ga...",1,0,0,0,0,0,0,0,0,0,0,0,0
9,google,1,I don't know what has changed but it freezes o...,1,0,0,0,0,0,0,0,0,0,0,0,0


In [133]:
apple = []
for i in list(ood['if_apple']):
    if i == "apple":
        apple.append(1)
    else:
        apple.append(0)

In [135]:
ood['if_apple'] = apple

In [144]:
ood_vectors_tfidf = tfidf_vectorizer.transform(ood['content'])

In [145]:
df_ood = pd.DataFrame(ood_vectors_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names())
df_ood

Unnamed: 0,aa,absurd,accept,access,accident,accomplish,account,across,act,action,...,year,yesterday,yesteryear,yet,yo,youd,younger,your,yr,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.496597,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
df_ood = df_ood.join(ood[['if_useful','if_apple']])

In [147]:
clf = train_models(df_X_train, y_train, "DecisionTree")
clf.predict(df_ood)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.])

In [148]:
ood['other-tech'] = clf.predict(df_ood)

In [149]:
ood.to_excel("ood.xlsx")

Unnamed: 0,if_apple,score,content,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,crush,data,system upgrad,connection,other-tech
0,1,5,I love this game.It is so much fun to get Disn...,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,1,5,It incredible,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2,1,5,I enjoy almost anything Disney and getting to ...,1,0,0,-1,0,0,0,0,0,0,0,0,0,0.0
3,1,5,I play this every single day ♥️,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,1,5,Love that you can earn emojis to use in your t...,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
5,1,3,I love this game but I wish you would stop req...,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
6,0,1,"I've noticed how cheap, not inexpensive, this ...",1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
7,0,3,fun but greatly flawed theres two main issues ...,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
8,0,1,"Was a great game, not so much anymore. When ga...",1,0,0,0,0,0,0,0,0,0,0,0,0,0.0
9,0,1,I don't know what has changed but it freezes o...,1,0,0,0,0,0,0,0,0,0,0,0,0,-1.0
