In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from cleantext import clean

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
path = "game_reviews/"
data = pd.read_csv(path + "labelled_all.csv")
data = data.drop('Unnamed: 0',1)
data['content'] = data['content'].astype(str)

In [3]:
data

Unnamed: 0,userName,date,content,score,if_apple,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,customer service,crush,data,system upgrad,connection,other-tech
0,Cre8tiv99,2019-03-11 17:34:05,hey guy love game yet one minor got high score...,3,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,eclaitse25,2022-04-05 22:28:14,dont want bore long review hard get new,4,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jackieee003,2020-11-19 03:12:21,game love set,5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tab the great,2021-11-13 03:55:52,game great age fun play,5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Roberts232,2018-03-08 06:26:19,special earn play,3,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,A Google user,2018-04-12 06:40:42,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,A Google user,2019-01-04 13:55:12,play blitz complaint option use gem power want,3,0,1.0,0.0,0.0,-1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,A Google user,2017-09-20 22:58:51,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,Chandresh Panchal,2020-10-30 02:53:09,chat,5,0,1.0,0.0,0.0,0.0,0.0,0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
STOPWORDS = set(stopwords.words('english'))
en_words = set(nltk.corpus.words.words())

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    
    # Remove the XXXX values
    text = text.replace('x', '') 
    
    # Remove white space
    text = REMOVE_NUM.sub('', text)

    #  delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text) 

    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    # Stemming the words
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    # removing non-English words
    text = ' '.join(word for word in text.split() if word in en_words)
    
    return text

In [5]:
data['content'] = data['content'].apply(clean_text)
data

Unnamed: 0,userName,date,content,score,if_apple,if_useful,mechanism,ad,money,UI/UX,event,keyboard,IP,time/life,customer service,crush,data,system upgrad,connection,other-tech
0,Cre8tiv99,2019-03-11 17:34:05,hey guy love game yet one minor got high score...,3,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,eclaitse25,2022-04-05 22:28:14,dont want bore long review hard get new,4,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jackieee003,2020-11-19 03:12:21,game love set,5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tab the great,2021-11-13 03:55:52,game great age fun play,5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Roberts232,2018-03-08 06:26:19,special earn play,3,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,A Google user,2018-04-12 06:40:42,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,A Google user,2019-01-04 13:55:12,play blitz complaint option use gem power want,3,0,1.0,0.0,0.0,-1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,A Google user,2017-09-20 22:58:51,love game,5,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,Chandresh Panchal,2020-10-30 02:53:09,chat,5,0,1.0,0.0,0.0,0.0,0.0,0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data["content"],data["if_useful"],test_size=0.2,shuffle=True)

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 

X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [7]:
lr_tfidf=LogisticRegression()

####
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  

LogisticRegression()

In [8]:
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

         0.0       0.81      0.84      0.83       481
         1.0       0.85      0.82      0.83       519

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Confusion Matrix: [[405  76]
 [ 93 426]]
AUC: 0.9039232651949416
