In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import *
import re
from nltk.tokenize import TweetTokenizer
import string

In [2]:
# Load the raw review export; the header row is inferred from the file.
df = pd.read_csv(
    'threads_reviews.csv',
    header='infer',
)

In [3]:
# Overview of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32910 entries, 0 to 32909
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   source              32910 non-null  object
 1   review_description  32910 non-null  object
 2   rating              32910 non-null  int64 
 3   review_date         32910 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB


In [4]:
# Discard any row that has a missing value in at least one column.
df = df.dropna()

In [5]:
# How many reviews came from each store.
df['source'].value_counts()

Google Play    30270
App Store       2640
Name: source, dtype: int64

In [6]:
# Keep only the reviews rated 1, 2, 4 or 5 (every other rating value is left
# out) and only the two columns needed for modelling.
# A single .loc selection followed by an explicit .copy() makes `data` an
# independent frame, so later `data['rating'] = ...` assignments do not operate
# on a slice of `df` (which can trigger pandas' chained-assignment warning).
data = df.loc[
    df['rating'].isin([1, 2, 4, 5]),
    ['review_description', 'rating'],
].copy()

In [7]:
# Renumber the remaining rows 0..n-1 and throw the old index away.
data = data.reset_index(drop=True)

In [8]:
# Ratings 1 and 2 both become class 0.
data['rating'] = data['rating'].replace({1: 0, 2: 0})

In [9]:
# Ratings 4 and 5 both become class 1.
data['rating'] = data['rating'].replace({4: 1, 5: 1})

In [10]:
# Class balance after the rating -> 0/1 mapping.
data['rating'].value_counts()

1    18803
0    11522
Name: rating, dtype: int64

In [11]:
# Peek at the text of the row labelled 0.
data.loc[0, 'review_description']

"Meh. Not the greatest experience on a Chromebook. Seems to be customized for phones only. Opens in a little screen that you can't expand or resize - for reasons that are a complete mystery to me. Judging from the fact that every other app I know of is resizeable, this seems like it was a conscious choice by the developers . Why you'd do something like this is beyond understanding and suggests a control freak approach. Not a great way to make a first impression."

In [12]:
# Lazily-built NLTK helpers shared by every process_review() call.
_REVIEW_NLP_CACHE = {}


def _review_nlp_tools():
    """Return the cached stemmer / stopword set / tokenizer, building them on first use."""
    if not _REVIEW_NLP_CACHE:
        # Build everything before touching the cache so a failure (e.g. missing
        # stopword corpus) cannot leave it half-populated.
        tools = {
            'stemmer': PorterStemmer(),
            # frozenset: constant-time membership instead of scanning a list.
            'stopwords': frozenset(stopwords.words('english')),
            'tokenizer': TweetTokenizer(preserve_case=False, strip_handles=True,
                                        reduce_len=True),
        }
        _REVIEW_NLP_CACHE.update(tools)
    return _REVIEW_NLP_CACHE


def process_review(rvw):
    """Clean, tokenize and stem a single review.

    The text is stripped of ``$word`` tokens, a leading ``RT`` marker,
    hyperlinks and ``#`` signs, then tokenized with ``TweetTokenizer``
    (lower-cased, @handles removed, long character runs shortened). English
    stopwords and punctuation tokens are discarded and the remaining tokens
    are Porter-stemmed.

    Args:
        rvw: The review text (str).

    Returns:
        list of str: the stemmed tokens, in their original order.
    """
    tools = _review_nlp_tools()
    stemmer = tools['stemmer']
    stopwords_english = tools['stopwords']
    tokenizer = tools['tokenizer']

    # Stock-ticker style tokens such as "$GE".
    rvw = re.sub(r'\$\w*', '', rvw)
    # Old-style retweet marker at the very start of the text.
    rvw = re.sub(r'^RT[\s]+', '', rvw)
    # Remove the link itself only. The previous pattern (`.*` after the scheme)
    # also deleted every word that followed a URL on the same line.
    rvw = re.sub(r'https?://\S*', '', rvw)
    # Keep hashtag words, drop just the '#' sign.
    rvw = re.sub(r'#', '', rvw)

    rvw_tokens = tokenizer.tokenize(rvw)

    # NOTE: `in string.punctuation` is a substring test, so single punctuation
    # characters are dropped while tokens that are not a contiguous piece of
    # that string (for example ":)") are kept. This matches the original filter.
    return [
        stemmer.stem(word)
        for word in rvw_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [13]:
def build_freqs(rvw, ys, process_review=None):
    """Count how often each processed token occurs under each label.

    Args:
        rvw: Iterable of review texts.
        ys: Labels aligned with ``rvw`` (list, array of shape (m,) or (m, 1),
            or pandas Series).
        process_review: Optional callable turning one review into a list of
            tokens. When omitted, the notebook-level ``process_review`` is
            looked up at call time.

    Returns:
        dict mapping ``(token, label)`` to the number of occurrences.
    """
    if process_review is None:
        # The parameter shadows the notebook-level function of the same name,
        # so fetch that function from the module globals.
        process_review = globals()['process_review']

    # Flatten the labels to a plain 1-D list. ravel() (unlike squeeze()) keeps
    # a single-label input as a one-element list rather than a bare scalar,
    # which zip() could not iterate over.
    yslist = np.asarray(ys).ravel().tolist()

    freqs = {}
    for y, rw in zip(yslist, rvw):
        for word in process_review(rw):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [14]:
# Token/label frequency table over the review texts and their 0/1 labels.
# NOTE(review): this is computed from every row of `data`, i.e. before the
# train/test split performed further down, so the features built from it carry
# label information from rows that later land in the test set.
freqs = build_freqs(data['review_description'], data['rating'])

In [15]:
# Number of distinct (token, label) pairs counted by build_freqs.
len(freqs)

20103

In [16]:
def extract_features(rvw, freqs, process_review=None):
    """Turn one review into a 1x3 feature row.

    Args:
        rvw: The review text.
        freqs: dict mapping ``(token, label)`` to a count, as produced by
            ``build_freqs``.
        process_review: Optional callable turning the review into a list of
            tokens. When omitted, the notebook-level ``process_review`` is
            resolved at call time, so re-running the cell that defines it is
            picked up without re-running this one.

    Returns:
        numpy array of shape (1, 3): ``[1, positive_sum, negative_sum]`` where
        the first entry is a constant bias term and the sums add up the
        label-1 / label-0 counts of every token in the review (repeated
        tokens count each time).
    """
    if process_review is None:
        # The parameter shadows the notebook-level function of the same name,
        # so fetch that function from the module globals.
        process_review = globals()['process_review']

    x = np.zeros((1, 3))
    x[0, 0] = 1  # bias term
    for word in process_review(rvw):
        # Integer labels hash identically to their float form, so these keys
        # match tables built from either 0/1 or 0.0/1.0 labels.
        x[0, 1] += freqs.get((word, 1), 0)
        x[0, 2] += freqs.get((word, 0), 0)
    return x

In [17]:
# One feature row (bias, positive count sum, negative count sum) per review.
X = np.zeros((len(data), 3))

# Iterate by position rather than looking rows up by label, so the result does
# not depend on `data` carrying a clean 0..n-1 index.
for row, review in enumerate(data['review_description']):
    X[row, :] = extract_features(review, freqs, process_review=process_review)

In [18]:
# Target vector: the 0/1 label column.
y = data['rating']

In [19]:
# Hold out 20% of the rows for evaluation, stratified on the label so both
# parts keep the same class ratio. A fixed random_state makes the same split
# come out on every run; without it the reported scores change per execution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Candidate classifiers, all with default hyper-parameters. The estimators that
# draw random numbers while fitting are given a fixed random_state so their
# reports are repeatable from run to run.
logReg = LogisticRegression()
gauNB = GaussianNB()
svc = SVC()
dectree = DecisionTreeClassifier(random_state=42)
ranfor = RandomForestClassifier(random_state=42)
adaBC = AdaBoostClassifier(random_state=42)
models = [logReg, gauNB, svc, dectree, ranfor, adaBC]

# Fit each model on the training part and report per-class metrics on the
# held-out part.
for amodel in models:
    amodel.fit(X_train, y_train)
    y_pred = amodel.predict(X_test)
    print(amodel)
    print(classification_report(y_test, y_pred))

LogisticRegression()
              precision    recall  f1-score   support

           0       0.76      0.71      0.74      2304
           1       0.83      0.86      0.85      3761

    accuracy                           0.81      6065
   macro avg       0.80      0.79      0.79      6065
weighted avg       0.81      0.81      0.81      6065

GaussianNB()
              precision    recall  f1-score   support

           0       0.67      0.20      0.31      2304
           1       0.66      0.94      0.77      3761

    accuracy                           0.66      6065
   macro avg       0.66      0.57      0.54      6065
weighted avg       0.66      0.66      0.60      6065

SVC()
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      2304
           1       0.77      0.90      0.83      3761

    accuracy                           0.78      6065
   macro avg       0.78      0.74      0.75      6065
weighted avg       0.78      0.78 

In [28]:
# Sanity check: classify a one-word review with the fitted logistic regression.
logReg.predict(
    extract_features("Great", freqs, process_review=process_review)
)

array([1])