In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.svm import SVC

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
df = pd.read_csv('../Datasets/train/steam_ds.csv')

In [4]:
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",Positive
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",Positive
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",Positive
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,Positive


In [5]:
df['user_review'] = df['user_review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,i'm scared and hearing creepy voices. so i'll ...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,"best game, more better than sam pepper's youtu...",Positive
2,3,Spooky's Jump Scare Mansion,2016.0,"a littly iffy on the controls, but once you kn...",Positive
3,4,Spooky's Jump Scare Mansion,2015.0,"great game, fun and colorful and all that.a si...",Positive
4,5,Spooky's Jump Scare Mansion,2015.0,not many games have the cute tag right next to...,Positive


In [6]:
df['user_review'] = df['user_review'].str.replace('[^\w\s]','')
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,im scared and hearing creepy voices so ill pau...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,best game more better than sam peppers youtube...,Positive
2,3,Spooky's Jump Scare Mansion,2016.0,a littly iffy on the controls but once you kno...,Positive
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun and colorful and all thata side...,Positive
4,5,Spooky's Jump Scare Mansion,2015.0,not many games have the cute tag right next to...,Positive


In [7]:
stop = stopwords.words('english')
df['user_review'] = df['user_review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,im scared hearing creepy voices ill pause mome...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,best game better sam peppers youtube account 1...,Positive
2,3,Spooky's Jump Scare Mansion,2016.0,littly iffy controls know play easy master ive...,Positive
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun colorful thata side note though...,Positive
4,5,Spooky's Jump Scare Mansion,2015.0,many games cute tag right next horror tag stea...,Positive


In [8]:
labels = df[['user_suggestion']]
text = df[['user_review']]

In [9]:
#X_train, x_test, Y_train,Y_test  = train_test_split(text, labels, test_size = 0.15, random_state=42)

In [10]:
def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])
df['user_review']= df['user_review'].apply(space)
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,-PRON- be scare hear creepy voice ill pause mo...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,good game well sam peppers youtube account 101...,Positive
2,3,Spooky's Jump Scare Mansion,2016.0,littly iffy control know play easy master -PRO...,Positive
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun colorful thata side note though...,Positive
4,5,Spooky's Jump Scare Mansion,2015.0,many game cute tag right next horror tag steam...,Positive


In [12]:
from sklearn import preprocessing

In [13]:
le = preprocessing.LabelEncoder()
df['user_suggestion'] = le.fit_transform(df['user_suggestion'])
df.head(100)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,-PRON- be scare hear creepy voice ill pause mo...,1
1,2,Spooky's Jump Scare Mansion,2016.0,good game well sam peppers youtube account 101...,1
2,3,Spooky's Jump Scare Mansion,2016.0,littly iffy control know play easy master -PRO...,1
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun colorful thata side note though...,1
4,5,Spooky's Jump Scare Mansion,2015.0,many game cute tag right next horror tag steam...,1
...,...,...,...,...,...
95,96,Spooky's Jump Scare Mansion,2015.0,early access reviewi love game -PRON- be curre...,1
96,97,Spooky's Jump Scare Mansion,2015.0,wander 10 room without see anything get startl...,1
97,98,Spooky's Jump Scare Mansion,2015.0,early access reviewthe game start cute jump sc...,1
98,99,Spooky's Jump Scare Mansion,2018.0,pretty obviously part seeminglynormalthencreep...,1


In [15]:
from nltk.tokenize import RegexpTokenizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(df['user_review'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts,df['user_suggestion'] ,test_size=0.25, random_state=5)

In [19]:
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(X_train, Y_train)
#gnb.fit(data_train, target_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [21]:
from sklearn import metrics

In [22]:
predicted = svclassifier.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

56.74%


In [25]:
recall = recall_score( Y_test, predicted, average='weighted')
print('Recall: %.3f' % recall)

Recall: 0.567
