In [43]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
df = pd.read_csv('../Datasets/train/steam_ds.csv')

In [5]:
df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,Positive
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",Positive
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",Positive
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",Positive
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,Positive


In [None]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Seperate data into train and test split

### Extract document and Target labels from the dataset

In [6]:
document = df['user_review']
sentiment = df['user_suggestion']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(document, sentiment, test_size=0.20, random_state=42)

In [8]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Training Classifier

### Random Forrest

### Normal Technique

In [30]:
rf_normal = RandomForestClassifier()
rf_normal.fit(X_train, y_train)
y_pred = rf_normal.predict(X_test)

['Negative' 'Negative' 'Positive' ... 'Positive' 'Negative' 'Negative']


In [40]:
acc = round(accuracy_score(y_test,y_pred), 3)

In [41]:
print(acc)

0.829


In [9]:
rf = RandomForestClassifier()
scores = cross_val_score(rf,X_train,y_train,cv=5)

In [10]:
print(scores)
scores.mean()

[0.81707753 0.80957485 0.80635941 0.81850661 0.82636656]


0.8155769917827795

In [11]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5,50,100],
    'max_depth': [2,10,20,None]
}

cv = GridSearchCV(rf,parameters)
cv.fit(X_train,y_train)

NameError: name 'print_results' is not defined

In [13]:
print_results(cv)

BEST PARAMS: {'max_depth': None, 'n_estimators': 100}

0.584 (+/-0.017) for {'max_depth': 2, 'n_estimators': 5}
0.568 (+/-0.002) for {'max_depth': 2, 'n_estimators': 50}
0.568 (+/-0.0) for {'max_depth': 2, 'n_estimators': 100}
0.636 (+/-0.015) for {'max_depth': 10, 'n_estimators': 5}
0.626 (+/-0.006) for {'max_depth': 10, 'n_estimators': 50}
0.62 (+/-0.006) for {'max_depth': 10, 'n_estimators': 100}
0.676 (+/-0.019) for {'max_depth': 20, 'n_estimators': 5}
0.707 (+/-0.011) for {'max_depth': 20, 'n_estimators': 50}
0.711 (+/-0.008) for {'max_depth': 20, 'n_estimators': 100}
0.716 (+/-0.018) for {'max_depth': None, 'n_estimators': 5}
0.811 (+/-0.01) for {'max_depth': None, 'n_estimators': 50}
0.819 (+/-0.015) for {'max_depth': None, 'n_estimators': 100}


In [14]:
rf2 = RandomForestClassifier(n_estimators=100,max_depth=None)
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(y_test,y_pred), 3)

In [17]:
accuracy

0.832

### Naive Bayes

### Normal Technique

In [21]:
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
predicted = MNB.predict(X_test)
accuracy_score = accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

79.82%


In [24]:
MNB_cv = MultinomialNB()
scores = cross_val_score(MNB_cv,X_train,y_train,cv=5)
print(scores)
scores.mean()

[0.78385138 0.78349411 0.78277956 0.78670954 0.78027867]


0.7834226509467667

### Logistic Regression

In [44]:
lr = LogisticRegression()
grid = {"C":np.logspace(-3,3,7), "penalty":["l1", "l2"]}
grid_lr = GridSearchCV(lr, grid, cv=10)
grid_lr.fit(X_train, y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver optio

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [45]:
grid_lr.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [46]:
lr = LogisticRegression(C =1.0, penalty='l2')

In [48]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
predicted = lr.predict(X_test)

In [50]:
print(accuracy_score(predicted, y_test))

0.8576736210345813
