In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import pandas as pd
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv('https://rawgit.com/Qaqi/cs210project/master/ign_new.csv',sep=';',index_col=['gameid'])

In [3]:
data.count()

score_phrase         5265
score_phrase_enum    5265
title                5265
score                5265
genre                5265
genre_enum           5265
editors_choice       5265
release_year         5265
release_month        5265
dtype: int64

In [4]:
def preprocess_ign_df(df):
    processed_df = df.copy()
    le = preprocessing.LabelEncoder()
    processed_df = processed_df.drop(['score_phrase','title','genre'],axis=1)
    return processed_df

In [5]:
processed_df = preprocess_ign_df(data)

In [6]:
X = processed_df.drop(['editors_choice'], axis=1).values
y= processed_df['editors_choice'].values

In [7]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.5)

In [8]:
clf_dt = tree.DecisionTreeClassifier(max_depth=10)

In [9]:
clf_dt.fit (X_train, y_train)
clf_dt.score (X_test, y_test)

0.94986707178123808

In [10]:
shuffle_validator = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2, random_state=0)
def test_classifier(clf):
    scores = cross_validation.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [11]:
test_classifier(clf_dt)

Accuracy: 0.9511 (+/- 0.01)


In [12]:
clf_rf = ske.RandomForestClassifier(n_estimators=50)
test_classifier(clf_rf)

Accuracy: 0.9597 (+/- 0.01)


In [13]:
clf_gb = ske.GradientBoostingClassifier(n_estimators=50)
test_classifier(clf_gb)

Accuracy: 0.9616 (+/- 0.01)


In [14]:
eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])
test_classifier(eclf)

Accuracy: 0.9608 (+/- 0.01)


In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn_alg = ske.BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
test_classifier(knn_alg)

Accuracy: 0.9164 (+/- 0.03)


In [16]:
#Logaritmic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
acc_log

93.920000000000002

In [17]:
coeff_df = pd.DataFrame(processed_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,genre_enum,2.892402
0,score,1.617208
2,editors_choice,0.002568
3,release_year,-0.018571
4,release_month,-0.05314


In [18]:
#Support Vector Machine
svc = SVC()
svc.fit(X_train, y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, y_train) * 100, 2)
acc_svc

95.930000000000007

In [19]:
#K-NN
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
acc_knn

93.620000000000005

In [31]:
#Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
acc_gaussian

90.349999999999994

In [21]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, y_train) * 100, 2)
acc_linear_svc

78.530000000000001

In [23]:
#Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree

99.730000000000004

In [32]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
               'Decision Tree','Naive Bayes'],
    'Accuracy': [acc_svc, acc_knn, acc_log, 
              acc_decision_tree, acc_gaussian]})
models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Model
3,99.73,Decision Tree
0,95.93,Support Vector Machines
2,93.92,Logistic Regression
1,93.62,KNN
4,90.35,Naive Bayes
