In [1]:
import pandas as pd
import numpy as np
import random

# set seed
val = 52
random.seed(val)
np.random.seed(val)

# 1. Load Data

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 2. Data Cleaning

In [3]:
train["rank"] = train["audio-rank"] + train["humor-rank"] + train["innovation-rank"] + train["theme-rank"] + train["graphics-rank"] + train["mood-rank"] + train["fun-rank"]
test["rank"] = test["audio-rank"] + test["humor-rank"] + test["innovation-rank"] + test["theme-rank"] + test["graphics-rank"] + test["mood-rank"] + test["fun-rank"]

test_id = test["id"]

features_drop = ["id", "name", "slug", "path", "description", "links", "link-tags", 
                "version", "num-comments", "competition-num", "published", "modified", 
                "ratings-given","ratings-received", "num-authors", "prev-games", "feedback-karma", 
                "audio-rank", "humor-rank", "innovation-rank", "theme-rank", 
                "graphics-rank", "mood-rank", "fun-rank"]
train = train.drop(features_drop, axis = 1)
test = test.drop(features_drop, axis = 1)

train.replace(-1, 0, inplace=True)
test.replace(-1, 0, inplace=True)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21948 entries, 0 to 21947
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   category            21948 non-null  object 
 1   fun-average         21948 non-null  float64
 2   innovation-average  21948 non-null  float64
 3   theme-average       21948 non-null  float64
 4   graphics-average    21948 non-null  float64
 5   audio-average       21948 non-null  float64
 6   humor-average       21948 non-null  float64
 7   mood-average        21948 non-null  float64
 8   label               21948 non-null  int64  
 9   rank                21948 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 1.7+ MB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4959 entries, 0 to 4958
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   category            4959 non-null   object 
 1   fun-average         4959 non-null   float64
 2   innovation-average  4959 non-null   float64
 3   theme-average       4959 non-null   float64
 4   graphics-average    4959 non-null   float64
 5   audio-average       4959 non-null   float64
 6   humor-average       4959 non-null   float64
 7   mood-average        4959 non-null   float64
 8   rank                4959 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 348.8+ KB


In [6]:
category_mapping = {"jam":0, "compo": 1}
train_test_data = [train, test] #combining train and test dataset

for dataset in train_test_data:
    dataset["category"] = dataset["category"].map(category_mapping)

In [7]:
train.head()

Unnamed: 0,category,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,label,rank
0,0,3.84,3.28,3.72,3.68,0.0,4.0,3.609,4,1183
1,1,3.519,3.815,4.037,3.815,3.52,3.708,3.692,4,628
2,0,3.565,3.696,2.913,3.087,3.952,2.19,3.429,4,2306
3,0,3.55,2.7,3.1,4.0,3.6,2.4,3.4,4,2402
4,1,3.436,4.077,3.154,2.179,2.108,1.8,2.417,3,2373


In [8]:
test.head()

Unnamed: 0,category,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,rank
0,1,4.0,4.0,4.333,3.833,0.0,3.0,4.0,-7
1,0,2.577,2.654,3.577,3.577,3.654,3.042,3.308,-7
2,0,3.716,3.77,4.176,4.378,3.595,3.824,3.75,2636
3,0,3.25,3.0,3.25,2.75,0.0,3.0,3.0,-7
4,1,3.816,3.105,3.632,3.566,3.921,2.456,3.292,2987


In [9]:
target = train["label"]
train_data = train.drop("label", axis = 1)
train_data.shape, target.shape

((21948, 9), (21948,))

## 3. Modelling

In [10]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [11]:
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 0)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC

In [13]:
# knn
clf = KNeighborsClassifier()#(n_neighbors = 13)
scoring = "accuracy"
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs = 1, scoring=scoring)
round(np.mean(score)*100, 4)

85.2288

In [14]:
# Decision Tree
clf = DecisionTreeClassifier()
scoring = "accuracy"
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs = 1, scoring=scoring)
round(np.mean(score)*100, 4)

91.5391

In [15]:
# Random Forest
clf = RandomForestClassifier()#(n_estimators = 13)
scoring = "accuracy"
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs = 1, scoring=scoring)
round(np.mean(score)*100, 4)

94.168

In [16]:
# naive Bayes
clf = GaussianNB()
scoring = "accuracy"
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs = 1, scoring=scoring)
round(np.mean(score)*100, 4)

85.3745

## 4. Predicting

In [17]:
clf = RandomForestClassifier()#(n_estimators = 13)
clf.fit(train_data, target)

# test_data = test.drop("id", axis = 1).copy()
# prediction = clf.predict(test_data)
prediction = clf.predict(test)

In [18]:
submission = pd.DataFrame({
    "id":test_id, 
    "label":prediction
})

In [19]:
submission.to_csv("submission_random_forest.csv", index = False)
submission.head(20)

Unnamed: 0,id,label
0,187282,0
1,191602,0
2,180566,4
3,182197,0
4,189638,4
5,199600,3
6,182223,3
7,185912,3
8,203413,3
9,185184,0
