In [1]:
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import random
import csv

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# set seed
val = 52
random.seed(val)
np.random.seed(val)
tf.set_random_seed(val)

## 1. Load Data

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train.head()

Unnamed: 0,id,name,slug,path,competition-num,category,description,num-comments,published,modified,...,humor-average,mood-average,fun-rank,innovation-rank,theme-rank,graphics-rank,audio-rank,humor-rank,mood-rank,label
0,15185,I Just Wanted Groceries,i-just-wanted-groceries,/events/ludum-dare/38/i-just-wanted-groceries,38,jam,### *[Play the game!](https://excaliburjs.com/...,17,2017-04-25T01:01:15Z,2017-05-01T06:35:58Z,...,4.0,3.609,88,298,230,328,-1,38,202,4
1,15233,STACK,stack,/events/ludum-dare/38/stack,38,compo,![STACK_Cover.PNG](///raw/747/3/z/1b7a.png)<br...,33,2017-04-23T21:02:47Z,2017-04-30T22:40:41Z,...,3.708,3.692,147,61,55,136,98,48,83,4
2,15238,Conquer Earth,conquer-earth-1,/events/ludum-dare/38/conquer-earth-1,38,jam,Link: http://www.edeb8.com/LD38/index.php<br><...,50,2017-04-25T01:02:38Z,2017-04-30T04:42:44Z,...,2.19,3.429,175,130,598,528,66,530,279,4
3,15268,Attack of the Planetary-Vampires,attack-of-the-planetary-vampires,/events/ludum-dare/38/attack-of-the-planetary-...,38,jam,Game with @sawtan jam team **Bald'n'Hairy**!<b...,20,2017-04-24T19:43:02Z,2017-05-01T19:57:18Z,...,2.4,3.4,180,600,536,190,159,447,290,4
4,15271,It's square to be a cube,its-square-to-be-a-cube,/events/ludum-dare/38/its-square-to-be-a-cube,38,compo,"My first entry to Ludum dare, got stuck on som...",39,2017-04-24T01:08:09Z,2017-05-03T08:02:44Z,...,1.8,2.417,171,31,356,518,387,433,477,3


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21948 entries, 0 to 21947
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  21948 non-null  int64  
 1   name                21947 non-null  object 
 2   slug                21947 non-null  object 
 3   path                21948 non-null  object 
 4   competition-num     21948 non-null  int64  
 5   category            21948 non-null  object 
 6   description         21827 non-null  object 
 7   num-comments        21948 non-null  int64  
 8   published           21948 non-null  object 
 9   modified            21948 non-null  object 
 10  version             21948 non-null  int64  
 11  feedback-karma      21948 non-null  int64  
 12  ratings-given       21948 non-null  float64
 13  ratings-received    21948 non-null  float64
 14  links               18507 non-null  object 
 15  link-tags           18354 non-null  object 
 16  num-

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4959 entries, 0 to 4958
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  4959 non-null   int64  
 1   name                4959 non-null   object 
 2   slug                4959 non-null   object 
 3   path                4959 non-null   object 
 4   competition-num     4959 non-null   int64  
 5   category            4959 non-null   object 
 6   description         4922 non-null   object 
 7   num-comments        4959 non-null   int64  
 8   published           4959 non-null   object 
 9   modified            4959 non-null   object 
 10  version             4959 non-null   int64  
 11  feedback-karma      4959 non-null   int64  
 12  ratings-given       4959 non-null   float64
 13  ratings-received    4959 non-null   float64
 14  links               4894 non-null   object 
 15  link-tags           4890 non-null   object 
 16  num-au

## 2. Data Preprocessing

In [7]:
train["rank"] = train["audio-rank"] + train["humor-rank"] + train["innovation-rank"] + train["theme-rank"] + train["graphics-rank"] + train["mood-rank"] + train["fun-rank"]
train = train.drop(["id", "name", "slug", "path", "description", "links", "link-tags", 
                    "version", "num-comments", "competition-num", "published", "modified", 
                    "ratings-given", "num-authors", "prev-games", "feedback-karma", 
                    "audio-rank", "humor-rank", "innovation-rank", "theme-rank", 
                    "graphics-rank", "mood-rank", "fun-rank"], axis=1)
train.replace(-1, 0, inplace=True)

In [8]:
test = pd.read_csv('test.csv')
test_id = test["id"]
test["rank"] = test["audio-rank"] + test["humor-rank"] + test["innovation-rank"] + test["theme-rank"] + test["graphics-rank"] + test["mood-rank"] + test["fun-rank"]
test = test.drop(["id", "name", "slug", "path", "description", "links", "link-tags", 
                    "version", "num-comments", "competition-num", "published", "modified", 
                    "ratings-given", "num-authors", "prev-games", "feedback-karma", 
                    "audio-rank", "humor-rank", "innovation-rank", "theme-rank", 
                    "graphics-rank", "mood-rank", "fun-rank"], axis=1)
test.replace(-1, 0, inplace=True)

In [9]:
train.head()

Unnamed: 0,category,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,label,rank
0,jam,26.571429,3.84,3.28,3.72,3.68,0.0,4.0,3.609,4,1183
1,compo,28.25,3.519,3.815,4.037,3.815,3.52,3.708,3.692,4,628
2,jam,24.25,3.565,3.696,2.913,3.087,3.952,2.19,3.429,4,2306
3,jam,22.0,3.55,2.7,3.1,4.0,3.6,2.4,3.4,4,2402
4,compo,39.875,3.436,4.077,3.154,2.179,2.108,1.8,2.417,3,2373


In [10]:
test.head()

Unnamed: 0,category,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,rank
0,compo,5.0,4.0,4.0,4.333,3.833,0.0,3.0,4.0,-7
1,jam,14.875,2.577,2.654,3.577,3.577,3.654,3.042,3.308,-7
2,jam,38.875,3.716,3.77,4.176,4.378,3.595,3.824,3.75,2636
3,jam,3.625,3.25,3.0,3.25,2.75,0.0,3.0,3.0,-7
4,compo,39.25,3.816,3.105,3.632,3.566,3.921,2.456,3.292,2987


### 2.1 One-hot encoding

One-hot encoding creates a _"dummy"_ variable for each possible category of each non-numeric feature.

In [11]:
# make dummy variables for train
train_dummies = pd.get_dummies(train["category"], prefix = "category")#, drop_first=True)
# join dummy variables with original dataset
one_hot_train = pd.concat([train, train_dummies], axis=1)
# Drop the category column
one_hot_train = one_hot_train.drop("category", axis=1)

In [12]:
one_hot_train.head()

Unnamed: 0,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,label,rank,category_compo,category_jam
0,26.571429,3.84,3.28,3.72,3.68,0.0,4.0,3.609,4,1183,0,1
1,28.25,3.519,3.815,4.037,3.815,3.52,3.708,3.692,4,628,1,0
2,24.25,3.565,3.696,2.913,3.087,3.952,2.19,3.429,4,2306,0,1
3,22.0,3.55,2.7,3.1,4.0,3.6,2.4,3.4,4,2402,0,1
4,39.875,3.436,4.077,3.154,2.179,2.108,1.8,2.417,3,2373,1,0


In [13]:
# make dummy variables for train
test_dummies = pd.get_dummies(test["category"], prefix = "category")#, drop_first=True)
# join dummy variables with original dataset
one_hot_test = pd.concat([test, test_dummies], axis=1)
# Drop the category column
one_hot_test = one_hot_test.drop("category", axis=1)

In [14]:
one_hot_test.head()

Unnamed: 0,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,rank,category_compo,category_jam
0,5.0,4.0,4.0,4.333,3.833,0.0,3.0,4.0,-7,1,0
1,14.875,2.577,2.654,3.577,3.577,3.654,3.042,3.308,-7,0,1
2,38.875,3.716,3.77,4.176,4.378,3.595,3.824,3.75,2636,0,1
3,3.625,3.25,3.0,3.25,2.75,0.0,3.0,3.0,-7,0,1
4,39.25,3.816,3.105,3.632,3.566,3.921,2.456,3.292,2987,1,0


### 2.2 Spliting training dataset into features and labels 

In [15]:
features = one_hot_train.drop("label", axis=1)
label = one_hot_train["label"]

In [16]:
features, label

(       ratings-received  fun-average  innovation-average  theme-average  \
 0             26.571429        3.840               3.280          3.720   
 1             28.250000        3.519               3.815          4.037   
 2             24.250000        3.565               3.696          2.913   
 3             22.000000        3.550               2.700          3.100   
 4             39.875000        3.436               4.077          3.154   
 ...                 ...          ...                 ...            ...   
 21943          1.000000        0.000               0.000          0.000   
 21944        108.000000        2.888               3.127          2.833   
 21945         20.125000        3.861               2.882          3.472   
 21946         23.750000        3.227               3.273          2.286   
 21947         37.571429        3.076               3.621          3.618   
 
        graphics-average  audio-average  humor-average  mood-average  rank  \
 0      

### 2.3 Data scaling

In [17]:
# Initialize the Scaler 
scaler = MinMaxScaler().fit(features) 
features_scaled = scaler.transform(features)
features = pd.DataFrame(features_scaled, columns = list(features.columns.values)) 
features.head(10)

Unnamed: 0,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,rank,category_compo,category_jam
0,0.045012,0.768,0.656,0.744,0.736,0.0,0.8,0.7218,0.156661,0.0,1.0
1,0.047953,0.7038,0.763,0.8074,0.763,0.704,0.7416,0.7384,0.083597,1.0,0.0
2,0.040946,0.713,0.7392,0.5826,0.6174,0.7904,0.438,0.6858,0.304502,0.0,1.0
3,0.037005,0.71,0.54,0.62,0.8,0.72,0.48,0.68,0.317141,0.0,1.0
4,0.068316,0.6872,0.8154,0.6308,0.4358,0.4216,0.36,0.4834,0.313323,1.0,0.0
5,0.05277,0.6666,0.8934,0.7586,0.7934,0.5778,0.6358,0.7858,0.111638,1.0,0.0
6,0.038318,0.4858,0.3714,0.581,0.4476,0.3904,0.42,0.4,0.602817,0.0,1.0
7,0.06328,0.65,0.573,0.6388,0.627,0.6484,0.4866,0.6686,0.248025,1.0,0.0
8,0.090431,0.545,0.46,0.6628,0.4654,0.5266,0.676,0.516,0.487362,0.0,1.0
9,0.061966,0.6172,0.6914,0.6572,0.6,0.5516,0.542,0.653,0.242891,1.0,0.0


In [18]:
# Initialize the Scaler 
scaler = MinMaxScaler().fit(one_hot_test) 
test_scaled = scaler.transform(one_hot_test)
one_hot_test = pd.DataFrame(test_scaled, columns = list(one_hot_test.columns.values)) 
one_hot_test.head(10)

Unnamed: 0,ratings-received,fun-average,innovation-average,theme-average,graphics-average,audio-average,humor-average,mood-average,rank,category_compo,category_jam
0,0.00214,0.8,0.8,0.8666,0.7666,0.0,0.6,0.8,0.0,1.0,0.0
1,0.011534,0.5154,0.5308,0.7154,0.7154,0.7308,0.6084,0.6616,0.0,0.0,1.0
2,0.034364,0.7432,0.754,0.8352,0.8756,0.719,0.7648,0.75,0.167448,0.0,1.0
3,0.000832,0.65,0.6,0.65,0.55,0.0,0.6,0.6,0.0,0.0,1.0
4,0.034721,0.7632,0.621,0.7264,0.7132,0.7842,0.4912,0.6584,0.189686,1.0,0.0
5,0.035315,0.6324,0.7794,0.7538,0.5052,0.65,0.4834,0.7102,0.541118,0.0,1.0
6,0.017598,0.59,0.58,0.77,0.625,0.655,0.4534,0.7,0.650532,0.0,1.0
7,0.01736,0.6736,0.6736,0.6,0.721,0.0,0.7158,0.6578,0.475482,0.0,1.0
8,0.021981,0.6292,0.5042,0.6876,0.6416,0.0,0.613,0.5876,0.652369,0.0,1.0
9,0.004875,0.56,0.6,0.7166,0.8166,0.5666,0.6,0.7286,0.0,0.0,1.0


### 2.4 Converting data into vectors

In [19]:
final_features = np.array(features)
final_label = np.array(keras.utils.to_categorical(label, 6)) # we have 6 classes to predict so then our vector will have 6 columns and each column represent each class

In [20]:
final_features_test = np.array(one_hot_test)

In [21]:
final_features.shape, final_features_test.shape

((21948, 11), (4959, 11))

## 3. Building Neural Network Model

In [22]:
# Building the model
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(features.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(512, activation="tanh"))
model.add(Dropout(0.1))
model.add(Dense(6, activation='softmax'))

# Compiling the model
model.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              12288     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 3078      
Total params: 540,166
Trainable params: 540,166
Non-trainable params: 0
_________________________________________________________________


## 4. Training Model with Cross Validation

In [23]:
# Training the model with validation set
from keras.callbacks import ModelCheckpoint

# train the model
checkpointer = ModelCheckpoint(filepath='nn_model_path', verbose=1, save_best_only=True)

hist = model.fit(final_features, final_label, batch_size=100, epochs=20, validation_split=0.2, callbacks=[checkpointer], verbose=1)#, shuffle=True)

Train on 17558 samples, validate on 4390 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.18688, saving model to nn_model_path
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.18688
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.18688
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.18688
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.18688
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.18688
Epoch 7/20

Epoch 00007: val_loss improved from 0.18688 to 0.15461, saving model to nn_model_path
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.15461
Epoch 9/20

Epoch 00009: val_loss improved from 0.15461 to 0.14831, saving model to nn_model_path
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.14831
Epoch 11/20

Epoch 00011: val_loss improved from 0.14831 to 0.13987, saving model to nn_model_path
Epoch 12/20

Epoch 00012: val_loss did not improve from 0.13987
Epoch 13/20

Epoch 00013: val_loss did not improve from 

## 5. Predicting Test Dataset

In [24]:
result = model.predict_classes(final_features_test, verbose=1)



In [25]:
with open('submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["id", "label"])
    for label_id, label_data in zip(test_id, result):
        writer.writerow([label_id, label_data])