In [0]:
!rm -r ./cleaned_kickstarted_dataset train_dataset.csv

# Kickstarter project
## Model validation

In [0]:
import numpy as np
import pandas as pd
import os

### Loading data

In [0]:
# Local directory name of the cloned dataset repository; later paths are built from it.
GIT_DIR = "cleaned_kickstarted_dataset"

In [4]:
# Fetch the repository that holds the zipped dataset and the model checkpoint used below.
!git clone https://github.com/Strongkong/cleaned_kickstarted_dataset

Cloning into 'cleaned_kickstarted_dataset'...
remote: Enumerating objects: 7, done.
remote: Counting objects: 100% (7/7), done.
remote: Compressing objects: 100% (6/6), done.
remote: Total 23 (delta 1), reused 6 (delta 1), pack-reused 16
Unpacking objects: 100% (23/23), done.


In [5]:
zip = os.path.join(GIT_DIR, 'train_dataset.csv.zip')

!unzip $zip

Archive:  cleaned_kickstarted_dataset/train_dataset.csv.zip
  inflating: train_dataset.csv       


### Import dataset, define inputs and outputs

In [0]:
# Load the final, all-numeric version of the dataset that is ready for the model.
# The first CSV column is used as the row index; the default comma separator applies.
df = pd.read_csv('train_dataset.csv', index_col=0)

In [0]:
# Shuffle into a new frame with a fixed seed: the held-out slice is then the same on
# every run, and re-running this cell does not reshuffle already-shuffled data.
# NOTE(review): this is only a genuine hold-out set if the training notebook produced the
# same row order before its own split — confirm, otherwise test rows may have been trained on.
SEED = 42
df_shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# We would like to predict whether a Kickstarter project will be successful.
# The `state_*` columns are the target, so they are removed from the inputs together with
# `backers` and `usd_pledged_real` (values we cannot know for a project in advance).
state_columns = df_shuffled.columns[df_shuffled.columns.str.startswith('state_')].tolist()
X = df_shuffled.drop(columns=state_columns + ['backers', 'usd_pledged_real'])

# The output is the final state of the Kickstarter project (the `state_*` columns).
Y = df_shuffled[state_columns]

# Fraction of rows held out for testing; the test set is the tail of the shuffled frame.
test_split = 0.1
t_index = int(len(X) * (1 - test_split))
X_test = X.iloc[t_index:]
Y_test = Y.iloc[t_index:]

### Loading the best checkpoint, result from the training section

In [7]:
from keras.models import load_model

# The checkpoint file ships inside the cloned repository directory.
checkpoint_path = os.path.join(GIT_DIR, '01_weights.hdf5')
model = load_model(checkpoint_path)

Using TensorFlow backend.


### Calculating mean squared error

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Run the network on the held-out inputs and score its outputs against the
# target columns with mean squared error.
preds = model.predict(X_test)
err = mean_squared_error(Y_test, preds)

print("Error on test data:", err)

Error on test data: 0.5668999865392381


In [13]:
# Peek at the first prediction row, then show the container types being compared.
first_row = preds[0]
print(first_row[:4])
for container in (preds, Y_test):
    print(type(container))

[1. 0.]
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [14]:
# Predictions and targets should have matching shapes; print both, one per line.
print(preds.shape, Y_test.shape, sep="\n")

(14858, 2)
(14858, 2)


### Creating DataFrame from the predictions

In [0]:
# Wrap the raw prediction array in a DataFrame; columns keep their default integer labels.
pred_df = pd.DataFrame(preds)

In [16]:
# Show the first rows of the predictions.
pred_df.head()

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [17]:
# Show the first rows of the true targets for comparison with the predictions.
Y_test.head()

Unnamed: 0,state_0,state_1
133716,0.0,1.0
133717,0.0,1.0
133718,1.0,0.0
133719,1.0,0.0
133720,0.0,1.0


### Inverse transforming the one hot encoding to states

In [0]:
# Undo the one-hot encoding of the target rows.
# NOTE(review): `state_ohe` is not defined in any cell visible in this notebook —
# presumably the fitted one-hot encoder from the preprocessing/training notebook.
# Confirm it is created or loaded before this cell; on a fresh kernel this raises NameError.
Y_test_state = state_ohe.inverse_transform(Y_test)

In [0]:
# Show the last five decoded target entries.
Y_test_state[-5:]

In [0]:
# Check which container type the inverse transform returned.
type(Y_test_state)

### Inverse transforming the label encoded values back to state names

In [0]:
# Cast the decoded values to int and map them back to the original state names.
# NOTE(review): `state_le` is not defined in any cell visible in this notebook —
# presumably the fitted label encoder from the preprocessing/training notebook; confirm
# it is available before this cell, otherwise a fresh kernel raises NameError.
Y_test_state2 = state_le.inverse_transform(Y_test_state.astype(int))

In [0]:
# Show the first five decoded state names.
Y_test_state2[:5]

### Finding the column id of the maximum value (i.e. the class to which the network assigned the highest 'chance')

In [0]:
# For every prediction row, take the label of the column holding the largest value.
pred_test = pred_df.idxmax(axis="columns")

### Transforming back the number to the label
This relies on the label-encoded value of each state being identical to its column position in the one-hot encoding, so the column id returned by `idxmax` can be passed directly to the label encoder.

In [0]:
# Translate the predicted column ids back to state names.
# NOTE(review): `state_le` is not defined in any cell visible in this notebook — confirm
# the fitted label encoder is available before this cell runs.
pred_inverted = state_le.inverse_transform(pred_test)

In [0]:
# List the classes held by the label encoder.
state_le.classes_

In [0]:
# Preview only the first five predicted state names (the same slice shown for the true
# labels) rather than displaying the whole prediction array.
pred_inverted[:5]

In [0]:
# Shape of the decoded predictions; compare with the decoded targets' shape.
pred_inverted.shape

In [0]:
# Shape of the decoded true labels; compare with the decoded predictions' shape.
Y_test_state2.shape

### Getting a sample of the predictions
Currently the network isn't as accurate as we would like it to be; we hope to improve its accuracy in the future.

In [0]:
# Print a window of predictions next to the true labels. The header is built from the
# window bounds so that it always describes the rows the loop actually prints.
sample_start, sample_stop = 100, 130
print("Predictions for test rows {} to {}".format(sample_start, sample_stop - 1))
for i in range(sample_start, sample_stop):
  print("Predicted ", pred_inverted[i], " |||| was:", Y_test_state2[i])