In [2]:
import pandas as pd

In [33]:
# Load the Kickstarter projects dump; the two timestamp columns are parsed
# into datetimes at read time so the .dt accessor can be used on them later.
ks = pd.read_csv(
    "./data/ks-projects-201801.csv",
    parse_dates=["deadline", "launched"],
)

In [34]:
# Look at the first few rows to get an idea of the data format
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [35]:
# Check the columns and data types; info() also lists the non-null count per
# column, which shows where values are missing.
ks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null datetime64[ns]
goal                378661 non-null float64
launched            378661 non-null datetime64[ns]
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: datetime64[ns](2), float64(5), int64(2), object(6)
memory usage: 43.3+ MB


In [36]:
# Which distinct values does the "state" column take?
ks["state"].unique()


array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [37]:
# Count the number of occurrences of each "state" value (non-null IDs per state)
ks["ID"].groupby(ks["state"]).count()


state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

In [38]:
# Projects that are still live have no final result yet, so drop them
ks = ks[ks["state"] != "live"]


In [39]:
# Binary target column: 1 for successful projects, 0 for every other state
is_successful = ks["state"].eq("successful")
ks = ks.assign(outcome=is_successful.astype(int))

In [40]:
# Peek at the parsed launch timestamps (datetime64) that the date-part
# features are derived from.
ks["launched"].head()

0   2015-08-11 12:12:28
1   2017-09-02 04:43:57
2   2013-01-12 00:20:50
3   2012-03-17 03:24:11
4   2015-07-04 08:35:03
Name: launched, dtype: datetime64[ns]

In [41]:
# Break the launch timestamp into calendar/time components used as features
launch_dt = ks["launched"].dt
ks = ks.assign(
    year=launch_dt.year,
    month=launch_dt.month,
    day=launch_dt.day,
    hour=launch_dt.hour,
)


In [53]:
# Encode categorical columns as integer labels.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, so each
# column's mapping stays available afterwards (e.g. for inverse_transform or
# for encoding new data). Refitting a single shared encoder for every column
# would retain only the mapping of the last column it saw.
from sklearn.preprocessing import LabelEncoder

categorial_cols = ["category", "main_category", "currency", "country"]
label_encoders = {col: LabelEncoder().fit(ks[col]) for col in categorial_cols}

# Same index as ks so the encoded frame can be joined back onto it row by row
encoded = pd.DataFrame(
    {col: label_encoders[col].transform(ks[col]) for col in categorial_cols},
    index=ks.index,
)

# Kept for backward compatibility: previously the single shared encoder ended
# up fitted on the last column in categorial_cols.
label_encoder = label_encoders[categorial_cols[-1]]


In [54]:
# Inspect the integer codes produced for the categorical columns
encoded.head(5)



Unnamed: 0,category,main_category,currency,country
0,108,12,5,9
1,93,6,13,22
2,93,6,13,22
3,90,10,13,22
4,55,6,13,22


In [55]:
# Re-check ks after filtering out live projects and adding the outcome and
# launch date-part columns.
ks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375862 entries, 0 to 378660
Data columns (total 20 columns):
ID                  375862 non-null int64
name                375858 non-null object
category            375862 non-null object
main_category       375862 non-null object
currency            375862 non-null object
deadline            375862 non-null datetime64[ns]
goal                375862 non-null float64
launched            375862 non-null datetime64[ns]
pledged             375862 non-null float64
state               375862 non-null object
backers             375862 non-null int64
country             375862 non-null object
usd pledged         372066 non-null float64
usd_pledged_real    375862 non-null float64
usd_goal_real       375862 non-null float64
outcome             375862 non-null int64
year                375862 non-null int64
month               375862 non-null int64
day                 375862 non-null int64
hour                375862 non-null int64
dtypes: datetim

In [58]:
# ks and encoded share the same index, so an index join lines the rows up
selected_cols = ["outcome", "goal", "hour", "day", "month", "year"]
data = ks[selected_cols].join(encoded)

In [59]:
# Preview the combined table: target, numeric/date features and encoded categories
data.head(5)



Unnamed: 0,outcome,goal,hour,day,month,year,category,main_category,currency,country
0,0,1000.0,12,11,8,2015,108,12,5,9
1,0,30000.0,4,2,9,2017,93,6,13,22
2,0,45000.0,0,12,1,2013,93,6,13,22
3,0,5000.0,3,17,3,2012,90,10,13,22
4,0,19500.0,8,4,7,2015,55,6,13,22


In [73]:
# Split the data into train, validation and test sets.
# The shuffle is seeded so that Restart & Run All reproduces the same split
# (re-running only this cell reshuffles the already shuffled frame).
SEED = 42
val_frac = 0.1
test_frac = val_frac
train_frac = 1 - (val_frac + test_frac)

# 1. Shuffle the rows and discard the old index so positions run 0..n-1
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)


def sample_size(fraction):
    """Return the number of rows in `data` covered by `fraction`, rounded down."""
    return int(len(data) * fraction)


# 2. Compute both cut points once, then slice by position
val_end = sample_size(fraction=val_frac)
test_end = val_end + sample_size(fraction=test_frac)

valid = data.iloc[:val_end]
test = data.iloc[val_end:test_end]
train = data.iloc[test_end:]  # remainder, so rounding never drops a row

# Every row must land in exactly one split
assert len(valid) + len(test) + len(train) == len(data)


In [74]:
# Report how many (non-null outcome) rows ended up in each split
for label, frame in (
    ("Samples in train dataset:", train),
    ("Samples in test dataset:", test),
    ("Samples in validation dataset:", valid),
):
    print(label, frame["outcome"].count())

Samples in train dataset: 300690
Samples in test dataset: 37586
Samples in validation dataset: 37586


In [77]:
# Sanity check: the proportion of successful outcomes should be roughly the
# same in the train, test and validation sets.
splits = (train, test, valid)
for dataset in splits:
    print(f"Outcome fraction = {dataset.outcome.mean():.4f}")


Outcome fraction = 0.3565
Outcome fraction = 0.3532
Outcome fraction = 0.3584
