In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw Kickstarter export (path is relative to the notebook's working directory).
df = pd.read_csv('data/kickstarter_projects.csv')

# Convert column names to lowercase so the rest of the notebook uses one spelling.
df.columns = df.columns.str.lower()

# Drop the variables that are not needed: 'id' and 'name'.
df = df.drop(columns=['id', 'name'])

# Row filters, applied together:
#   * 'Live' projects have no final state yet, so they are not usable for predictions.
#   * entries with a goal of 0 are removed as invalid.
# `.copy()` yields an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
keep_rows = (df['state'] != 'Live') & (df['goal'] != 0)
df = df.loc[keep_rows].copy()

# Convert 'launched' and 'deadline' to datetime.
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])

# Campaign duration in whole days (`.dt.days` drops the sub-day remainder).
df['duration'] = (df['deadline'] - df['launched']).dt.days

# Calendar month (1-12) in which the campaign was launched.
df['launch_month'] = df['launched'].dt.month

# Binary target: 1 for 'Successful', 0 for every other final state.
cleanup_nums = {"state": {"Failed": 0, "Canceled": 0, "Suspended": 0, "Successful": 1}}

# Keep the original textual label next to the binary target.
df["org_state"] = df["state"]

# Fail loudly if the data contains a label the mapping does not cover; otherwise
# the target column would silently end up with missing/mixed values.
unmapped_states = set(df["state"].dropna().unique()) - set(cleanup_nums["state"])
if unmapped_states:
    raise ValueError(
        f"Unmapped values in 'state': {sorted(map(str, unmapped_states))}"
    )

# Explicit `.map` instead of `df.replace(..., inplace=True)`: it does not depend on
# the implicit object -> int downcasting of `replace` (deprecated in recent pandas)
# and yields a numeric column directly.
df["state"] = df["state"].map(cleanup_nums["state"])
df.head()

In [None]:
# Print a concise summary of df (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode each categorical column as integer codes in a new '<column>_encoded' column.
# A separate encoder is fitted per column and stored in `encoders`, so the
# category <-> code mapping of every column stays available afterwards
# (e.g. encoders['country'].categories_ or encoders['country'].inverse_transform).
# Refitting one shared encoder would keep only the mapping of the last column.
encoders = {}
for column in ('country', 'category', 'subcategory'):
    ord_make = OrdinalEncoder()
    df[f'{column}_encoded'] = ord_make.fit_transform(df[[column]]).astype('int')
    encoders[column] = ord_make
# `ord_make` remains bound to the last encoder, i.e. the one fitted on 'subcategory'.
df.info()

In [None]:
# Derive log1p-transformed copies of the monetary columns.
# Keys are the new column names, values the source columns they are computed from.
log_sources = {'goal_log': 'goal', 'pledged_log': 'pledged'}
for log_col, raw_col in log_sources.items():
    df[log_col] = np.log1p(df[raw_col])

In [None]:
# Persist the prepared frame as CSV; the row index is not written.
output_path = 'data/binary_state_kickstarter_projects.csv'
df.to_csv(output_path, index=False)