In [51]:
import pandas as pd
import seaborn as sns
import numpy as np

In [78]:
# Load the raw Kickstarter export.
# A forward slash is used as the separator on purpose: it is accepted on both
# Windows and POSIX, whereas a backslash is not a path separator on POSIX and
# '\K' inside a normal string literal is an invalid escape sequence.
df = pd.read_csv('data/Kickstarter_projects_Feb19.csv')

# remove duplicates (keep the first occurrence) and renumber the index 0..n-1
df = df.drop_duplicates(keep='first').reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170730 entries, 0 to 170729
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             170730 non-null  int64  
 1   name           170730 non-null  object 
 2   currency       170730 non-null  object 
 3   main_category  170730 non-null  object 
 4   sub_category   170730 non-null  object 
 5   launched_at    170730 non-null  object 
 6   deadline       170730 non-null  object 
 7   duration       170730 non-null  float64
 8   goal_usd       170730 non-null  float64
 9   city           170730 non-null  object 
 10  state          170730 non-null  object 
 11  country        170730 non-null  object 
 12  blurb_length   170730 non-null  int64  
 13  name_length    170730 non-null  int64  
 14  status         170730 non-null  object 
 15  start_month    170730 non-null  int64  
 16  end_month      170730 non-null  int64  
 17  start_Q        170730 non-nul

In [69]:
# Preview the first rows of df (head() shows five rows by default)
df.head()

Unnamed: 0,id,name,currency,main_category,sub_category,launched_at,deadline,duration,goal_usd,city,state,country,blurb_length,name_length,status,start_month,end_month,start_Q,end_Q,usd_pledged
0,1687733153,Socks of Speed and Socks of Elvenkind,USD,games,Tabletop Games,2018-10-30 20:00:02,2018-11-15 17:59:00,16.0,2000.0,Menasha,WI,US,14,7,successful,10,11,Q4,Q4,6061.0
1,227936657,Power Punch Boot Camp: An All-Ages Graphic Novel,GBP,comics,Comic Books,2018-08-06 10:00:43,2018-09-05 10:00:43,30.0,3870.99771,Shepperton,England,GB,24,8,successful,8,9,Q3,Q3,3914.50512
2,454186436,"Live Printing with SX8: ""Squeegee Pulp Up""",USD,fashion,Apparel,2017-06-09 15:41:03,2017-07-09 15:41:03,30.0,1100.0,Manhattan,NY,US,21,7,successful,6,7,Q2,Q3,1110.0
3,629469071,Lost Dog Street Band's Next Album,USD,music,Country & Folk,2014-09-25 18:46:01,2014-11-10 06:00:00,45.0,3500.0,Nashville,TN,US,15,6,successful,9,11,Q3,Q4,4807.0
4,183973060,"Qto-X, a Tiny Lantern",USD,technology,Gadgets,2016-11-28 16:35:11,2017-01-27 16:35:11,60.0,30000.0,Troy,MI,US,15,4,successful,11,1,Q4,Q1,40368.0


## Model 1: Classification model 
- predict whether the project will be successfully funded
- Dependent variable: status
- Independent variables: main_category, duration, goal_usd, country, start_month

## Model 2: Regression model 
- predict how much funding the project will receive
- Dependent variable: usd_pledged
- Independent variables: main_category, duration, goal_usd, country, start_month


In [79]:
## Drop columns
# Everything listed here is discarded; the remaining columns are the features
# and targets named in the model descriptions.
unused_columns = [
    'id', 'name', 'currency', 'sub_category', 'launched_at', 'deadline',
    'city', 'state', 'blurb_length', 'name_length', 'end_month',
    'start_Q', 'end_Q',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,main_category,duration,goal_usd,country,status,start_month,usd_pledged
0,games,16.0,2000.0,US,successful,10,6061.0
1,comics,30.0,3870.99771,GB,successful,8,3914.50512
2,fashion,30.0,1100.0,US,successful,6,1110.0
3,music,45.0,3500.0,US,successful,9,4807.0
4,technology,60.0,30000.0,US,successful,11,40368.0


## Label-encode the target variable

In [80]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct status labels first, then overwrite the column with
# their integer codes.
lbl_encode = LabelEncoder().fit(df['status'])
df['status'] = lbl_encode.transform(df['status'])

# Print out the label -> integer code mapping learned by the encoder
keys = lbl_encode.classes_
values = lbl_encode.transform(keys)
dictionary = dict(zip(keys, values))
print(dictionary)

{'failed': 0, 'successful': 1}


## Encode the independent categorical variables
 - Location label: About two-thirds of the projects are based in the US; the rest are scattered across many different countries,
   so encoding every country separately would give a highly imbalanced feature. We therefore collapse `country` into a binary column `US_based` (1 = US, 0 = non-US).
 - main_category & start_month: one-hot encoded


In [81]:
# Country: collapse into a binary flag (1 when country == 'US', otherwise 0)
# and discard the raw country column.
df = (
    df
    .assign(US_based=(df['country'] == 'US').astype('int64'))
    .drop(columns=['country'])
)

In [82]:
# main_category, start_month
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits the first category of each feature from the output.
enc = OneHotEncoder(drop='first')
categorical_features = df[['main_category', 'start_month']]
enc.fit(categorical_features)

# The encoder returns a sparse matrix; convert it to a dense ndarray.
cat_col = enc.transform(categorical_features).toarray()


In [83]:
def _encoded_column_names(encoder):
    """Return one column name per one-hot output column of a fitted encoder.

    Each name is the bare category value as a string (e.g. 'comics', '10'),
    with no feature prefix. Names are built from ``encoder.categories_``
    instead of ``get_feature_names()``, which was removed in scikit-learn 1.2.

    Parameters
    ----------
    encoder : fitted OneHotEncoder
        Must expose ``categories_``; ``drop_idx_`` is honoured when present.

    Returns
    -------
    list of str
        Names in the same order as the encoder's output columns.
    """
    dropped_per_feature = getattr(encoder, 'drop_idx_', None)
    names = []
    for feature_idx, categories in enumerate(encoder.categories_):
        # Index of the category omitted for this feature (None = nothing dropped).
        dropped = None if dropped_per_feature is None else dropped_per_feature[feature_idx]
        names.extend(
            str(category)
            for position, category in enumerate(categories)
            if position != dropped
        )
    return names


column_name = _encoded_column_names(enc)

df2 = pd.DataFrame(cat_col, columns=column_name)
df2.shape

(170730, 25)

In [84]:
# Put the one-hot columns (df2) to the right of the existing columns of df;
# rows are matched on the index.
final_df = pd.concat(objs=[df, df2], axis='columns')
final_df

Unnamed: 0,main_category,duration,goal_usd,status,start_month,usd_pledged,US_based,comics,crafts,dance,...,3,4,5,6,7,8,9,10,11,12
0,games,16.0,2000.00000,1,10,6061.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,comics,30.0,3870.99771,1,8,3914.505120,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,fashion,30.0,1100.00000,1,6,1110.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,music,45.0,3500.00000,1,9,4807.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,technology,60.0,30000.00000,1,11,40368.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170725,food,60.0,57858.66500,0,10,1.224260,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
170726,technology,47.0,115717.33000,0,11,157.273939,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
170727,food,30.0,30000.00000,0,2,0.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170728,art,60.0,1200.00000,0,8,143.000000,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [86]:
# The raw categorical columns are no longer needed once their one-hot columns exist.
final_df = final_df.drop(columns=['main_category', 'start_month'])

In [87]:
# Write final_df to processed_data.csv (relative path) without the index column
final_df.to_csv('processed_data.csv', index = False)