In [1]:
from src.data_cleaning import *

%reload_ext autoreload
%autoreload 2
# import_functs()

With `import_ks_data()`, we will pull all of the raw kickstarter data from within the `data` folder, and combine it into one file. To avoid having to repeat this process, it automatically pickles the dataframe as `raw_ks_data.p`.


In [2]:
# df = import_ks_data()

If you're not running this notebook for the first time, then run `load_raw_ks_data` instead.

In [3]:
df = load_raw_ks_data()

In [4]:
df.head(3)

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,spotlight,staff_pick,state,state_changed_at,static_usd_rate,unread_messages_count,unseen_activity_count,urls,usd_pledged,usd_type
0,1,This is a project I created to find out why 10...,"{""id"":360,""name"":""Video"",""slug"":""journalism/vi...",20,US,the United States,1494022111,"{""id"":220745515,""name"":""Stephanie Balfrey"",""sl...",USD,$,...,False,False,failed,1499212951,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",20.0,domestic
1,82,Seek & Behold is a full length album paired wi...,"{""id"":318,""name"":""Faith"",""slug"":""music/faith"",...",12580,US,the United States,1477503356,"{""id"":1889961770,""name"":""Debrianna Grace Cabit...",USD,$,...,True,False,successful,1482825631,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",12580.0,domestic
2,30,After a lifetime of talking myself out of shar...,"{""id"":318,""name"":""Faith"",""slug"":""music/faith"",...",2491,US,the United States,1426640212,"{""id"":1600855781,""name"":""Liz Roberson"",""is_reg...",USD,$,...,True,False,successful,1429238223,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",2491.0,domestic


Some immidiately noticable things include the category column, which conains dictionaries. With a closer look...

In [5]:
df.category[0]

'{"id":360,"name":"Video","slug":"journalism/video","position":4,"parent_id":13,"parent_name":"Journalism","color":1228010,"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/journalism/video"}}}'

We can see that it contains important information, such as the category name and the parent category, if the project is within a subcategory. However most of the information isn't useful, so using `expand_cateogry` we expand that column, and extract just the `name` and `parent_name` columns, renaming them to `category_name` and `category_parent_name`

In [6]:
df = expand_category(df)
df.head(3)

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,currency_trailing_code,...,state,state_changed_at,static_usd_rate,unread_messages_count,unseen_activity_count,urls,usd_pledged,usd_type,category_name,category_parent_name
0,1,This is a project I created to find out why 10...,20,US,the United States,1494022111,"{""id"":220745515,""name"":""Stephanie Balfrey"",""sl...",USD,$,True,...,failed,1499212951,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",20.0,domestic,Video,Journalism
1,82,Seek & Behold is a full length album paired wi...,12580,US,the United States,1477503356,"{""id"":1889961770,""name"":""Debrianna Grace Cabit...",USD,$,True,...,successful,1482825631,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",12580.0,domestic,Faith,Music
2,30,After a lifetime of talking myself out of shar...,2491,US,the United States,1426640212,"{""id"":1600855781,""name"":""Liz Roberson"",""is_reg...",USD,$,True,...,successful,1429238223,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",2491.0,domestic,Faith,Music


In [7]:
df.category_parent_name.fillna(value = df.category_name, inplace = True)

In [8]:
df[['category_name','category_parent_name']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195334 entries, 0 to 195333
Data columns (total 2 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   category_name         195334 non-null  object
 1   category_parent_name  195334 non-null  object
dtypes: object(2)
memory usage: 3.0+ MB


In [9]:
df

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,currency_trailing_code,...,state,state_changed_at,static_usd_rate,unread_messages_count,unseen_activity_count,urls,usd_pledged,usd_type,category_name,category_parent_name
0,1,This is a project I created to find out why 10...,20,US,the United States,1494022111,"{""id"":220745515,""name"":""Stephanie Balfrey"",""sl...",USD,$,True,...,failed,1499212951,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",20.000000,domestic,Video,Journalism
1,82,Seek & Behold is a full length album paired wi...,12580,US,the United States,1477503356,"{""id"":1889961770,""name"":""Debrianna Grace Cabit...",USD,$,True,...,successful,1482825631,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",12580.000000,domestic,Faith,Music
2,30,After a lifetime of talking myself out of shar...,2491,US,the United States,1426640212,"{""id"":1600855781,""name"":""Liz Roberson"",""is_reg...",USD,$,True,...,successful,1429238223,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",2491.000000,domestic,Faith,Music
3,109,The St.Claire is a critical and creative force...,8300,US,the United States,1368571088,"{""id"":884672734,""name"":""THE ST.CLAIRE"",""is_reg...",USD,$,True,...,successful,1372813821,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",8300.000000,domestic,Journalism,Journalism
4,68,Trasformiamo un Bar in un Gamers' Café!!\nBirr...,7873,IT,Italy,1479044849,"{""id"":1385445347,""name"":""Emanuele Iannone"",""is...",EUR,€,False,...,successful,1483829940,1.058929,,,"{""web"":{""project"":""https://www.kickstarter.com...",7914.438485,domestic,Restaurants,Food
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195329,15,The Transporter is a floating island of light ...,2202,US,the United States,1394258709,"{""id"":1850262460,""name"":""Abram Santa Cruz"",""sl...",USD,$,True,...,successful,1395723901,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",2202.000000,domestic,Public Art,Art
195330,287,The culmination of years of research from diff...,11289,US,the United States,1431807042,"{""id"":448215363,""name"":""ZED Presents..."",""slug...",USD,$,True,...,successful,1435733944,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",11289.000000,domestic,Art Books,Publishing
195331,20,New album from Simon Scardanelli - narrative s...,983,GB,the United Kingdom,1448364875,"{""id"":212545136,""name"":""Simon Scardanelli"",""sl...",GBP,£,False,...,successful,1450976401,1.513441,,,"{""web"":{""project"":""https://www.kickstarter.com...",1000.384415,domestic,Country & Folk,Music
195332,0,Lovely & Derby needs to raise funds to get som...,0,US,the United States,1286032151,"{""id"":499367211,""name"":""Janice Priest"",""is_reg...",USD,$,True,...,failed,1326316165,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",0.000000,domestic,Performance Art,Art


Next, we're going to begin correcting data types.

All dates are in unix format, so the following four columns will be changed to datetime64[s]
    - `created_at`, `deadline`, `launched_at`, `state_changed`

On top of that, we'll be correcting the datatypes for a few more columns! 
- `country`, `currency`, `currency_symbol`, `category_name`, and `category_parent_name` will be changed to the pandas datatype `category`

- `state` will be updated to change `successful` into 1, and `failed` into 0

In [10]:
df = correct_dtypes(df)
df = drop_rows(df)
df = drop_cols(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inplace = True)


In [11]:
df['start_month'] = pd.DatetimeIndex(df['launched_at']).month
df['end_month'] = pd.DatetimeIndex(df['deadline']).month

In [12]:
blurb_df, df = split_and_pickle_df(df)

In [13]:
dict_list, df = create_dicts(df)

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [15]:
df_train, df_test = train_test_split(df, test_size = .2, random_state = seed)

In [16]:
col_list = ['category_parent_name', 'currency', 'country']

encoded_train_df = df_train
encoded_test_df = df_test
le = LabelEncoder()
LE_dict = {}

for i in col_list:
    le.fit(encoded_train_df[i])
    encoded_train_df[i] = le.transform(encoded_train_df[i])
    encoded_test_df[i] = le.transform(encoded_test_df[i])
    LE_dict[i] = le.classes_

LE_dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


{'category_parent_name': array(['Art', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion',
        'Film & Video', 'Food', 'Games', 'Journalism', 'Music',
        'Photography', 'Publishing', 'Technology', 'Theater'], dtype=object),
 'currency': array(['AUD', 'CAD', 'CHF', 'DKK', 'EUR', 'GBP', 'HKD', 'JPY', 'MXN',
        'NOK', 'NZD', 'SEK', 'SGD', 'USD'], dtype=object),
 'country': array(['AT', 'AU', 'BE', 'CA', 'CH', 'DE', 'DK', 'ES', 'FR', 'GB', 'HK',
        'IE', 'IT', 'JP', 'LU', 'MX', 'NL', 'NO', 'NZ', 'SE', 'SG', 'US'],
       dtype=object)}

In [17]:
encoded_train_df.to_pickle('data/KS_train_data.pkl')
encoded_test_df.to_pickle('data/KS_test_data.pkl')

In [18]:
dates = ['deadline', 'launched_at']

def expand_date(df, col):
    times = ['year', 'month', 'day']
    
    col_name = f"{col}_year"
    df[col_name] = df[col].year
    
    col_name = f"{col}_month"
    df[col_name] = df[col].month
    
    col_name = f"{col}_day"
    df[col_name] = df[col].day
    
    return df.drop(columns = col)

In [19]:
expand_date(df, 'deadline')

AttributeError: 'Series' object has no attribute 'year'