In [21]:
#initial imports 
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import seaborn as sns
import matplotlib.pyplot as plt

import logging

# Emit INFO-level log records; `logger` is handed to the pipeline call further down.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Working directory of the notebook kernel.
# (In a .py script, Path(__file__).parent would be the equivalent starting point.)
curr_dir = Path.cwd()

# The project root is TWO levels above the working directory
# (e.g. 'project/<dir>/<subdir>/' -> 'project/'); adjust the number of
# '.parent' hops if the notebook is moved.
project_root = curr_dir.parent.parent

# Add the project root to the module search path so Python can find 'utils'.
# Guarded so that re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project Root added to path: {project_root}")

from utils.feature_engineer_df import build_features 

#for the scaling and encoding 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
#cleanup 

Project Root added to path: /Users/surya/Documents/neue_fische/11_project/ds-ml-project_kickstarters


In [11]:
# Resolve the project root once and derive every data path from it.
BASE_DIR = Path.cwd().resolve().parents[1]
data_dir = BASE_DIR / "data"
data_file = data_dir / "feature" / "kickstarter_featured.csv"
filepath = Path(data_file)

# Run the feature engineering pipeline (imported from utils.feature_engineer_df).
# The featured CSV is written to `output_path`, which is the file loaded below.
build_features(
    input_path=data_dir / "cleaned" / "kickstarter_cleaned.csv",
    output_path=data_file,
    raw_path=data_dir / "raw" / "ks-projects-201801.csv",
    logger=logger,
)

# Load the featured file as a DataFrame
df = pd.read_csv(filepath, encoding='latin-1', low_memory=False)

INFO:__main__:Triggering cleaning pipeline using raw data: /Users/surya/Documents/neue_fische/11_project/ds-ml-project_kickstarters/data/raw/ks-projects-201801.csv
INFO:__main__:Starting Kickstarter data cleaning
INFO:__main__:Loaded 378,661 rows × 15 columns
INFO:__main__:Dropped columns: ['name', 'category', 'goal', 'pledged', 'currency', 'usd pledged']
INFO:__main__:Data cleaning completed
INFO:__main__:Saved: /Users/surya/Documents/neue_fische/11_project/ds-ml-project_kickstarters/data/cleaned/kickstarter_cleaned.csv
INFO:__main__:Saved: /Users/surya/Documents/neue_fische/11_project/ds-ml-project_kickstarters/data/cleaned/kickstarter_cleaned_with_cancelled.csv
INFO:__main__:Cleaning finished. Resuming feature engineering...
INFO:__main__:Starting feature engineering pipeline
INFO:__main__:Loaded 331675 rows
INFO:__main__:Final columns before save: ['id', 'main_category', 'deadline', 'launched', 'backers', 'country', 'usd_pledged_real', 'usd_goal_real', 'duration_days', 'target', 'm

In [12]:
# Preview the first rows of the freshly loaded feature file.
df.head()

Unnamed: 0,id,main_category,deadline,launched,backers,country,usd_pledged_real,usd_goal_real,duration_days,target,...,usd_goal_bins,usd_pledged_bins,pledged_per_category,goal_per_category,category_goal_percentile,duration_bins,backers_per_pledged,backer_pledged_bins,launch_season,deadline_season
0,1000003930,Film & Video,2017-11-01,2017-09-02 04:43:57,15,US,2421.0,30000.0,59,0,...,Very High,High,7676.247109,58616.915835,Very High,6 weeks,0.006196,Very Low,Fall,Fall
1,1000004038,Film & Video,2013-02-26,2013-01-12 00:20:50,3,US,220.0,45000.0,44,0,...,Very High,Low,7676.247109,58616.915835,Very High,4 weeks,0.013636,Low,Winter,Winter
2,1000007540,Music,2012-04-16,2012-03-17 03:24:11,1,US,1.0,5000.0,29,0,...,Medium,Very Low,4697.431965,11558.623284,Medium,2 weeks,1.0,Very High,Spring,Spring
3,1000014025,Food,2016-04-01,2016-02-26 13:38:27,224,US,52375.0,50000.0,34,1,...,Very High,Very High,6505.672844,30502.224195,Very High,4 weeks,0.004277,Very Low,Winter,Spring
4,1000023410,Food,2014-12-21,2014-12-01 18:30:44,16,US,1205.0,1000.0,19,1,...,Very Low,Medium,6505.672844,30502.224195,Very Low,2 weeks,0.013278,Low,Winter,Winter


In [13]:
# List every column available in the loaded feature file.
df.columns

Index(['id', 'main_category', 'deadline', 'launched', 'backers', 'country',
       'usd_pledged_real', 'usd_goal_real', 'duration_days', 'target',
       'main_category_grouped', 'continent', 'launched_year', 'launched_month',
       'deadline_year', 'deadline_month', 'usd_goal_bins', 'usd_pledged_bins',
       'pledged_per_category', 'goal_per_category', 'category_goal_percentile',
       'duration_bins', 'backers_per_pledged', 'backer_pledged_bins',
       'launch_season', 'deadline_season'],
      dtype='object')

### Creating our feature engineering dataset 

In [14]:
# Columns to "hard drop" from the feature engineering dataframe, grouped by reason.
columns_to_drop = [
    # irrelevant identifier
    'id',
    # substituted in a satisfactory way by a derived column
    'main_category',
    # raw dates; new categories were created from them
    'deadline',
    'launched',
    # everything to do with "future information"
    'backers',
    'usd_pledged_real',
    'usd_pledged_bins',
    'backers_per_pledged',
    'backer_pledged_bins',
    'pledged_per_category',
    # info about the past and not seasonal
    'launched_year',
    'deadline_year',
]

In [15]:
# Reminder: these columns stay in the frame for now and are to be soft-dropped later.
columns_to_softdrop = [
    # kept to play around with
    'country',
    # kept because the duration bins are not ideal
    'duration_days',
    # seasons exist already, but the months allow a closer look
    'launched_month',
    'deadline_month',
]

In [18]:
# Apply the hard drop. drop() returns a new frame, so `df` keeps all of its columns.
dfc = df.drop(labels=columns_to_drop, axis="columns")
dfc.head()

Unnamed: 0,country,usd_goal_real,duration_days,target,main_category_grouped,continent,launched_month,deadline_month,usd_goal_bins,goal_per_category,category_goal_percentile,duration_bins,launch_season,deadline_season
0,US,30000.0,59,0,Entertainment,North America,9,11,Very High,58616.915835,Very High,6 weeks,Fall,Fall
1,US,45000.0,44,0,Entertainment,North America,1,2,Very High,58616.915835,Very High,4 weeks,Winter,Winter
2,US,5000.0,29,0,Entertainment,North America,3,4,Medium,11558.623284,Medium,2 weeks,Spring,Spring
3,US,50000.0,34,1,Consumer,North America,2,4,Very High,30502.224195,Very High,4 weeks,Winter,Spring
4,US,1000.0,19,1,Consumer,North America,12,12,Very Low,30502.224195,Very Low,2 weeks,Winter,Winter


#### Prepare categorical features for machine learning algorithms

We want to get dummies for all nominal categoricals, that is 
* launch_season, deadline_season
* main_category_grouped
* continent

Open question: Scale or do dummies? 
* category_goal_percentile (as it's ordinal)
* duration bins (ordinal as well)

Scale:  
* usd_goal_real
* goal_per_category

Get dummies:

**Open question:** how do we later know the legend, i.e. which dummy column or ordinal code stands for which original category?

In [20]:
# Columns remaining after the hard drop.
dfc.columns

Index(['country', 'usd_goal_real', 'duration_days', 'target',
       'main_category_grouped', 'continent', 'launched_month',
       'deadline_month', 'usd_goal_bins', 'goal_per_category',
       'category_goal_percentile', 'duration_bins', 'launch_season',
       'deadline_season'],
      dtype='object')

In [7]:
# One-hot encode the nominal features of the hard-dropped frame `dfc`.
# drop_first=True removes the first level of each feature, because the full set of
# dummies would be multicollinear.
# NOTE: every prefix already ends in '_' and get_dummies adds its default separator
# '_' on top, so the generated columns look like 'sl__<level>'. The prefixes are left
# unchanged on purpose to keep the generated column names stable.

#season launched
df_sl = pd.get_dummies(dfc['launch_season'], prefix='sl_', drop_first=True, dtype=int)

#season deadline
df_sd = pd.get_dummies(dfc['deadline_season'], prefix='sd_', drop_first=True, dtype=int)

#main category_grouped
df_cat = pd.get_dummies(dfc['main_category_grouped'], prefix='cat_', drop_first=True, dtype=int)

#continent
df_co = pd.get_dummies(dfc['continent'], prefix='co_', drop_first=True, dtype=int)

# Put everything back together again. The base frame has to be `dfc`, not `df`:
# concatenating onto `df` would bring back every column listed in columns_to_drop,
# including the "future information" ones.
dff = pd.concat([dfc, df_sl, df_sd, df_cat, df_co], axis=1)

# Guard: none of the hard-dropped columns may reappear in the feature frame.
leaked_columns = set(columns_to_drop) & set(dff.columns)
assert not leaked_columns, f"hard-dropped columns found in dff: {sorted(leaked_columns)}"

In [23]:
# Ordinal-encode the duration bins into a numeric column.
# NOTE(review): no explicit `categories=` is passed, so OrdinalEncoder derives the order
# by sorting the labels (alphabetically for strings). That matches the natural order for
# '2 weeks' < '4 weeks' < '6 weeks', but a label such as '10 weeks' would sort before
# '2 weeks' - confirm against the full set of bins.
# NOTE(review): the coded column is written to `df`, so it is not part of the `dff`
# frame that holds the dummies - confirm which frame should carry it.
# Alternative considered for category_goal_percentile (not applied):
#   ordinal_encod_map = {'Very Low': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
#   dff['category_goal_percentile'] = dff['category_goal_percentile'].map(ordinal_encod_map)
encoder = OrdinalEncoder()
duration_codes = encoder.fit_transform(df[['duration_bins']])
df['duration_bins_coded'] = duration_codes

# Show labels next to their codes to eyeball the mapping.
df.loc[:, ['duration_bins', 'duration_bins_coded']].head(50)

Unnamed: 0,duration_bins,duration_bins_coded
0,6 weeks,2.0
1,4 weeks,1.0
2,2 weeks,0.0
3,4 weeks,1.0
4,2 weeks,0.0
5,4 weeks,1.0
6,2 weeks,0.0
7,2 weeks,0.0
8,2 weeks,0.0
9,2 weeks,0.0


In [8]:
# Reminder: these columns are to be soft-dropped later.
# This list supersedes the shorter columns_to_softdrop defined earlier in the notebook.
columns_to_softdrop = [
    # kept to play around with
    'country',
    # kept because the duration bins are not ideal
    'duration_days',
    # seasons exist already, but the months allow a closer look
    'launched_month',
    'deadline_month',
    # originals of the dummy-encoded features; get rid of these after creating dummies
    'launch_season',
    'deadline_season',
    'main_category_grouped',
    'continent',
]

### Scaling
Let's try at least two ways of doing this