In [None]:
#initial imports 
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import seaborn as sns
import matplotlib.pyplot as plt

import logging

# Notebook-wide logger; INFO level so progress messages are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Working directory of the running kernel.
# In Jupyter there is no __file__, so Path.cwd() is used; in a .py script the
# equivalent would be Path(__file__).parent.
curr_dir = Path.cwd()

# The project root is TWO levels above the working directory (this matches the
# `Path.cwd().resolve().parents[1]` used for the data paths further down).
# Adjust the number of '.parent' hops if the notebook is moved.
project_root = curr_dir.parent.parent

# Add project root to system path so Python can find 'utils'.
# Guarded so that re-running this cell does not append duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project Root added to path: {project_root}")

from utils.feature_engineer_df import build_features 

#for the scaling and encoding 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#cleanup 

# Show every column when a DataFrame is rendered (no "..." truncation).
pd.options.display.max_columns = None

In [None]:
# Resolve the project root once; every data path below hangs off it.
BASE_DIR = Path.cwd().resolve().parents[1]
DATA_ROOT = BASE_DIR / "data"
data_file = DATA_ROOT / "feature" / "kickstarter_featured.csv"

# Run the project's feature-engineering step.
# Presumably this reads the cleaned/raw CSVs and writes `output_path`, which is
# loaded right below — confirm against utils.feature_engineer_df.
build_features(
    input_path=DATA_ROOT / "cleaned" / "kickstarter_cleaned.csv",
    output_path=data_file,
    raw_path=DATA_ROOT / "raw" / "ks-projects-201801.csv",
    logger=logger,
)

# Load Files as DataFrames
# NOTE(review): the file is read as latin-1 — confirm this matches the encoding
# build_features writes with, otherwise non-ASCII text will be garbled.
filepath = Path(data_file)
df = pd.read_csv(filepath, encoding="latin-1", low_memory=False)

In [None]:
# Preview the first rows of the loaded feature table.
df.head()

In [None]:
# List the available columns before deciding which ones to drop.
df.columns

### Creating our feature engineering dataset 

In [None]:
# Columns that are "hard dropped" from the feature-engineering dataframe,
# grouped by the reason for removing them.
columns_to_drop = [
    # irrelevant
    "id",
    # substituted in a satisfactory way
    "main_category",
    # new categories were created from these
    "deadline",
    "launched",
    # everything to do with "future information"
    "backers",
    "usd_pledged_real",
    "usd_pledged_bins",
    "backers_per_pledged",
    "backer_pledged_bins",
    "pledged_per_category",
    # info about the past and not seasonal
    "launched_year",
    "deadline_year",
]

In [None]:
# Reminder (not executed): columns that still need to be soft-dropped later.
# The active `columns_to_softdrop` list is defined in a later cell, after the
# dummy encoding.
# columns_to_softdrop = ['country', #to play around with
#                        'duration_days', #because the bins are not ideal
#                        'launched_month', 'deadline_month', #because we have season but might want to look closer
#                        ]

In [None]:
# Apply the hard drop; `df` itself is left untouched and `dfc` is the reduced copy.
dfc = df.drop(columns_to_drop, axis="columns")
dfc.columns

#### Prepare categorical features for machine learning algorithms

We want to create dummy variables for all nominal categoricals, that is:
* launch_season, deadline_season
* main_category_grouped
* continent

Open question: scale or create dummies?
* category_goal_percentile (as it's ordinal)
* duration_bins (ordinal as well)

Scale:
* usd_goal_real
* duration_days
* goal_per_category

Get dummies:

**Open question:** how do we later know the legend, i.e. which dummy column corresponds to which original category?

In [None]:
# Re-check the columns that remain after the hard drop, before encoding.
dfc.columns

In [None]:
# One-hot encode the nominal categoricals. For each one the first level is
# dropped, since keeping all levels would make the dummies multicollinear.
# The source columns are read from `df`; the hard drop did not remove them.
# NOTE(review): every prefix already ends in "_" and get_dummies adds its own
# "_" separator by default, so the generated names contain a double underscore
# ("sl__<level>") — confirm that is intended before relying on the names.
dummy_sources = {
    'launch_season': 'sl_',          # season launched
    'deadline_season': 'sd_',        # season deadline
    'main_category_grouped': 'cat_', # main category (grouped)
    'continent': 'co_',              # continent
}

df_sl, df_sd, df_cat, df_co = (
    pd.get_dummies(df[column], prefix=prefix, drop_first=True, dtype=int)
    for column, prefix in dummy_sources.items()
)

# Put everything back together again: reduced frame plus the four dummy blocks.
dff = pd.concat([dfc, df_sl, df_sd, df_cat, df_co], axis=1)

# Check that it worked (head) and that the concat introduced no NaNs (null counts).
display(dff.head(), dff.isna().sum())

Tried to use the ordinal encoder, but the results don't look right (possibly because `OrdinalEncoder` orders the categories alphabetically unless an explicit `categories=` order is passed). Dropped it for now; the attempt is kept below for reference.

```python
#use ordinal encoder for the ordinal categories 
encoder = OrdinalEncoder()
#make categories numerical 
#ordinal_encod_map = {'Very Low': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
#dff['category_goal_percentile'] = dff['category_goal_percentile'].map(ordinal_encod_map)
df['duration_bins_coded'] = encoder.fit_transform(df[['duration_bins']])
df[['duration_bins', 'duration_bins_coded']].head(50)
```

In [None]:
# Inspect the columns after the dummy columns have been appended.
dff.columns

In [None]:
# Soft drop: remove everything that has just been encoded or that is redundant
# for the current experiment. The dummy columns themselves are kept.
#
# Deliberately NOT in the list (the keepers):
#   usd_goal_real       - right now the actual values are scaled instead
#   duration_days       - scaled; the bins are dropped instead for now
#   target              - obviously
#   goal_per_category   - a polynomial feature; not independent, but probably ok
#   duration_bins_coded - the whole ordinal-encoding code was dropped
columns_to_softdrop = [
    # we kept it for comparison
    'country',
    # after creating dummies, get rid of these!
    'main_category_grouped',
    'continent',
    # because we have season but might want to look closer
    'launched_month',
    'deadline_month',
    # using category_goal_percentile (those two are redundant)
    'usd_goal_bins',
    # it's an ordinal bin, so keeping 'goal_per_category' instead
    'category_goal_percentile',
    # want to use actual values instead, using duration_days
    'duration_bins',
    # dummies were created for these
    'launch_season',
    'deadline_season',
]



In [None]:
# Remove the soft-dropped columns; what remains is the table that goes into
# the split and scaling steps.
df_to_scale = dff.drop(columns_to_softdrop, axis="columns")
display(df_to_scale.columns, df_to_scale.head())

### Scaling
Let's try at least two ways of doing this

In [None]:
# Separate the label column from the feature matrix.
y = df_to_scale['target']
X = df_to_scale.drop('target', axis="columns")



In [None]:
# Stratified 70/30 train/test split with a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the shape of the full table and of every split.
for label, part in (
    ("Df before", df_to_scale),
    ("X_train shape", X_train),
    ("X_test shape", X_test),
    ("y_train shape", y_train),
    ("y_test shape", y_test),
):
    print(label, part.shape)

In [None]:
# Standardise the continuous features with StandardScaler.
col_scale = ['usd_goal_real',
             'duration_days',
             'goal_per_category',
             ]

# instantiate
scaler = StandardScaler()

# Fit the scaler on the TRAINING split only, then apply those same statistics
# to the test split. Calling fit_transform on the test split would re-fit the
# scaler on test data: that leaks test-set information and puts the two splits
# on different scales.
X_train_scaled = scaler.fit_transform(X_train[col_scale])
X_test_scaled = scaler.transform(X_test[col_scale])

# The scaler returns plain arrays; wrap them in DataFrames again, re-using the
# original row index so they can be concatenated back onto the other columns.
X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=col_scale,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,
    columns=col_scale,
    index=X_test.index
)

display(X_train_scaled)

In [None]:
# Drop the unscaled originals of the columns that were just standardised.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
X_train = X_train.drop(columns=col_scale, errors="ignore")
X_test = X_test.drop(columns=col_scale, errors="ignore")

# Check that everything is still in order. Both checks are asserted, because a
# bare expression is only displayed when it is the last line of a cell, and a
# mismatch here would make the following concat misalign rows.
assert X_train.index.equals(X_train_scaled.index), "X_train / X_train_scaled index mismatch"
assert X_test.index.equals(X_test_scaled.index), "X_test / X_test_scaled index mismatch"

In [None]:
# Put it back together again: scaled columns first, then the remaining
# (unscaled) feature columns, aligned on the row index.
X_train_sp = pd.concat([X_train_scaled, X_train], axis=1)
X_test_sp = pd.concat([X_test_scaled, X_test], axis=1)

# Sanity checks: the concat must neither add rows (index misalignment) nor
# duplicate columns (unscaled originals not dropped beforehand).
assert len(X_train_sp) == len(X_train_scaled), "row count changed while recombining X_train"
assert len(X_test_sp) == len(X_test_scaled), "row count changed while recombining X_test"
assert not X_train_sp.columns.duplicated().any(), "duplicate columns in X_train_sp"
assert not X_test_sp.columns.duplicated().any(), "duplicate columns in X_test_sp"

# And check. The labels name the frame whose shape is actually printed; the
# two "scaled" frames contain only the standardised columns.
print("Dff shape", dff.shape)
print("X_train_scaled shape", X_train_scaled.shape)
print("X_test_scaled shape", X_test_scaled.shape)
print("X_train shape after scaling", X_train_sp.shape)
print("X_test shape after scaling", X_test_sp.shape)
print("train split head:")
display(X_train_sp.head())
print("test split head:")
display(X_test_sp.head())