## template for first exploration 
Scaling version 1 from 02_feature_engineering
used dummies for pure categoricals, kept numericals and scaled them using StandardScaler.

### CHANGE ME 
Let's plug in the final feature engineering in the following section 
(left in for having something to work with)

In [1]:
#initial imports 
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import seaborn as sns
import matplotlib.pyplot as plt

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Get the absolute path of the current file/notebook
# If using Jupyter, use Path.cwd(). If using a .py script, use Path(__file__).parent
curr_dir = Path.cwd()

# Calculate the project root (adjust '.parent' count as needed)
# If your notebook is in 'project/notebooks/', the root is 1 level up
project_root = curr_dir.parent.parent 

# Add project root to system path so Python can find 'utils'
sys.path.append(str(project_root))

print(f"Project Root added to path: {project_root}")

from utils.feature_engineer_df import build_features 

#for the scaling and encoding 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#cleanup 

pd.set_option('display.max_columns', None)

Project Root added to path: d:\Programming\ai_ds_bootcamp\ds-ml-project_kickstarters


In [2]:
#get your data from our utils
build_features(
    input_path=Path.cwd().resolve().parents[1] / "data" / "cleaned" / "kickstarter_cleaned.csv",
    output_path=Path.cwd().resolve().parents[1] / "data" / "feature" / "kickstarter_featured.csv",
    raw_path=Path.cwd().resolve().parents[1] / "data" / "raw" / "ks-projects-201801.csv",
    logger=logger
)

# Load Files as DataFrames
BASE_DIR = Path.cwd().resolve().parents[1]
data_file = BASE_DIR / "data" / "feature" / "kickstarter_featured.csv"

filepath = Path(data_file)

df = pd.read_csv(filepath, encoding='latin-1', low_memory=False)

INFO:__main__:Starting feature engineering pipeline
INFO:__main__:Loaded 331675 rows
INFO:__main__:Computing trending category feature...
INFO:__main__:Trending category feature computed: 64,724 projects (22.09%) belong to trending categories
INFO:__main__:Final columns before save: ['id', 'main_category', 'deadline', 'launched', 'backers', 'country', 'usd_pledged_real', 'usd_goal_real', 'duration_days', 'target', 'main_category_grouped', 'continent', 'launched_year', 'launched_month', 'deadline_year', 'deadline_month', 'launched_weekday', 'log_usd_goal', 'goal_per_day', 'usd_goal_bins', 'usd_pledged_bins', 'pledged_per_category', 'goal_per_category', 'category_goal_percentile', 'duration_bins', 'backers_per_pledged', 'backer_pledged_bins', 'launch_season', 'deadline_season', 'is_trending_category']
INFO:__main__:Saved engineered dataset to D:\Programming\ai_ds_bootcamp\ds-ml-project_kickstarters\data\feature\kickstarter_featured.csv


In [3]:
#list of columns to "hard drop" from feature engineering dataframe
columns_to_drop = ['id', #irrelevant
                   'main_category', #substituted in a satisfactory way
                   'deadline', 'launched', #created new categories 
                   'backers', 'usd_pledged_real', 'usd_pledged_bins', 'backers_per_pledged', 'backer_pledged_bins', 'pledged_per_category', #everything to do with "future information"
                   'launched_year', 'deadline_year', #info about the past and not seasonal
                   ]
# drop them
dfc = df.drop(columns=columns_to_drop)

Get dummies for pure categoricals:

In [4]:
#season launched, dropping first as it's multicollinear
df_sl = pd.get_dummies(df['launch_season'], prefix = 'sl_', drop_first=True, dtype=int)

#season deadline, dropping first
df_sd = pd.get_dummies(df['deadline_season'], prefix = 'sd_', drop_first=True, dtype=int)

#main category_grouped, dropping first
df_cat = pd.get_dummies(df['main_category_grouped'], prefix = 'cat_', drop_first=True, dtype=int)

#continent, dropping first
df_co = pd.get_dummies(df['continent'], prefix = 'co_', drop_first=True, dtype=int)

#put everything back together again: 
dff = pd.concat([dfc, df_sl, df_sd, df_cat, df_co], axis=1)


In [5]:
#let's soft-drop everything we just encoded
#commented out all the keepers 
columns_to_softdrop = ['country', #we kept it for comparison
                       # 'usd_goal_real', #right now I want to try scaling actual values 
                       #'duration_days', #I want to scale these and drop the bins instead for now 
                       # 'target', (obviously)
                       'main_category_grouped', 'continent', #after creating dummies, get rid of these!
                        'launched_month', 'deadline_month', #because we have season but might want to look closer
                        'usd_goal_bins', #using category_goal_percentile (those two are redundant)
                        #'goal_per_category', #it's a polynomial feature - not independent but that's probably ok
                       'category_goal_percentile', #it's an orinal bin so keeping 'goal per category' instead
                       'duration_bins', #want to use actual values instead, using duration_days
                       'launch_season', 'deadline_season', #gotten dummies 
                       #'duration_bins_coded', #dropped the whole encoding code 
                       ]
# keeping the already dummied ones obviously 

In [6]:
df_to_scale = dff.drop(columns=columns_to_softdrop, axis=1)
display(df_to_scale.columns)

Index(['usd_goal_real', 'duration_days', 'target', 'launched_weekday',
       'log_usd_goal', 'goal_per_day', 'goal_per_category',
       'is_trending_category', 'sl__Spring', 'sl__Summer', 'sl__Winter',
       'sd__Spring', 'sd__Summer', 'sd__Winter', 'cat__Creative',
       'cat__Entertainment', 'cat__Other', 'cat__Tech', 'co__Europe',
       'co__North America', 'co__Oceania'],
      dtype='object')

Scale the remaining numerical columns 

In [7]:
#first, create our dfs 
X = df_to_scale.drop(columns=['target'])
y = df_to_scale['target']
#get train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)
print("Df before", df_to_scale.shape)
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)

Df before (293019, 21)
X_train shape (205113, 20)
X_test shape (87906, 20)
y_train shape (205113,)
y_test shape (87906,)


In [8]:
#get the columns to standardise 
col_scale = ['usd_goal_real',
             'duration_days',
             'goal_per_category',
             ]

#instantiate
scaler = StandardScaler()
#scale 
X_train_scaled = scaler.fit_transform(X_train[col_scale])
X_test_scaled = scaler.fit_transform(X_test[col_scale])
#make it a df again
X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=col_scale,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,
    columns=col_scale,
    index=X_test.index
)

In [9]:
#drop the original axes again
X_train = X_train.drop(col_scale, axis=1)
X_test = X_test.drop(col_scale, axis=1)
#and check if everything's still in order 
X_train.index.equals(X_train_scaled.index)
X_test.index.equals(X_test_scaled.index)

True

In [10]:
# put it back together again 
X_train_sp = pd.concat([X_train_scaled, X_train ], axis=1)
X_test_sp = pd.concat([X_test_scaled, X_test], axis=1)
#and check
print("Dff shape", dff.shape)
print("X_train shape", X_train_scaled.shape)
print("X_test shape", X_test_scaled.shape)
print("X_train shape after scaling", X_train_sp.shape)
print("X_test shape after scaling", X_test_sp.shape)
print("train split head:")
display(X_train_sp.head())
print("test split head:")
display(X_test_sp.head())

Dff shape (293019, 31)
X_train shape (205113, 3)
X_test shape (87906, 3)
X_train shape after scaling (205113, 20)
X_test shape after scaling (87906, 20)
train split head:


Unnamed: 0,usd_goal_real,duration_days,goal_per_category,launched_weekday,log_usd_goal,goal_per_day,is_trending_category,sl__Spring,sl__Summer,sl__Winter,sd__Spring,sd__Summer,sd__Winter,cat__Creative,cat__Entertainment,cat__Other,cat__Tech,co__Europe,co__North America,co__Oceania
57281,-0.006214,-0.310206,-0.844947,0,10.221978,0.340733,False,0,0,0,0,0,0,1,0,0,0,0,1,0
264757,0.020483,0.885107,-0.19276,0,10.819798,0.24044,False,0,1,0,0,0,0,0,0,0,0,0,1,0
28338,-0.032193,-0.947706,1.452947,2,8.631593,0.392345,False,0,1,0,0,1,0,0,0,1,0,0,1,0
150240,-0.037251,1.363232,-0.844947,3,7.202423,0.141224,False,0,0,1,0,0,1,1,0,0,0,0,1,0
291618,-0.028165,-0.310206,0.983668,5,9.105091,0.303503,False,0,0,0,0,0,0,0,1,0,0,0,1,0


test split head:


Unnamed: 0,usd_goal_real,duration_days,goal_per_category,launched_weekday,log_usd_goal,goal_per_day,is_trending_category,sl__Spring,sl__Summer,sl__Winter,sd__Spring,sd__Summer,sd__Winter,cat__Creative,cat__Entertainment,cat__Other,cat__Tech,co__Europe,co__North America,co__Oceania
74198,-0.030944,2.095838,0.98803,0,8.699681,0.144995,False,0,1,0,0,1,0,0,1,0,0,0,1,0
57285,-0.019671,2.095838,0.98803,4,9.830101,0.163835,False,0,0,0,0,0,1,0,1,0,0,1,0,0
94280,-0.033632,-0.309181,-1.039898,1,8.006701,0.26689,False,0,1,0,0,1,0,1,0,0,0,0,1,0
169955,-0.036054,-0.309181,-1.06739,3,5.693934,0.189798,False,0,1,0,0,1,0,1,0,0,0,0,0,1
224769,-0.028533,-0.309181,0.154996,2,9.070183,0.302339,False,0,0,0,0,0,0,0,1,0,0,1,0,0


## Ensemble Method: Weighed Averages


##### current feature engineering (CHANGE ME)

you can start using the train-test-split: 
* X_train_sp
* X_test_sp
* y_train
* y_test

### Final Features (CHANGE ME)
| Column | Data | Decision | Done |
|:--------:|:--------:|:--------:|:--------:|
|  Category  |  >150 Subcategories   |  included in main category   | ignore |
|main_category| 15 categories| checked, makes a difference| use but make even less granular|
|Topic|4 clusters: Tech, Entertaiment, Creative, Other||
|country|country of project|informative, but too many|reduced to continents|
|usd_pledged_real|redundant apparently|same as pledged | ignored|
|deadline| unclear! probably stated end date of kickstarter| used |ignored|
|state|used as target |removed all but "success and fail"|keep and create new column with numerical values|
|country|country of project|informative, but too many|reduced to continents|

### Weighed Average 

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score

In [12]:
#first, get our three models 
model1 = LogisticRegression(C = 0.01, solver= 'lbfgs', random_state = 42)
model2 = KNeighborsClassifier(n_neighbors=21, algorithm='auto', metric='minkowski', p=1, weights='uniform')
model3 = DecisionTreeClassifier(max_depth=3, min_samples_leaf = 1,  min_samples_split = 2, random_state = 42)
#model4 = 

#fit them individually 
model1.fit(X_train_sp,y_train)
model2.fit(X_train_sp,y_train)
model3.fit(X_train_sp,y_train)
#get individual predictions 
pred1 = model1.predict_proba(X_test_sp)
pred2 = model2.predict_proba(X_test_sp)
pred3 = model3.predict_proba(X_test_sp)
#returns array len(X_test) with predicted probability for pos / neg values 
#---- score it: 

acc1 = accuracy_score(y_train, model1.predict(X_train_sp))
acc2 = accuracy_score(y_train, model2.predict(X_train_sp))
acc3 = accuracy_score(y_train, model3.predict(X_train_sp))
#sum it up: 
acc_sum = acc1 + acc2 + acc3
# now it gets interesting: 
weight1 = acc1/acc_sum #which proportion of the outcome
weight2 = acc2/acc_sum #does each prediction
weight3 = acc3/acc_sum #actually have?
#then use those weights on the predictions to go for it: 
finalpred = (pred1*weight1 + pred2*weight2 + pred3*weight3)
#and max that again - as it's an array
finalpred = np.argmax(finalpred.round(0), axis=1)
(y_test == finalpred).sum() / len(finalpred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6380110572657156

In [13]:
#first, get our three models 
model1 = LogisticRegression(C = 0.01, solver= 'lbfgs', random_state = 42)
model2 = KNeighborsClassifier(n_neighbors=21, algorithm='auto', metric='minkowski', p=1, weights='uniform')
model3 = DecisionTreeClassifier(max_depth=3, min_samples_leaf = 1,  min_samples_split = 2, random_state = 42)
#model4 = 

#fit them individually 
model1.fit(X_train_sp,y_train)
model2.fit(X_train_sp,y_train)
model3.fit(X_train_sp,y_train)
#get individual predictions 
pred1 = model1.predict_proba(X_test_sp)
pred2 = model2.predict_proba(X_test_sp)
pred3 = model3.predict_proba(X_test_sp)
#returns array len(X_test) with predicted probability for pos / neg values 
#---- score it: 
#f1_1 = f1_score(y_test, pred1, average='weighted')

f1_1 = f1_score(y_train, model1.predict(X_train_sp), average='weighted')
f1_2 = f1_score(y_train, model2.predict(X_train_sp), average='weighted')
f1_3 = f1_score(y_train, model3.predict(X_train_sp), average='weighted')
#sum it up: 
f1n_sum = f1_1 + f1_2 + f1_3
# now it gets interesting: 
weight1 = f1_1/f1n_sum #which proportion of the outcome
weight2 = f1_2/f1n_sum #does each prediction
weight3 = f1_3/f1n_sum #actually have?
#then use those weights on the predictions to go for it: 
finalpred = (pred1*weight1 + pred2*weight2 + pred3*weight3)
#and max that again - as it's an array
finalpred = np.argmax(finalpred.round(0), axis=1)
(y_test == finalpred).sum() / len(finalpred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6380110572657156