In [None]:
#initial imports 
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import seaborn as sns
import matplotlib.pyplot as plt

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Working directory of the kernel. Inside a notebook this stands in for
# Path(__file__).parent, which only exists for .py scripts.
curr_dir = Path.cwd()

# The project root is TWO levels above the working directory
# (e.g. the notebook lives in 'project/notebooks/<subfolder>/').
# Adjust the number of '.parent' hops if the layout differs.
project_root = curr_dir.parent.parent

# Make the project-local 'utils' package importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project Root added to path: {project_root}")

from utils.feature_engineer_df import build_features 

#for the scaling and encoding 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#display option: show every column when a DataFrame is rendered (no truncation of wide frames)

pd.set_option('display.max_columns', None)

In [None]:
# Resolve every data location once, relative to the directory two levels above the cwd
BASE_DIR = Path.cwd().resolve().parents[1]
data_dir = BASE_DIR / "data"
data_file = data_dir / "feature" / "kickstarter_featured.csv"

# Run the feature-engineering step from our utils.
# NOTE(review): presumably this writes the featured CSV to `output_path`, which is
# read back below - confirm against utils.feature_engineer_df.
build_features(
    input_path=data_dir / "cleaned" / "kickstarter_cleaned.csv",
    output_path=data_file,
    raw_path=data_dir / "raw" / "ks-projects-201801.csv",
    logger=logger,
)

# Load the featured file as a DataFrame
filepath = Path(data_file)
df = pd.read_csv(filepath, encoding='latin-1', low_memory=False)

In [None]:
# Columns that are "hard dropped" from the feature-engineered frame, grouped by reason
irrelevant_cols = ['id']                                    # irrelevant
substituted_cols = ['main_category']                        # substituted in a satisfactory way
raw_date_cols = ['deadline', 'launched']                    # new categories were created from these
future_info_cols = [                                        # everything to do with "future information"
    'backers',
    'usd_pledged_real',
    'usd_pledged_bins',
    'backers_per_pledged',
    'backer_pledged_bins',
    'pledged_per_category',
]
past_year_cols = ['launched_year', 'deadline_year']         # info about the past and not seasonal

columns_to_drop = (
    irrelevant_cols
    + substituted_cols
    + raw_date_cols
    + future_info_cols
    + past_year_cols
)

# Work on a new frame so the loaded `df` stays untouched
dfc = df.drop(columns=columns_to_drop)

In [None]:
# Categorical column -> prefix of its dummy columns.
# The entries are listed in the order in which the dummy frames are concatenated,
# so the resulting column order of `dff` is unchanged.
# Note: the prefixes already end in '_' and get_dummies adds its own separator,
# which is why the encoded columns carry a double underscore (e.g. 'cat__...').
dummy_prefixes = {
    'launch_season': 'sl_',
    'deadline_season': 'sd_',
    'main_category_grouped': 'cat_',
    'continent': 'co_',
    'usd_goal_bins': 'ugb_',
    'duration_bins': 'db_',
    'category_goal_percentile': 'cgp_',
}

# One dummy frame per categorical column; drop_first removes the reference level
dummy_frames = [
    pd.get_dummies(dfc[column], prefix=prefix, drop_first=True, dtype=int)
    for column, prefix in dummy_prefixes.items()
]

#put everything back together again:
dff = pd.concat([dfc, *dummy_frames], axis=1)

## General thoughts

For this prediction project I am doing binary classification: 
* Positive class = successful

* Negative class = failed

Risks of FP: waste of invested time and money

Risks of FN: waste of talent and loss of a great project and its potential

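* Accuracy - for the baseline and general correctness
* Precision - share of predicted successes that really succeed (TP / (TP + FP)); it drops as FP grow
* Recall - share of actual successes that the model finds (TP / (TP + FN)); it drops as FN grow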
* F1 - balance of Precision and Recall

| My goal                               | My metric |
| ---------------------------------------------------- | --------------- |
| **..to avoid launching weak and doomed projects**  | **Precision**   |
| **..to find as many good ideas as possible** | **Recall**      |
| **..to balance both**          | **F1-score**    |

In [None]:

# Remove the raw categorical columns now that their one-hot encoded versions exist.
# 'country' has no dummy counterpart in this notebook; it is simply removed.
encoded_source_columns = [
    'main_category_grouped',
    'continent',
    'launch_season',
    'deadline_season',
    'category_goal_percentile',
    'duration_bins',
    'usd_goal_bins',
    'country',
]

# errors='ignore' keeps the cell idempotent: because `dff` is rebound to the result,
# running the cell a second time would otherwise raise KeyError for columns that are
# already gone.
dff = dff.drop(columns=encoded_source_columns, errors='ignore')
dff.head(2)

In [None]:
# Separate the label vector from the feature matrix
y = dff['target']                       # dependent variable
X = dff.drop(columns=['target'])        # independent variables

# 70/30 train-test split. stratify=y keeps the class proportions of the target
# the same in the train and the test part (it does not balance the classes).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the dimensions before and after the split
print("Df before", dff.shape)
for label, part in (
    ("X_train shape", X_train),
    ("X_test shape", X_test),
    ("y_train shape", y_train),
    ("y_test shape", y_test),
):
    print(label, part.shape)

| Element   | Shape          | Meaning                                         |
| --------- | -------------- | ----------------------------------------------- |
| `dff`     | `(293019, 33)` | Original full dataset: 293,019 rows, 33 columns |
| `X_train` | `(205113, 32)` | 70% of rows used for training, 32 features      |
| `X_test`  | `(87906, 32)`  | 30% of rows for testing, same 32 features       |
| `y_train` | `(205113,)`    | Target values for training set                  |
| `y_test`  | `(87906,)`     | Target values for test set                      |


Uses a stratified split (to preserve class distribution). Output confirms dimensions before and after the split:

Df before (293019, 33)

* X_train shape (205113, 32)
* X_test shape (87906, 32)
* y_train shape (205113,)
* y_test shape (87906,)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with default hyperparameters.
# `clf` is short for classifier; random_state=42 makes every run build the same tree.
# fit() is where the model learns from X_train (input features) and
# y_train (the correct labels: success / failure).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# The trained model is now applied to unseen data (X_test)
y_pred = clf.predict(X_test)

# Summary: a decision tree was trained on historical data and then used to
# predict whether the held-out projects succeed or fail.

In [None]:
#note: the ':.3f' format spec rounds each score to three decimal places
#(variant kept commented out; the next cell prints the same four metrics unrounded)

#acc_score = accuracy_score(y_test, y_pred)
#prec_score = precision_score(y_test, y_pred)
#rec_score = recall_score(y_test, y_pred)
#f_score = f1_score(y_test, y_pred)
#print(f"Accuracy score: {acc_score:.3f}")
#print(f"Precision score: {prec_score:.3f}")
#print(f"Recall score: {rec_score:.3f}")
#print(f"F1 score: {f_score:.3f}")

In [None]:
#baseline performance doesn't look great
baseline_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)
for label, metric_fn in baseline_metrics:
    print(label, metric_fn(y_test, y_pred))

In [None]:
# GridSearchCV looks for the best combination of hyperparameters by scoring
# every combination in the grid with cross-validation.
from sklearn.model_selection import GridSearchCV

# Metrics computed for every candidate (display name -> scikit-learn scorer name)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1_score': 'f1'
}

# Search space: GridSearchCV will try all combinations of these values
param_grid = {
    'max_depth': [3, 5, 10, None],          # limits the depth of the tree, prevents overfitting if too deep
    'min_samples_split': [2, 5, 10],        # minimum samples to split a node
    'min_samples_leaf': [1, 2, 4]           # minimum samples at a leaf node, helps smooth predictions and reduce noise
}


def make_tree_grid_search(refit_metric):
    """Build an unfitted grid search over `param_grid` for a decision tree.

    All searches share the estimator, search space, scorers, folds and
    parallelism; they differ only in the metric used to pick the winner.

    Parameters
    ----------
    refit_metric : str
        Key of `scoring` whose cross-validated score selects the best
        parameter combination.

    Returns
    -------
    GridSearchCV
        A configured search that still has to be fitted.
    """
    return GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),  # base estimator (decision tree)
        param_grid=param_grid,                              # search space
        scoring=scoring,                                    # multiple metrics
        refit=refit_metric,                                 # metric that selects the best model
        cv=5,                                               # 5-fold cross-validation
        n_jobs=-1,                                          # use all CPU cores for faster computation
    )


# One search per selection metric. The four searches evaluate exactly the same
# candidates; only the metric that picks the winner differs.
grid_accuracy = make_tree_grid_search('accuracy')    # selects the model with the highest accuracy
grid_recall = make_tree_grid_search('recall')        # useful when missing successes is costly
grid_precision = make_tree_grid_search('precision')  # useful when false positives are costly
grid_f1 = make_tree_grid_search('f1_score')          # often best for imbalanced datasets

# When fitted, each search will:
#   - train and evaluate the model on each fold
#   - try every combination of the parameters
#   - select the best one based on its refit metric

## cv=5: Cross-Validation

cv=5 tells GridSearchCV to use 5-fold cross-validation.

Cross-validation is a technique for evaluating how well the model generalizes to new, unseen data.

The training data is split into 5 equal parts (folds). The model is trained on 4 folds and tested on the remaining fold. This process repeats 5 times, each time using a different fold as the validation set. The results (accuracy, recall) are averaged to give a more reliable estimate of model performance. This avoids the risk of getting lucky or unlucky with a single train/test split.

- Why it matters:

Gives a more robust evaluation than a single split. Helps avoid overfitting or underfitting during hyperparameter tuning.

## n_jobs=-1: CPU Core Usage

n_jobs=-1 tells GridSearchCV to use all available CPU cores to run faster. 

Computer's processor (CPU) has multiple cores. Each core can run a task in parallel with others.

- Why it matters:

GridSearchCV needs to train many models with different hyperparameters. Running these one by one is slow. With n_jobs=-1, scikit-learn parallelizes the work: it trains multiple models at the same time, which saves a lot of time, especially on large datasets.

Setting n_jobs=-1 is a best practice when tuning models with scikit-learn.

In [None]:
# Searches keyed by the metric each one selects on (same order as they are fitted)
grid_searches = {
    'accuracy': grid_accuracy,
    'precision': grid_precision,
    'recall': grid_recall,
    'f1_score': grid_f1,
}

# Fit all searches first. Each fit cross-validates every parameter combination,
# so this is the expensive part of the notebook.
for grid in grid_searches.values():
    grid.fit(X_train, y_train)

# Label every line with its metric; four bare "Best Params:" lines cannot be
# told apart in the output.
for metric_name, grid in grid_searches.items():
    print(f"Best Params ({metric_name}):", grid.best_params_)

Each .fit() call:

- Runs a grid search over all combinations of hyperparameters.

- Uses cross-validation (5 folds).

- Evaluates each model using accuracy, precision, recall, and F1-score.

- Selects the best model based on the metric I choose.

- Stores the best model in .best_estimator_.

## Interpretation

If the goal is:

- Reliability of success predictions → Accuracy/precision model.

- Catching all successes → Recall model.

- Balance → F1 model.

In [None]:
#use the best model found by each search (order: accuracy, precision, recall, f1)
best_accuracy_clf, best_precision_clf, best_recall_clf, best_f1_clf = (
    search.best_estimator_
    for search in (grid_accuracy, grid_precision, grid_recall, grid_f1)
)

In [None]:
# Predict on the held-out test set with each tuned tree
y_pred_acc, y_pred_prec, y_pred_rec, y_pred_f1 = (
    model.predict(X_test)
    for model in (best_accuracy_clf, best_precision_clf, best_recall_clf, best_f1_clf)
)

# Each model is scored only on the metric it was selected for
tuned_results = (
    ("Accuracy Model:", accuracy_score, y_pred_acc),
    ("Precision Model:", precision_score, y_pred_prec),
    ("Recall Model:", recall_score, y_pred_rec),
    ("F1 Model:", f1_score, y_pred_f1),
)
for label, metric_fn, predictions in tuned_results:
    print(label, metric_fn(y_test, predictions))

In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate this
# argument as a list and reject a pandas Index.
plot_tree(
    best_f1_clf,
    filled=True,
    feature_names=list(X_train.columns),
    max_depth=4,            # depth 0..4 -> the first five levels of the tree
    fontsize=10,
)
plt.title("Simplified Decision Tree Structure (First 5 Levels)")
plt.show()

#how to read the plot - each box is a decision node or a leaf:
#split condition (usd_goal_real <= 15002.375)
#gini: gini impurity (0 = pure, 0.5 = most impure)
#samples: number of data points that reached this node
#value = [class_0, class_1]: count of samples per class (failed, successful)
#color: indicates purity and dominant class (blueish = more class 1, orange = more class 0)

## Interpretation
- usd_goal_real:	    

most influential split feature — low goals lead to better success chances

- duration_days:	    

shorter campaigns among low-goal projects increase likelihood of success

- db__4 weeks:	        

duration bin split matters for medium-to-high goal campaigns

- cat__Entertainment:	

certain categories perform differently — entertainment slightly more successful

In [None]:
#Confusion Matrices
       #Predicted
            #0     1
#Actual  0 |  TN |  FP |
        #1 |  FN |  TP |

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

#define a helper to show the confusion matrix
def show_conf_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of `y_pred` against `y_true` and show it.

    The matrix is rendered with the 'Blues' colour map and `title` is used as
    the plot title. Nothing is returned; the figure is displayed directly.
    """
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred)
    )
    matrix_display.plot(cmap='Blues')
    plt.title(title)
    plt.show()

#display one confusion matrix per optimized model
for metric_label, predictions in (
    ("Accuracy", y_pred_acc),
    ("Precision", y_pred_prec),
    ("Recall", y_pred_rec),
    ("F1", y_pred_f1),
):
    show_conf_matrix(y_test, predictions, f"Confusion Matrix - {metric_label} Optimized")


* More TP: better at finding successful projects

* More FP: more false alarms (bad projects that looked good)

* More FN: missed opportunities (good projects predicted as failures)

- FP ↓ = Precision ↑

- FN ↓ = Recall ↑

In [None]:
#Feature importance plots, one per tuned decision tree

def plot_top_feature_importances(model, feature_names, model_label, top_n=15):
    """Show a horizontal bar chart of a fitted tree's most important features.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`
    feature_names : sequence of str
        Names matching the order of the columns the model was trained on.
    model_label : str
        Name of the metric the model was optimized for; used in the title.
    top_n : int, default 15
        Number of features to display.
    """
    # Ascending sort + tail keeps the top_n largest values, with the most
    # important feature ending up at the top of the horizontal bar chart.
    top_importances = (
        pd.Series(model.feature_importances_, index=feature_names)
        .sort_values(ascending=True)
        .tail(top_n)
    )

    # Explicit figure/axes so every call draws on its own, fresh figure
    fig, ax = plt.subplots(figsize=(10, 6))
    top_importances.plot(kind='barh', ax=ax)
    ax.set_title(f"Top {top_n} Feature Importances ({model_label}-Optimized Decision Tree)")
    ax.set_xlabel("Importance Score")
    fig.tight_layout()
    plt.show()


#same order as before: F1, precision, recall, accuracy
for model_label, model in (
    ("F1", best_f1_clf),
    ("Precision", best_precision_clf),
    ("Recall", best_recall_clf),
    ("Accuracy", best_accuracy_clf),
):
    plot_top_feature_importances(model, X_train.columns, model_label)


## Explanation

* feature_importances_: gives the relative importance of each feature used by the tree in making splits.

* sort_values(ascending=True).tail(15): picks the 15 most important features, sorted from least to most.

* barh: horizontal bar chart makes it easy to compare feature impact.

## Interpretation

* Features like usd_goal_real, duration_days or specific one-hot-encoded season/category features seemed to be important.

* Differences across models (e.g. F1 vs. Precision) can show which features matter more for recall and which for precision.

## HOW CAN I IMPROVE MY MODEL?

1. Class weighting 

- for class imbalance

2. Reducing Features and trying both reduced and not-reduced features for modeling


## 1. Class imbalance 

 class_weight='balanced' 
 
 unbalanced classes can hurt the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Default-parameter tree again, this time with class_weight='balanced'.
# NOTE: this rebinds `clf`, so the unweighted baseline tree is no longer
# reachable under that name after this cell.
clf = DecisionTreeClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)
clf



In [None]:
#predict on the test set with the class-weighted tree fitted in the previous cell
y_pred_balanced = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set scores of the class-weighted tree
print("Balanced Decision Tree Metrics:")
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_balanced))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the class-weighted tree on the test set
balanced_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_balanced,
    cmap="Blues",
)
balanced_display.ax_.set_title("Confusion Matrix - Balanced Decision Tree")
plt.show()


## Note
Using class_weight='balanced' made my model worse, so it is not a suitable way to improve this model.

* Apparently that happens when classes are not severely imbalanced

* The decision boundary becomes distorted

Let's try something different..

## 2. Feature selection


In [None]:
from sklearn.feature_selection import SelectFromModel

# Reuse the already fitted F1-tuned tree (prefit=True) to decide which features to keep
selector = SelectFromModel(best_f1_clf, prefit=True)

# Apply the same selection to the train and the test features
X_train_reduced, X_test_reduced = (
    selector.transform(features) for features in (X_train, X_test)
)

n_features_before = X_train.shape[1]
n_features_after = X_train_reduced.shape[1]
print("Original feature count:", n_features_before)
print("Reduced feature count:", n_features_after)


In [None]:
# Default-parameter tree trained on the selected features only
clf_reduced = DecisionTreeClassifier(random_state=42).fit(X_train_reduced, y_train)
y_pred_reduced = clf_reduced.predict(X_test_reduced)

for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_reduced))


In [None]:
# Boolean mask of the columns the selector kept -> names of the kept features
kept_mask = selector.get_support()
selected_features = X_train.columns[kept_mask]
print(list(selected_features))

## Note:
This model performs slightly worse. By removing less important or noisy features, I might have oversimplified my model.

What if I apply RF and XGBoost on reduced features?

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, trained on the selected features only
rf_clf = RandomForestClassifier(random_state=42).fit(X_train_reduced, y_train)
y_pred_rf = rf_clf.predict(X_test_reduced)

print("Random Forest on Reduced Features:")
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_rf))


In [None]:
from xgboost import XGBClassifier

# XGBoost trained on the selected features only
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_clf.fit(X_train_reduced, y_train)

y_pred_xgb = xgb_clf.predict(X_test_reduced)

print("XGBoost on Reduced Features:")
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_xgb))


What if I train on the original, non-reduced training set?

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, trained on all features.
# NOTE: this rebinds `rf_clf` and `y_pred_rf` from the reduced-feature run.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Random Forest on Not-Reduced Features:")
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_rf))

In [None]:
# XGBoost on all (not-reduced) features.
#
# Fitting directly on X_train raised
#   ValueError: feature_names must be string, and may not contain [, ] or <
# because XGBoost validates DataFrame column names. Some one-hot encoded column
# names evidently contain one of those characters (presumably the interval-style
# bin labels - verify), so they are replaced by '_' on copies of the frames.
xgb_safe_columns = X_train.columns.astype(str).str.replace(r"[\[\]<]", "_", regex=True)
assert xgb_safe_columns.is_unique, "sanitised feature names collide"

X_train_xgb = X_train.set_axis(xgb_safe_columns, axis=1)
X_test_xgb = X_test.set_axis(xgb_safe_columns, axis=1)

# Separate names so the reduced-feature model and its predictions stay available
xgb_full_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_full_clf.fit(X_train_xgb, y_train)

# Predict on the full-feature test set (the disabled draft used X_test_reduced,
# which does not match a model trained on all features)
y_pred_xgb_full = xgb_full_clf.predict(X_test_xgb)

print("XGBoost on Not-Reduced Features:")
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric_fn(y_test, y_pred_xgb_full))

## Final thoughts on my models:

Why did other models work slightly worse?

I explored the following model improvements:

- Class weighting

- Feature selection

- Ensemble models (Random Forest, XGBoost) - with and without reduced features


None of them surpassed the performance of the tuned decision tree in the beginning:

* Perhaps the feature space is relatively simple

* Decision trees already captured the important patterns

* Overcomplicating the model led to overfitting or minimal gain

## Summary

Careful feature engineering and grid search might matter more than model choice.

Model achieved a good balance between detecting successes and avoiding false positives.

## As a reminder, the results of my tuned decision trees (not the baseline!), each scored on the metric it was optimized for:
Accuracy Model: 0.64646326758128

Precision Model: 0.6177634130575307

Recall Model: 0.6988080722621743

F1 Model: 0.6193731942392096