<span style="font-size: 20px;"><b>This is the <span style="font-size: 30px;">Preprocessing</span> part of the file.</b></span>

<b>Importing the required packages and loading the cleaned data</b>

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

# Read the cleaned dataset written to the processed-data folder and preview the first rows.
cleaned_csv_path = "../data/processed/01_eda_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launch_year,launch_month
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015,8
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017,9
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012,3
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015,7


<b>Handling Missing Values</b>

In [28]:
# Discard rows that are missing any of the fields listed below.
required_cols = ["name", "category", "main_category", "state"]
df.dropna(subset=required_cols, inplace=True)

In [30]:
# Parse both timestamp columns as datetimes; errors="coerce" turns unparseable
# values into NaT instead of raising.
for date_col in ("deadline", "launched"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

<b>Feature Engineering</b>

In [33]:
# Campaign duration: whole days elapsed between launch and deadline.
campaign_length = df["deadline"] - df["launched"]
df["campaign_days"] = campaign_length.dt.days

In [35]:
# Binary target: keep only campaigns with a final outcome (successful / failed)
# and encode that outcome as 1 / 0.
outcome_map = {"successful": 1, "failed": 0}

# .copy() makes the filtered frame an independent object, so the column
# assignments that follow do not write into a slice of the original frame
# (which would raise SettingWithCopyWarning).
df = df[df["state"].isin(list(outcome_map))].copy()
df["target"] = df["state"].map(outcome_map)

<b>Encoding Categorical Variables</b>

In [38]:
# Label-encode each categorical column, keeping the fitted encoder per column
# so the integer codes can be mapped back to the original labels later.
label_enc_cols = ["category", "main_category", "currency", "country"]
label_encoders = {col: LabelEncoder() for col in label_enc_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

<b>Scaling Numerical Features</b>

In [41]:
# Numeric columns to standardise with StandardScaler.
num_features = ["goal", "pledged", "backers", "usd_pledged_real", "usd_goal_real", "campaign_days"]

# Fit the scaler on the training rows only. Fitting on the full frame would let
# test-set statistics influence the features the model is trained on.
# The arguments below mirror the train/test split performed later in this
# notebook (test_size=0.3, random_state=42, stratified on the target), so the
# rows selected here are the rows that end up in the training set.
# Keep the two calls in sync.
train_idx = train_test_split(
    df.index,
    test_size=0.3,
    random_state=42,
    stratify=df["target"],
)[0]

scaler = StandardScaler()
scaler.fit(df.loc[train_idx, num_features])

# Apply the training-set mean/std to every row (train and test alike).
df[num_features] = scaler.transform(df[num_features])

<b>Train Test Split</b>

In [44]:
# Feature matrix: every column except the ones dropped below; the label is "target".
non_feature_cols = ["ID", "name", "state", "target", "deadline", "launched"]
X = df.drop(columns=non_feature_cols)
y = df["target"]

# NOTE(review): X still contains pledged / backers / usd pledged / usd_pledged_real.
# These look like campaign-outcome values and presumably leak the target --
# confirm whether they should be excluded before modelling.

# 70/30 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [46]:
# Report the dimensions of both feature splits, one per line.
print(
    f"Train shape: {X_train.shape}",
    f"Test shape: {X_test.shape}",
    sep="\n",
)

Train shape: (232170, 13)
Test shape: (99502, 13)


<b>Save Processed Data</b>

In [49]:
# Write each split to its own CSV (without the index), in the order listed.
split_outputs = {
    "../data/processed/X_train.csv": X_train,
    "../data/processed/y_train.csv": y_train,
    "../data/processed/X_test.csv": X_test,
    "../data/processed/y_test.csv": y_test,
}
for csv_path, split_data in split_outputs.items():
    split_data.to_csv(csv_path, index=False)

<b>This concludes the Preprocessing part of the project.</b>