# Libraries and Formulas

In [32]:
#import Libraries

import numpy as np #IFYKYK
import pandas as pd #IFYKYK
import matplotlib.pyplot as plt #visuals
import seaborn as sns #visuals
import sqlite3 #load data
from sqlite3 import Error #load data

#modelling Libraries
from sklearn.preprocessing import StandardScaler, FunctionTransformer, QuantileTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
from sklearn.compose import ColumnTransformer



In [2]:
#functions 

#load data
def create_connection(path):
    connection = None
    try:
        connection = sqlite3.connect(path)
        print("Connection to SQLite DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return connection

def execute_read_query(connection, query):
    cursor = connection.cursor()
    result = None
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        return result
    except Error as e:
        print(f"The error '{e}' occurred")


#declare variables
def declare_variables(dataframe, target):
    X = dataframe.drop(target, axis=1)
    y = dataframe[target]
    return X, y

#split data - stratify y_var
def split_data(X_var, y_var, testing_size):
    X_train, X_test, y_train, y_test = train_test_split(X_var, y_var, test_size=testing_size, stratify=y_var)
    return X_train, X_test, y_train, y_test


#eval models
def eval_models(actual_data, predicted_data, train_data=None):
    report = classification_report(actual_data, predicted_data)
    if train_data == True:
        print(f"Classification Report for the Train model:")
        print(report)
    else:
        print(f"Classification report for the Test model:")
        print(report)
    

# Load Data

In [3]:
#create connection to db
connection = create_connection('kickstarter.sqlite')

Connection to SQLite DB successful


In [4]:
#query db for data - US only because most of data
my_query = """SELECT Campaign_ID, Launch_Date, End_Date, Goal, Pledged, Num_Donors, Status, Category, Subcategory
            FROM Campaigns 
            INNER JOIN Categories ON Campaigns.Category_ID=Categories.Category_ID
            INNER JOIN Subcategories ON Campaigns.Subcategory_ID=Subcategories.Subcategory_ID
            INNER JOIN Countries ON Campaigns.Country_ID=Countries.Country_ID
            WHERE Project_Country == 'United States of America'"""

#store query results in df
data = pd.read_sql_query(my_query, connection)

data.head()

Unnamed: 0,Campaign_ID,Launch_Date,End_Date,Goal,Pledged,Num_Donors,Status,Category,Subcategory
0,1,2016-08-11 00:00:00,2016-10-10 00:00:00,100000000,0,0,Failed,Film & Video,Science Fiction
1,10,2017-08-14 00:00:00,2017-09-13 00:00:00,100000000,1,1,Failed,Film & Video,Film & Video
2,1000,2016-09-15 00:00:00,2016-11-14 00:00:00,2000000,10462,31,Failed,Design,Design
3,10000,2013-03-12 00:00:00,2013-04-14 00:00:00,220000,14128,69,Failed,Technology,Technology
4,100000,2015-05-18 00:00:00,2015-07-17 00:00:00,22000,22223,105,Successful,Film & Video,Narrative Film


# Data Preprocessing

In [5]:
#Calc Duration in Days
#new feature called Duration
data['Launch_Date'] = pd.to_datetime(data['Launch_Date']) #convert to dt
data['End_Date'] = pd.to_datetime(data['End_Date']) #convert to dt

#calc Duration
data['Duration (Days)'] = (data['End_Date'] - data['Launch_Date']).dt.days


#Chnage Status to 0 and 1 - Cancelled and suspended are considered Failed
status_dict = {'Failed': 0, 'Canceled': 0, 'Suspended': 0, 'Successful': 1}

data['Status'] = data['Status'].replace(status_dict)

#drop Campaign_ID, Launch and End Dates, Category and Subcategory
data.drop(['Campaign_ID', 'Launch_Date', 'End_Date', 'Category', 'Subcategory'], axis=1, inplace=True)


# Declaring Variables and Splitting

In [6]:
#declare variables
X, y = declare_variables(data, 'Status')

#split data y is stratified in the function
X_train, X_test, y_train, y_test = split_data(X, y, testing_size=0.3, shuffle=True)

# Logistic Regression

## Raw Data only - no Transformations

In [8]:
#declare pipeline
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])

In [9]:
#fit the pipeline
pipeline.fit(X_train, y_train)

In [10]:
#cross validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

In [11]:
#eval cross val
print("Cross-Validation Scores:")
for fold, score in enumerate(cv_scores, start=1):
    print(f"Fold {fold}: {score:.4f}")

Cross-Validation Scores:
Fold 1: 0.8725
Fold 2: 0.8694
Fold 3: 0.8737
Fold 4: 0.8724
Fold 5: 0.8748


Cross val scores are pretty good, let's expand on the model

In [12]:
#results on train and test data
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

In [16]:
#eval train model
eval_models(y_train, train_pred, train_data=True)

Classification Report for the Train model:
              precision    recall  f1-score   support

           0       0.85      0.96      0.91    156930
           1       0.93      0.75      0.83    102430

    accuracy                           0.88    259360
   macro avg       0.89      0.86      0.87    259360
weighted avg       0.88      0.88      0.88    259360



In [17]:
#eval test model
eval_models(y_test, test_pred, train_data=False)

Classification report for the Test model:
              precision    recall  f1-score   support

           0       0.86      0.96      0.91     67256
           1       0.93      0.75      0.83     43899

    accuracy                           0.88    111155
   macro avg       0.89      0.86      0.87    111155
weighted avg       0.89      0.88      0.88    111155



Mhmm, the results are almost identical

## Logistic Regression + Data transformation (no Binning)

In [21]:
#transform data - use function and column transformer

#initalize quantiletransformer
qt = QuantileTransformer(output_distribution='normal')

#functions to use

#log transform and account for 0
def log_transform(dataframe):
    log_value = np.log1p(dataframe)
    round_log = log_value.round(0)
    return round_log

#Quantile Transformer
def quantile_transformer(dataframe):
    new_data = qt.fit_transform(dataframe)
    return new_data

#.values.reshape(-1, 1) dont need this as part of quantile transformer

#Square root
def square_root(dataframe):
    my_root = np.sqrt(dataframe).round(2)
    return my_root


let's transform some of the numerical data and reassess the heatmap
- Log for Goal
- log and quantile transformer for pledged
- aquare root and log for Num_donors
- quantile transformer Duration

In [23]:
#define transformations
my_transformations = [
    ('log_goal', FunctionTransformer(func=log_transform), ['Goal']),
    ('log_pledge', FunctionTransformer(func=log_transform), ['Pledged']),
    ('sqrt_donors', FunctionTransformer(func=square_root), ['Num_Donors']),
    ('quantile_transform', FunctionTransformer(func=quantile_transformer), ['Goal', 'Pledged', 'Duration (Days)']),
    ('log_donors', FunctionTransformer(func=log_transform), ['Num_Donors'])
]

#create columntransformer
ct = ColumnTransformer(transformers=my_transformations)


In [24]:
#pipeline with transformations
pipeline = Pipeline(steps=[
  ('column transformer', ct),
  ('scaler', StandardScaler()),
  ('model', LogisticRegression())  
])

In [25]:
#fit pipeline
pipeline.fit(X_train, y_train)

In [26]:
#cross val on the data and model
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

In [27]:
#eval cross val
print("Cross-Validation Scores:")
for fold, score in enumerate(cv_scores, start=1):
    print(f"Fold {fold}: {score:.4f}")

Cross-Validation Scores:
Fold 1: 0.9880
Fold 2: 0.9895
Fold 3: 0.9885
Fold 4: 0.9882
Fold 5: 0.9887


Cross validation scores increased

In [28]:
#get prediction
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

In [29]:
#eval train
eval_models(y_train, train_pred, train_data=True)

Classification Report for the Train model:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    156930
           1       0.97      1.00      0.98    102430

    accuracy                           0.99    259360
   macro avg       0.98      0.99      0.99    259360
weighted avg       0.99      0.99      0.99    259360



In [33]:
balanced_acc = balanced_accuracy_score(y_train, train_pred)
print(f"Balanced Accuracy: {balanced_acc:.4f}")

Balanced Accuracy: 0.9896


In [31]:
#eval test data
eval_models(y_test, test_pred)

Classification report for the Test model:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     67256
           1       0.97      1.00      0.98     43899

    accuracy                           0.99    111155
   macro avg       0.98      0.99      0.99    111155
weighted avg       0.99      0.99      0.99    111155



In [34]:
balanced_acc = balanced_accuracy_score(y_test, test_pred)
print(f"Balanced Accuracy: {balanced_acc:.4f}")

Balanced Accuracy: 0.9887
