# Libraries and Functions

In [1]:
#import Libraries

import numpy as np #IFYKYK
import pandas as pd #IFYKYK
import matplotlib.pyplot as plt #visuals
import seaborn as sns #visuals
import sqlite3 #load data
from sqlite3 import Error #load data

#modelling Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [12]:
#functions 

#load data
def create_connection(path):
    """Open a SQLite database and return the connection.

    Parameters
    ----------
    path : str
        Location of the SQLite database file, passed to ``sqlite3.connect``.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when ``sqlite3.connect`` raised a
        ``sqlite3.Error`` (the error is printed, not re-raised).
    """
    try:
        db_handle = sqlite3.connect(path)
    except Error as e:
        # Best-effort: report the problem and let the caller see None.
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return db_handle

def execute_read_query(connection, query):
    """Run a read query and return every resulting row.

    Parameters
    ----------
    connection : sqlite3.Connection
        An open database connection.
    query : str
        SQL text to execute.

    Returns
    -------
    list or None
        The rows from ``cursor.fetchall()``, or ``None`` when executing or
        fetching raised a ``sqlite3.Error`` (the error is printed, not
        re-raised).
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    except Error as e:
        print(f"The error '{e}' occurred")
        return None
    finally:
        # Release the cursor on both the success and the error path.
        cursor.close()


#declare variables
def declare_variables(dataframe, target):
    """Separate a DataFrame into predictors and target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    target : label
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``dataframe`` without the target column
        and ``y`` is the target column itself. ``dataframe`` is not modified.
    """
    features = dataframe.drop(columns=target)
    labels = dataframe[target]
    return features, labels

#split data - stratify y_var
def split_data(X_var, y_var, testing_size, random_state=42):
    """Split features and target into stratified train and test sets.

    Parameters
    ----------
    X_var : array-like
        Feature matrix.
    y_var : array-like
        Target values; also used as the stratification labels.
    testing_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split (and every
        score computed from it) is the same on each run. Pass ``None`` to
        get a different random split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_var,
        y_var,
        test_size=testing_size,
        stratify=y_var,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test


#eval models
def eval_models(actual_data, predicted_data):
    """Report classification quality for one set of predictions.

    Prints the scikit-learn classification report and draws the confusion
    matrix for the given labels.

    Parameters
    ----------
    actual_data : array-like
        True target labels.
    predicted_data : array-like
        Labels predicted by a model, aligned with ``actual_data``.

    Returns
    -------
    numpy.ndarray
        The confusion matrix computed by ``confusion_matrix``.
    """
    print(classification_report(actual_data, predicted_data))

    matrix = confusion_matrix(actual_data, predicted_data)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.show()

    return matrix

# Load Data

In [3]:
# Open the SQLite database file that the queries below read from
connection = create_connection(path='kickstarter.sqlite')

Connection to SQLite DB successful


In [28]:
# Pull campaign rows joined with their category, subcategory and country
# names, keeping US campaigns only (they make up most of the data).
my_query = """SELECT Campaign_ID, Launch_Date, End_Date, Goal, Pledged, Num_Donors, Status, Category, Subcategory
            FROM Campaigns 
            INNER JOIN Categories ON Campaigns.Category_ID=Categories.Category_ID
            INNER JOIN Subcategories ON Campaigns.Subcategory_ID=Subcategories.Subcategory_ID
            INNER JOIN Countries ON Campaigns.Country_ID=Countries.Country_ID
            WHERE Project_Country == 'United States of America'"""

# Run the query over the open connection and load the result into a DataFrame
data = pd.read_sql_query(sql=my_query, con=connection)

# Preview the first five rows
data.head(5)

Unnamed: 0,Campaign_ID,Launch_Date,End_Date,Goal,Pledged,Num_Donors,Status,Category,Subcategory
0,1,2016-08-11 00:00:00,2016-10-10 00:00:00,100000000,0,0,Failed,Film & Video,Science Fiction
1,10,2017-08-14 00:00:00,2017-09-13 00:00:00,100000000,1,1,Failed,Film & Video,Film & Video
2,1000,2016-09-15 00:00:00,2016-11-14 00:00:00,2000000,10462,31,Failed,Design,Design
3,10000,2013-03-12 00:00:00,2013-04-14 00:00:00,220000,14128,69,Failed,Technology,Technology
4,100000,2015-05-18 00:00:00,2015-07-17 00:00:00,22000,22223,105,Successful,Film & Video,Narrative Film


# Data Preprocessing

In [29]:
# New feature: campaign duration in whole days
launch_dates = pd.to_datetime(data['Launch_Date'])  # parse to datetime
end_dates = pd.to_datetime(data['End_Date'])  # parse to datetime
data['Duration (Days)'] = (end_dates - launch_dates).dt.days

# Change Status to 0 and 1 - Canceled and Suspended are considered Failed
status_dict = {'Failed': 0, 'Canceled': 0, 'Suspended': 0, 'Successful': 1}

# map() instead of replace(): replace() relies on implicit object->int
# downcasting (deprecated in pandas) and silently leaves unmapped statuses
# as strings. map() turns anything outside status_dict into NaN, which is
# caught here instead of surfacing later inside the model.
status_encoded = data['Status'].map(status_dict)
unmapped_mask = status_encoded.isna()
if unmapped_mask.any():
    unmapped_values = data.loc[unmapped_mask, 'Status'].unique().tolist()
    raise ValueError(f"Unmapped Status values: {unmapped_values}")
data['Status'] = status_encoded.astype('int64')

# Drop the identifier, the raw dates (replaced by Duration) and the
# text category columns.
# NOTE(review): Pledged and Num_Donors stay in as features; they look like
# values only known once a campaign has run, which would leak the target -
# confirm before trusting the scores.
data = data.drop(columns=['Campaign_ID', 'Launch_Date', 'End_Date', 'Category', 'Subcategory'])


# Declaring Variables and Splitting

In [30]:
# Separate the predictors (X) from the Status target (y)
X, y = declare_variables(dataframe=data, target='Status')

# Hold out 30% of the rows for testing; split_data stratifies on y
X_train, X_test, y_train, y_test = split_data(X_var=X, y_var=y, testing_size=0.3)

# Logistic Regression

## Raw Data Only - No Feature Transformations (Standard Scaling Only)

In [35]:
# Two-step pipeline: a StandardScaler followed by a LogisticRegression,
# both with their default settings
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])

In [36]:
# Fit the scaler and the classifier on the training split
pipeline.fit(X=X_train, y=y_train)

In [37]:
# 5-fold cross-validation of the pipeline on the training split
cv_scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=5)

In [38]:
# Show the score of each cross-validation fold, numbered from 1
print("Cross-Validation Scores:")
for fold, score in zip(range(1, len(cv_scores) + 1), cv_scores):
    print(f"Fold {fold}: {score:.4f}")

Cross-Validation Scores:
Fold 1: 0.8751
Fold 2: 0.8730
Fold 3: 0.8724
Fold 4: 0.8781
Fold 5: 0.8762
