In [None]:
import logging
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure logging: request INFO level on the root logger and create the
# notebook-wide `logger` that the data-loading cell uses to report row and
# column counts.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# Load both raw Kickstarter exports as DataFrames.
# NOTE(review): BASE_DIR is the grandparent of the current working directory,
# so this assumes the kernel is started two levels below the project root.
BASE_DIR = Path.cwd().resolve().parents[1]
RAW_DIR = BASE_DIR / "data" / "raw"
data_file_1 = RAW_DIR / "ks-projects-201612.csv"
data_file_2 = RAW_DIR / "ks-projects-201801.csv"

# Both names are kept for any later cell that uses them; the values are
# already Path objects, so no re-wrapping is needed.
filepath_1 = data_file_1
filepath_2 = data_file_2

# The 2016 export is decoded as latin-1; the 2018 export uses pandas' default.
df1 = pd.read_csv(filepath_1, encoding='latin-1', low_memory=False)
df2 = pd.read_csv(filepath_2, low_memory=False)

# Name the file in each message so the two lines can be told apart, and let
# the logging module do the formatting (lazy %-style arguments).
logger.info("Loaded %s: %d rows and %d columns", filepath_1.name, len(df1), len(df1.columns))
logger.info("Loaded %s: %d rows and %d columns", filepath_2.name, len(df2), len(df2.columns))

In [None]:
# Normalise the column labels of both frames: trim surrounding whitespace,
# turn the remaining spaces into underscores, then lower-case everything.
for frame in (df1, df2):
    trimmed = frame.columns.str.strip()
    underscored = trimmed.str.replace(' ', '_')
    frame.columns = underscored.str.lower()

In [None]:
# Show the normalised column labels of each frame as plain Python lists.
columns_1 = list(df1.columns)
columns_2 = list(df2.columns)
print(columns_1)
print(columns_2)

In [None]:
# Report the (rows, columns) shape of each frame, one per line.
shape_lines = (
    f"Shape of data 1: {df1.shape}",
    f"Shape of data 2: {df2.shape}",
)
print(*shape_lines, sep="\n")


In [None]:
# Column dtypes pandas assigned to the first dataset.
dtypes_1 = df1.dtypes
print(f"Data Types of data 1: \n{dtypes_1}")

In [None]:
# Column dtypes pandas assigned to the second dataset.
dtypes_2 = df2.dtypes
print(f"Data Types of data 2: \n{dtypes_2}")

In [None]:
# Number of missing values per column in the first dataset.
null_counts_1 = df1.isnull().sum()
print(f"Null Amount of data 1: \n{null_counts_1}")

In [None]:
# Number of missing values per column in the second dataset.
null_counts_2 = df2.isnull().sum()
print(f"Null Amount of data 2: \n{null_counts_2}")

In [None]:
# Share of missing values per column in the first dataset, in percent
# (rounded to four decimals).
null_pct_1 = (df1.isnull().sum() / len(df1) * 100).round(4)
print(f"Percent of Null Amount of data 1: \n{null_pct_1}")

In [None]:
# Share of missing values per column in the second dataset, in percent
# (rounded to four decimals).
null_pct_2 = (df2.isnull().sum() / len(df2) * 100).round(4)
print(f"Percent of Null Amount of data 2: \n{null_pct_2}")


In [None]:
# Compare the two exports by size and by project-ID overlap.
print(f"Dataset 1: {df1.shape}")
print(f"Dataset 2: {df2.shape}")

# Work on sets of distinct IDs. Subtracting the overlap from len(df) would
# count rows, not IDs, and over-report "unique" IDs whenever an ID repeats
# inside a frame; set differences are correct either way.
ids_1 = set(df1['id'])
ids_2 = set(df2['id'])
common_ids = ids_1 & ids_2
print(f"Common IDs: {len(common_ids)}")
print(f"Unique IDs in df1: {len(ids_1 - ids_2)}")
print(f"Unique IDs in df2: {len(ids_2 - ids_1)}")

In [None]:
# Decision: the rest of the analysis uses df2 (the 2018-01 export) only.

In [None]:
# Visualise where df2 has missing values: the frame of isna() flags is drawn
# as a heatmap, so missing cells show up in the contrasting colour.
# (seaborn / matplotlib are imported in the notebook's top import cell.)
missing_axes = sns.heatmap(df2.isna(), cbar=False)
missing_axes.set_title("Missing Values in Dataset 2")
plt.show()

In [None]:
# Projects without a name get the placeholder 'unknown'; the null counts are
# printed afterwards to confirm the effect.
names_filled = df2["name"].fillna("unknown")
df2["name"] = names_filled
null_counts_after_fill = df2.isnull().sum()
print(f"Null Amount of data 2: \n{null_counts_after_fill}")

In [None]:
# usd_pledged is considered unreliable and usd_pledged_real is available
# instead, so the former is dropped.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
df2 = df2.drop(columns=["usd_pledged"], errors="ignore")

In [None]:
# Count rows whose id already appeared earlier in df2 (0 means ids are unique).
repeated_id_mask = df2["id"].duplicated()
repeated_id_mask.sum()

In [None]:
# First ten values of df2's launched column.
df2["launched"].head(10)

In [None]:
# First ten values of df2's deadline column.
df2["deadline"].head(10)

In [None]:
# First ten values of the deadline column in df1 (the other export).
df1["deadline"].head(10)

In [None]:
# launched and deadline should be datetime columns.
# errors="coerce" turns values that cannot be parsed into NaT instead of
# raising, so count how many values were lost that way and log a warning
# rather than letting them disappear silently.
for date_col in ("launched", "deadline"):
    missing_before = df2[date_col].isna().sum()
    df2[date_col] = pd.to_datetime(df2[date_col], errors="coerce")
    newly_missing = df2[date_col].isna().sum() - missing_before
    if newly_missing:
        logger.warning(
            "%d value(s) in '%s' could not be parsed and became NaT",
            newly_missing,
            date_col,
        )

df2[["launched", "deadline"]].info()


In [None]:
# Deadline column again, to check the result of the datetime conversion.
df2["deadline"].head(10)

In [None]:
# Launched column again, to check the result of the datetime conversion.
df2["launched"].head(10)

In [None]:
# Derive the campaign length as a new column: the timedelta between deadline
# and launch, reduced to its whole-day component.
campaign_span = df2["deadline"] - df2["launched"]
df2["kickstarter_duration_days"] = campaign_span.dt.days

df2["kickstarter_duration_days"].describe()

In [None]:
# How many projects fall into each outcome state.
state_counts = df2["state"].value_counts()
state_counts

In [None]:
# Horizontal bar chart with the number of projects per outcome state.
state_axes = sns.countplot(data=df2, y="state")
state_axes.set_title("Project Outcome Distribution")
plt.show()

In [None]:
# Summary statistics for the money, backer and duration columns.
money_cols = ["goal", "usd_goal_real", "pledged", "usd_pledged_real"]
num_cols = money_cols + ["backers", "kickstarter_duration_days"]

df2.loc[:, num_cols].describe()

In [None]:
# Distinct main_category values present in df2.
df2["main_category"].unique()

In [None]:
# Fraction of projects per main category whose state is "successful",
# sorted ascending so the best category ends up at the top of the barh plot.
success_flag = df2["state"].eq("successful").rename("success")
rate_by_category = success_flag.groupby(df2["main_category"]).mean()
success_rate = rate_by_category.sort_values()

category_axes = success_rate.plot.barh()
category_axes.set_title("Success Rate by Main Category")
plt.show()

In [None]:
# Distribution of campaign length (days) for every outcome state.
duration_axes = sns.boxplot(data=df2, x="state", y="kickstarter_duration_days")
duration_axes.set_title("Campaign Duration vs Outcome")
plt.show()

In [None]:
# Fraction of "successful" projects per country, highest rate first;
# only the 15 best-ranked countries are plotted.
country_flag = df2["state"].eq("successful").rename("success")
rate_by_country = country_flag.groupby(df2["country"]).mean()
country_success = rate_by_country.sort_values(ascending=False)

top_country_axes = country_success.head(15).plot.bar()
top_country_axes.set_title("Top Countries by Success Rate")
plt.show()

In [None]:
# Keep only the numeric columns of df2 for the correlation analysis.
numerical_df = df2.select_dtypes(include="number")

In [None]:
# Pairwise correlations between all numeric columns (DataFrame.corr defaults).
# NOTE(review): numerical_df presumably still contains the `id` column, whose
# correlations carry no meaning - consider excluding it.
correlation_matrix = numerical_df.corr()

heatmap_options = dict(
    annot=True,        # print the coefficient inside every cell
    cmap='coolwarm',   # diverging palette: blue = negative, red = positive
    fmt=".2f",         # two decimals for the annotations
    linewidths=.5,     # thin separators between cells
    center=0,          # colour scale centred on "no correlation"
)

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Restrict to campaigns that ended as 'successful' or 'failed'; every other
# state is left out of this success-rate calculation.
# .copy() makes success_fail_df an independent frame, so adding a column below
# does not write into a slice of df2 (which triggers SettingWithCopyWarning
# and may not behave as intended).
decided_mask = df2['state'].isin(['successful', 'failed'])
success_fail_df = df2.loc[decided_mask].copy()

# Binary outcome flag: 1 if successful, 0 otherwise.
success_fail_df['is_successful'] = np.where(success_fail_df['state'] == 'successful', 1, 0)

# The mean of the 0/1 flag per category is the share of successful projects;
# sorted so the most successful category comes first.
category_success_rate = (
    success_fail_df.groupby('main_category')['is_successful']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

plt.figure(figsize=(12, 7))
# NOTE(review): seaborn >= 0.13 warns when `palette` is given without `hue`;
# left unchanged here because the installed seaborn version is not known.
sns.barplot(
    x='is_successful',
    y='main_category',
    data=category_success_rate,
    palette='Spectral'
)
plt.title('Project Success Rate by Main Category')
plt.xlabel('Success Rate (Fraction of Projects)')
plt.ylabel('Main Category')
plt.show()

In [None]:
def plot_outlier_box(frame: pd.DataFrame, column: str) -> None:
    """Draw one vertical box plot of ``frame[column]`` to make outliers visible.

    Args:
        frame: DataFrame holding the column to plot.
        column: Name of the column in ``frame`` whose distribution is drawn.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(8, 5))
    sns.boxplot(y=frame[column])
    plt.title(f'Box Plot of {column} to Visualize Outliers')
    plt.show()


# One figure per column, in the same order as the five former copy-pasted cells.
OUTLIER_COLUMNS = (
    'kickstarter_duration_days',
    'backers',
    'goal',
    'pledged',
    'usd_pledged_real',
)

for outlier_col in OUTLIER_COLUMNS:
    plot_outlier_box(df2, outlier_col)