# EDA of the Kickstarter Dataset

Data source: https://www.kaggle.com/datasets/ulrikthygepedersen/kickstarter-projects/data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# load the Kickstarter projects CSV (path is relative to the notebook's working directory)
DATA_PATH = "data/kickstarter_projects.csv"
df = pd.read_csv(DATA_PATH)

# preview the first rows
df.head()

In [None]:
# list the column labels of the loaded DataFrame
df.columns

In [None]:
# print per-column dtypes and non-null counts
df.info()

In [None]:
# summary statistics of the numeric columns, rounded to whole numbers for readability
numeric_summary = df.describe()
numeric_summary.round(0)

In [None]:
# count missing values per column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# flag fully duplicated rows, then tally how many rows are / are not duplicates
is_duplicate_row = df.duplicated()
is_duplicate_row.value_counts()

In [None]:
# number of projects per category (horizontal bars, one per category)
ax = sns.histplot(data=df, y="Category")

In [None]:
# number of projects per country (horizontal bars, one per country)
ax = sns.histplot(data=df, y="Country")

In [None]:
# The dataset is imbalanced across countries, which may affect a later model,
# so summarise the pledged amounts per country.
pledged_aggregations = {
    'count': ('Pledged', 'size'),
    'mean': ('Pledged', 'mean'),
    'median': ('Pledged', 'median'),
    'max': ('Pledged', 'max'),
}
country_stats_pledged = df.groupby('Country').agg(**pledged_aggregations)

# show whole numbers, largest countries (by project count) first
country_stats_pledged.astype('int').sort_values(by='count', ascending=False)

In [None]:
# number of projects per final state, drawn on a wide figure
fig = plt.figure(figsize=(16, 8))
sns.histplot(data=df, y="State");

In [None]:
# Pledged vs. Goal, coloured by project state.
# Both axes are log-scaled; any row whose value is 0 cannot be placed on a log axis.
fig = plt.figure(figsize=(10, 10))
scatter_ax = sns.scatterplot(data=df, x="Pledged", y="Goal", hue="State")
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')
plt.show()

In [None]:
# distribution of pledged amounts per category as box plots (log-scaled y-axis)
fig = plt.figure(figsize=(18, 8))
box_ax = sns.boxplot(data=df, x='Category', y='Pledged')
box_ax.set_yscale('log')
plt.show()

In [None]:
# summary statistics of Goal for each project state, displayed without decimals
goal_summary_by_state = df.groupby('State')['Goal'].describe()
goal_summary_by_state.style.format("{:.0f}")

In [None]:
# summary statistics of Pledged for each category, displayed without decimals
pledged_summary_by_category = df.groupby('Category')['Pledged'].describe()
pledged_summary_by_category.style.format("{:.0f}")

In [None]:
# mean funding goal for every (Category, State) combination
mean_goal = df.groupby(['Category', 'State'])['Goal'].mean().reset_index()

# reshape to one row per category and one column per state for plotting
pivot_table = mean_goal.pivot(index='Category', columns='State', values='Goal')

# stacked bar chart: each bar segment is the mean goal of one state within a category
ax = pivot_table.plot(kind='bar', stacked=True, figsize=(10, 6))

ax.set_xlabel('Category')

# the plotted values are mean goals, not durations
ax.set_ylabel('Mean goal')

ax.set_title('Stacked Barchart for States vs Category')

plt.show()

In [None]:
# mean funding goal per category, kept as a two-column frame (Category, Goal)
avg_goal_by_category = df.groupby('Category', as_index=False)['Goal'].mean()

fig = plt.figure(figsize=(18, 8))

# one bar per category; label the axes through the returned Axes object
goal_ax = sns.barplot(data=avg_goal_by_category, x='Category', y='Goal')
goal_ax.set(
    title='Average Goal by Category',
    xlabel='Category',
    ylabel='Average Goal',
)

plt.show()

In [None]:
# mean pledged amount per category, kept as a two-column frame (Category, Pledged)
avg_pledged_by_category = df.groupby('Category', as_index=False)['Pledged'].mean()

fig = plt.figure(figsize=(18, 8))

# one bar per category; label the axes through the returned Axes object
pledged_ax = sns.barplot(data=avg_pledged_by_category, x='Category', y='Pledged')
pledged_ax.set(
    title='Average Pledged by Category',
    xlabel='Category',
    ylabel='Average Pledged',
)

plt.show()

In [None]:
# mean number of backers per category, kept as a two-column frame (Category, Backers)
avg_backers_by_category = df.groupby('Category', as_index=False)['Backers'].mean()

fig = plt.figure(figsize=(18, 8))

# one bar per category; label the axes through the returned Axes object
backers_ax = sns.barplot(data=avg_backers_by_category, x='Category', y='Backers')
backers_ax.set(
    title='Average Backers by Category',
    xlabel='Category',
    ylabel='Average Backers',
)

plt.show()