In [3]:
#Import libraries
import zipfile

import numpy as np
import pandas as pd
# pyplot is the plotting interface; the bare `matplotlib` package has no plot()/subplots()
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#unzip folder and create dict
# dfs maps each .csv member's filename (as stored in the archive) to its parsed DataFrame.
dfs = {}
with zipfile.ZipFile('Kickstarter+Projects.zip') as folder:
    for text_file in folder.infolist():
        if text_file.filename.endswith('.csv'):
            # Close each member stream as soon as pandas has parsed it
            with folder.open(text_file.filename) as csv_stream:
                dfs[text_file.filename] = pd.read_csv(csv_stream)

In [5]:
#get data from dict into dataframe
# read_csv already produced a DataFrame, so look it up by its archive filename
# instead of passing it through DataFrame.from_dict.
ks_data = dfs['kickstarter_projects.csv']


In [1]:
#Import functions
# Executes functions.ipynb in this kernel. The helpers called below (df_shape, get_info,
# get_head, describe_df, check_dups, check_nulls) are not defined in this notebook, so they
# presumably come from that file — TODO confirm. On a fresh kernel this cell must run
# before any cell that calls them.
%run functions.ipynb

Data Understanding:
We want to understand the dataset's size, the features it contains, their data types, and summary statistics (where applicable), and to identify any nulls and duplicates.
Note: It is best to copy the DataFrame first and keep the raw data separate.

In [42]:
# Work on an independent deep copy so ks_data keeps the raw values
data_copy = ks_data.copy(deep=True)

In [7]:
#check shape
# df_shape is not defined in this notebook (presumably loaded from functions.ipynb);
# the output below is a (rows, columns) tuple.
df_shape(data_copy)

(374853, 11)

In [8]:
# Per-column non-null counts and dtypes (helper presumably from functions.ipynb — TODO confirm)
get_info(data_copy)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374853 entries, 0 to 374852
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ID           374853 non-null  int64 
 1   Name         374853 non-null  object
 2   Category     374853 non-null  object
 3   Subcategory  374853 non-null  object
 4   Country      374853 non-null  object
 5   Launched     374853 non-null  object
 6   Deadline     374853 non-null  object
 7   Goal         374853 non-null  int64 
 8   Pledged      374853 non-null  int64 
 9   Backers      374853 non-null  int64 
 10  State        374853 non-null  object
dtypes: int64(4), object(7)
memory usage: 31.5+ MB


In [10]:
# Preview the first rows; the second argument is presumably the row count — confirm in functions.ipynb
get_head(data_copy, 5)

Unnamed: 0,ID,Name,Category,Subcategory,Country,Launched,Deadline,Goal,Pledged,Backers,State
0,1860890148,Grace Jones Does Not Give A F$#% T-Shirt (limi...,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed
1,709707365,CRYSTAL ANTLERS UNTITLED MOVIE,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed
2,1703704063,drawing for dollars,Art,Illustration,United States,2009-04-24 21:52:03,2009-05-03,20,35,3,Successful
3,727286,Offline Wikipedia iPhone app,Technology,Software,United States,2009-04-25 17:36:21,2009-07-14,99,145,25,Successful
4,1622952265,Pantshirts,Fashion,Fashion,United States,2009-04-27 14:10:39,2009-05-26,1900,387,10,Failed


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# ID is still an integer at this point, so it appears in the output as well.
describe_df(data_copy)

Unnamed: 0,ID,Goal,Pledged,Backers
count,374853.0,374853.0,374853.0,374853.0
mean,1074656000.0,45863.78,9121.073,106.690359
std,619137700.0,1158778.0,91320.54,911.71852
min,5971.0,0.0,0.0,0.0
25%,538072800.0,2000.0,31.0,2.0
50%,1075300000.0,5500.0,625.0,12.0
75%,1610149000.0,16000.0,4051.0,57.0
max,2147476000.0,166361400.0,20338990.0,219382.0


In [12]:
# Report how many duplicates the helper finds; it returns a message string
check_dups(data_copy)

'There are 0 duplicates in your dataframe'

In [13]:
# Null count for each column
check_nulls(data_copy)

ID             0
Name           0
Category       0
Subcategory    0
Country        0
Launched       0
Deadline       0
Goal           0
Pledged        0
Backers        0
State          0
dtype: int64

From understanding the data, I know that the dataset has 11 features and almost 375k rows, that some features do not have the correct data type for analysis, and what a sample of the data looks like. There are no nulls or duplicates. In addition, I can tell there is quite a wide range of values for Goal, Pledged, and Backers, so I will have to deal with these somehow.
This gives me an idea of how to transform the data to make it more useful.

Data Cleaning: As there are no duplicates or nulls, we will focus on changing data types

In [43]:
# Cast ID to string so it is treated as a label rather than a number
data_copy = data_copy.astype({'ID': str})

In [52]:
#change dates from strings and keep only year, month, day
# .dt.normalize() resets the time of day to midnight while keeping the datetime64 dtype,
# so a raw Launched value such as '2009-04-21 21:02:48' becomes 2009-04-21.
# Without it, the time component survives a fresh Restart & Run All.
data_copy['Launched'] = pd.to_datetime(data_copy['Launched']).dt.normalize()
data_copy['Deadline'] = pd.to_datetime(data_copy['Deadline']).dt.normalize()

In [53]:
# Re-check the dtypes after the conversions above
get_info(data_copy)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374853 entries, 0 to 374852
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   ID           374853 non-null  object        
 1   Name         374853 non-null  object        
 2   Category     374853 non-null  object        
 3   Subcategory  374853 non-null  object        
 4   Country      374853 non-null  object        
 5   Launched     374853 non-null  datetime64[ns]
 6   Deadline     374853 non-null  datetime64[ns]
 7   Goal         374853 non-null  int64         
 8   Pledged      374853 non-null  int64         
 9   Backers      374853 non-null  int64         
 10  State        374853 non-null  object        
dtypes: datetime64[ns](2), int64(3), object(6)
memory usage: 31.5+ MB


In [55]:
# Preview the first rows again to confirm how the converted columns display
get_head(data_copy, 5)

Unnamed: 0,ID,Name,Category,Subcategory,Country,Launched,Deadline,Goal,Pledged,Backers,State
0,1860890148,Grace Jones Does Not Give A F$#% T-Shirt (limi...,Fashion,Fashion,United States,2009-04-21,2009-05-31,1000,625,30,Failed
1,709707365,CRYSTAL ANTLERS UNTITLED MOVIE,Film & Video,Shorts,United States,2009-04-23,2009-07-20,80000,22,3,Failed
2,1703704063,drawing for dollars,Art,Illustration,United States,2009-04-24,2009-05-03,20,35,3,Successful
3,727286,Offline Wikipedia iPhone app,Technology,Software,United States,2009-04-25,2009-07-14,99,145,25,Successful
4,1622952265,Pantshirts,Fashion,Fashion,United States,2009-04-27,2009-05-26,1900,387,10,Failed


We have now changed the data into more usable data types.

Data exploration/summarization