In [1]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data from the .tsv file. Undecodable bytes are ignored (encoding_errors='ignore'), and every column is read as str for now; the dtypes are fixed later.

ks_data = pd.read_csv(
    '38050-0001-Data.tsv',
    sep='\t',                  # the file is tab-separated
    dtype=str,                 # read every column as text; numeric casts happen later
    encoding_errors='ignore',  # ignore decoding errors instead of raising
)


# My Functions

In [32]:
#get portion of data
def df_head(dataframe):
    """Return the first rows of ``dataframe``, or a warning message.

    Returns:
        ``dataframe.head()`` when that preview holds any data; otherwise a
        string asking the caller to check the dataframe variable.
    """
    preview = dataframe.head()
    if preview.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    return preview

#get information
def df_info(dataframe):
    """Print the ``DataFrame.info()`` summary, or return a warning message.

    Returns:
        The result of ``dataframe.info()`` for a non-empty frame (``info``
        prints its report and returns ``None``); otherwise a message string.
    """
    if dataframe.empty:
        return "There is an issue with your dataframe! Check your dataframe variable."
    return dataframe.info()

#get shape
def df_shape(dataframe):
    """Return ``dataframe.shape``, or a warning message for a (0, 0) frame.

    Only a frame with neither rows nor columns triggers the message; a frame
    that has columns but no rows still returns its ``(0, n)`` shape.
    """
    dimensions = dataframe.shape
    return (
        "Your dataframe does not exist! Check your dataframe variable."
        if dimensions == (0, 0)
        else dimensions
    )

#describe dataframe
def describe_df(dataframe):
    """Return ``dataframe.describe()``, or a warning message when it is empty."""
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    summary = dataframe.describe()
    return summary


#get columns
def get_columns(dataframe):
    """Return the column index of ``dataframe``, or a warning message.

    Returns:
        ``dataframe.columns`` when the frame has at least one column;
        otherwise a message string.
    """
    # The previous check was ``>= 0``, which is true for every length, so the
    # warning branch could never run. A frame has columns only when the count
    # is strictly positive.
    if len(dataframe.columns) > 0:
        return dataframe.columns
    return "There are no columns in your dataframe! Check your dataframe variable."

#get column data
def column_data(dataframe, column_name: str):
    """Return the column ``column_name`` of ``dataframe``, or a warning message.

    Args:
        dataframe: The frame to read from.
        column_name: Label of the column to return.

    Returns:
        ``dataframe[column_name]`` when the label exists; otherwise a message
        string.
    """
    # Membership is tested against the column index itself. get_columns() can
    # return a message string, and ``in`` on a string is a substring test.
    if column_name in dataframe.columns:
        return dataframe[column_name]
    return "Your dataframe or column does not exist! Check your dataframe variable and columns."

#check nulls
def check_nulls(dataframe):
    """Report the null count and null percentage of every column.

    Returns:
        A message string when the frame is empty or contains no nulls.
        When nulls are present, the per-column counts and percentages
        (rounded to 2 decimals) are printed and ``None`` is returned.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."

    null_counts = dataframe.isnull().sum()
    if null_counts.sum() > 0:
        null_share = (null_counts / len(dataframe) * 100).round(2)
        print(f"Here are the number of nulls per column: \n{null_counts}")
        print(f"Here are the percent of nulls per column: \n{null_share}")
        return None
    return "There are no nulls in your data."

#count unique values per column
def unique(dataframe):
    """Return the number of distinct values in every column.

    Returns:
        A ``(label, counts)`` tuple, where ``counts`` is the result of
        ``dataframe.nunique()``, for a non-empty frame; otherwise a message
        string.
    """
    # An empty frame used to fall through and return None silently; it now
    # returns the same kind of warning string as the other helpers.
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    return "Here are the number of unique values per columns:", dataframe.nunique()


#check for dups
def check_dups(dataframe):
    """Return a message stating how many fully duplicated rows exist.

    Returns:
        A sentence containing the duplicate-row count for a non-empty frame,
        or a warning message when the frame is empty. The return value is
        always a string, never a number.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    duplicate_total = dataframe.duplicated().sum()
    return f"There are {duplicate_total} duplicates in your dataframe"
    
#drop dups
def drop_dups(dataframe):
    """Return ``dataframe`` without fully duplicated rows.

    Returns:
        The same ``dataframe`` object when it has no duplicated rows;
        otherwise a new frame from ``dataframe.drop_duplicates()`` (first
        occurrences kept, original index labels preserved).
    """
    # The previous version compared the message string returned by
    # check_dups() with 0, which is never equal, so the "no duplicates" branch
    # was dead. The duplicate check is done on the data directly instead.
    if not dataframe.duplicated().any():
        return dataframe
    return dataframe.drop_duplicates()


#Visuals 

def make_hist(dataframe, columns, num_rows, num_columns, fig_size):
    """Draw one histogram per column on a ``num_rows`` x ``num_columns`` grid.

    Args:
        dataframe: Frame holding the data to plot.
        columns: Sequence of column labels, assigned to the grid row by row.
        num_rows: Number of subplot rows.
        num_columns: Number of subplot columns.
        fig_size: ``(width, height)`` passed to ``plt.subplots`` as ``figsize``.

    Returns:
        ``None`` after drawing, or a message string when the frame is empty.
    """
    if dataframe.empty:
        return "Your dataframe is empty. Check your dataframe variable"

    # squeeze=False always yields a 2-D array of axes, so grids with a single
    # row or a single column work as well.
    _, axs = plt.subplots(num_rows, num_columns, figsize=fig_size, squeeze=False)
    for position, ax in enumerate(axs.flat):
        if position >= len(columns):
            # More panels than columns: hide the spare axes instead of failing.
            ax.set_visible(False)
            continue
        column = columns[position]
        ax.hist(dataframe[column])
        # Title with the plotted column. The old ``+ 1`` offset showed the
        # next column's name and raised IndexError on the last panel.
        ax.set_title(column)

#make boxplots
def make_boxplot(dataframe, columns, num_rows, num_columns, fig_size):
    """Draw one boxplot per column on a ``num_rows`` x ``num_columns`` grid.

    Args:
        dataframe: Frame holding the data to plot.
        columns: Sequence of column labels, assigned to the grid row by row.
        num_rows: Number of subplot rows.
        num_columns: Number of subplot columns.
        fig_size: ``(width, height)`` passed to ``plt.subplots`` as ``figsize``.

    Returns:
        ``None`` after drawing, or a message string when the frame is empty.
    """
    if dataframe.empty:
        return "Your dataframe is empty. Check your dataframe variable"

    # squeeze=False always yields a 2-D array of axes, so grids with a single
    # row or a single column work as well.
    _, axs = plt.subplots(num_rows, num_columns, figsize=fig_size, squeeze=False)
    for position, ax in enumerate(axs.flat):
        if position >= len(columns):
            # More panels than columns: hide the spare axes instead of failing.
            ax.set_visible(False)
            continue
        column = columns[position]
        ax.boxplot(dataframe[column])
        # Title with the plotted column. The old ``+ 1`` offset showed the
        # next column's name and raised IndexError on the last panel.
        ax.set_title(column)

#make pairplot
def make_pairplot(dataframe):
    """Build a seaborn pairplot of ``dataframe`` and return what seaborn returns."""
    pair_grid = sns.pairplot(dataframe)
    return pair_grid

#make heatmap
def make_heatmap(dataframe):
    """Draw a heatmap of the pairwise correlations between numeric columns.

    Returns:
        Whatever ``sns.heatmap`` returns for the correlation matrix.
    """
    # ``dataframe.corr`` without parentheses passed the bound method, not a
    # correlation matrix, to seaborn. Restricting to numeric columns first
    # keeps corr() from raising on text columns in newer pandas versions.
    correlation = dataframe.select_dtypes(include='number').corr()
    return sns.heatmap(correlation)

#make scatter plot
def make_scatter(dataframe, x_column, y_column, title: str, x_label: str, y_label: str):
    """Scatter ``y_column`` against ``x_column`` on the current pyplot figure.

    Sets the title and both axis labels, then shows the figure.

    Returns:
        The result of ``plt.show()``.
    """
    x_values = dataframe[x_column]
    y_values = dataframe[y_column]
    plt.scatter(x=x_values, y=y_values)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    shown = plt.show()
    return shown

# EDA

In [4]:
#make copy of raw data
ks_copy = ks_data.copy()

In [7]:
df_info(ks_copy)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506199 entries, 0 to 506198
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   CASEID                        506199 non-null  object
 1   NAME                          506199 non-null  object
 2   PID                           506199 non-null  object
 3   CATEGORY                      506199 non-null  object
 4   CATEGORY_ID                   506199 non-null  object
 5   SUBCATEGORY                   506199 non-null  object
 6   SUBCATEGORY_ID                506199 non-null  object
 7   PROJECT_PAGE_LOCATION_NAME    506199 non-null  object
 8   PROJECT_PAGE_LOCATION_STATE   506199 non-null  object
 9   PROJECT_PAGE_LOCATION_COUNTY  506199 non-null  object
 10  UID                           506199 non-null  object
 11  LAUNCHED_DATE                 506199 non-null  object
 12  DEADLINE_DATE                 506199 non-null  object
 13 

In [6]:
df_head(ks_copy)

Unnamed: 0,CASEID,NAME,PID,CATEGORY,CATEGORY_ID,SUBCATEGORY,SUBCATEGORY_ID,PROJECT_PAGE_LOCATION_NAME,PROJECT_PAGE_LOCATION_STATE,PROJECT_PAGE_LOCATION_COUNTY,...,LAUNCHED_DATE,DEADLINE_DATE,PROJECT_CURRENCY,GOAL_IN_ORIGINAL_CURRENCY,PLEDGED_IN_ORIGINAL_CURRENCY,GOAL_IN_USD,PLEDGED_IN_USD,BACKERS_COUNT,STATE,URL_NAME
0,1,MASKED BY ICPSR,2137925650,Film & Video,11,Science Fiction,301,London,England,Greater London,...,8/11/2016,10/10/2016,USD,100000000,0,"$100,000,000",$0,0,canceled,MASKED BY ICPSR
1,2,MASKED BY ICPSR,1501531085,Film & Video,11,Fantasy,296,Los Angeles,CA,Los Angeles,...,12/19/2019,2/14/2020,USD,100000000,85,"$100,000,000",$85,4,canceled,MASKED BY ICPSR
2,3,MASKED BY ICPSR,953415668,Technology,16,Software,51,Mexico,Baja California,Tijuana,...,3/1/2017,3/22/2017,MXN,100000000,10,"$5,219,374",$1,1,failed,MASKED BY ICPSR
3,4,MASKED BY ICPSR,1371386304,Publishing,18,Publishing,18,Columbus,OH,Franklin,...,6/4/2018,7/5/2018,USD,100000000,1,"$100,000,000",$1,1,canceled,MASKED BY ICPSR
4,5,MASKED BY ICPSR,1720842777,Art,1,Illustration,22,Toronto,ON,Toronto,...,5/1/2015,6/30/2015,CAD,100000000,0,"$80,610,122",$0,0,failed,MASKED BY ICPSR


In [5]:
# I will remove the NAME and URL_NAME columns because their data is masked, and the ID columns because they are redundant.

ks_copy = ks_copy.drop(
    columns=['CASEID', 'NAME', 'PID', 'CATEGORY_ID', 'SUBCATEGORY_ID', 'UID', 'URL_NAME']
)

In [6]:
df_head(ks_copy)

Unnamed: 0,CATEGORY,SUBCATEGORY,PROJECT_PAGE_LOCATION_NAME,PROJECT_PAGE_LOCATION_STATE,PROJECT_PAGE_LOCATION_COUNTY,LAUNCHED_DATE,DEADLINE_DATE,PROJECT_CURRENCY,GOAL_IN_ORIGINAL_CURRENCY,PLEDGED_IN_ORIGINAL_CURRENCY,GOAL_IN_USD,PLEDGED_IN_USD,BACKERS_COUNT,STATE
0,Film & Video,Science Fiction,London,England,Greater London,8/11/2016,10/10/2016,USD,100000000,0,"$100,000,000",$0,0,canceled
1,Film & Video,Fantasy,Los Angeles,CA,Los Angeles,12/19/2019,2/14/2020,USD,100000000,85,"$100,000,000",$85,4,canceled
2,Technology,Software,Mexico,Baja California,Tijuana,3/1/2017,3/22/2017,MXN,100000000,10,"$5,219,374",$1,1,failed
3,Publishing,Publishing,Columbus,OH,Franklin,6/4/2018,7/5/2018,USD,100000000,1,"$100,000,000",$1,1,canceled
4,Art,Illustration,Toronto,ON,Toronto,5/1/2015,6/30/2015,CAD,100000000,0,"$80,610,122",$0,0,failed


In [7]:
# Let's format and rename the columns so it doesn't look like we are yelling.

# str.capitalize() upper-cases the first character and lower-cases the rest
# (e.g. 'LAUNCHED_DATE' -> 'Launched_date'), which is why the keys below use that form.
ks_copy.columns = ks_copy.columns.str.capitalize()

# Some of the column names are too long, so map them to shorter labels.
# NOTE(review): 'Caseid' is listed here although CASEID is dropped in an earlier cell,
# so that entry presumably has no effect when the cells run in order.
# NOTE(review): in the head() output above, PROJECT_PAGE_LOCATION_NAME holds values such as
# 'London' / 'Los Angeles' and PROJECT_PAGE_LOCATION_COUNTY holds 'Greater London' / 'Franklin'.
# These look like city and county names, so the labels 'Project_Country' and 'City' may be
# misleading - confirm against the dataset codebook.
new_features = {'Caseid': 'Case_ID', 'Project_page_location_name': 'Project_Country',
                 'Project_page_location_state': 'Project_State', 'Project_page_location_county': 'City',
                 'Launched_date': 'Launched', 'Deadline_date': 'Deadline',
                 'Project_currency': 'Project_Currency', 'Goal_in_original_currency': 'Goal',
                 'Pledged_in_original_currency': 'Pledged', 'Goal_in_usd': 'Goal (USD)',
                 'Pledged_in_usd': 'Pledged (USD)', 'Backers_count': 'Backers (#)',
                 'State': 'Status'}


In [8]:
# Apply the shorter column labels defined in new_features.
ks_copy = ks_copy.rename(columns=new_features)

In [9]:
ks_copy.sample(10)

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
410847,Art,Digital Art,Santa Fe,NM,Santa Fe,12/23/2010,1/22/2011,USD,1200,1300,"$1,200","$1,300",30,successful
137268,Art,Public Art,Rockaway Park,NY,Queens,12/30/2014,1/13/2015,USD,15000,16722,"$15,000","$16,722",171,successful
492558,Fashion,Jewelry,Brighton and Hove City,England,East Sussex,6/11/2018,7/11/2018,GBP,200,167,$265,$221,10,failed
121243,Music,Country & Folk,Stamford,CT,Fairfield,5/25/2013,6/24/2013,USD,18000,18522,"$18,000","$18,522",172,successful
40710,Games,Tabletop Games,Sydney,NSW,,3/28/2019,4/30/2019,AUD,51000,63244,"$35,985","$44,625",340,successful
324278,Games,Video Games,Chicago,IL,Cook,6/16/2015,7/16/2015,USD,3000,1082,"$3,000","$1,082",24,failed
448433,Art,Illustration,Warren,MI,Macomb,10/2/2017,10/28/2017,USD,700,831,$700,$831,36,successful
337923,Music,Music,Las Vegas,NV,Clark,6/9/2016,7/9/2016,USD,3000,250,"$3,000",$250,2,failed
135713,Music,Faith,Lakewood,NJ,Ocean,8/1/2014,9/10/2014,USD,15000,3005,"$15,000","$3,005",50,failed
475618,Music,Electronic Music,Cleveland,OH,Cuyahoga,6/12/2016,6/26/2016,USD,400,0,$400,$0,0,failed


In [10]:
# format the numerical columns - Goal, Pledged, Goal (USD), Pledged (USD), Backers (#)

numeric_columns = ['Goal', 'Pledged', 'Goal (USD)', 'Pledged (USD)', 'Backers (#)']


#need to change these functions
def replace_characters(dataframe, columns, characters, replacement, regex):
    """Replace values or patterns in the given columns of ``dataframe``.

    The frame is modified in place, one column at a time.

    Args:
        dataframe: Frame to modify.
        columns: Iterable of column labels to process.
        characters: Value or pattern to look for (``to_replace`` of
            ``Series.replace``).
        replacement: Value to put in its place.
        regex: Passed through to ``Series.replace``; ``True`` treats
            ``characters`` as a regular expression.

    Returns:
        The result of ``df_head(dataframe)`` after the replacement.
    """
    for column in columns:
        # Assign the result back to the column. The chained
        # ``dataframe[column].replace(..., inplace=True)`` form acts on a
        # temporary Series and does not reliably update the frame under
        # pandas Copy-on-Write.
        dataframe[column] = dataframe[column].replace(characters, replacement, regex=regex)
    return df_head(dataframe)


#change data types
def change_data_type(dataframe, columns, new_type):
    """Cast the given columns of ``dataframe`` to ``new_type`` in place.

    Args:
        dataframe: Frame to modify.
        columns: Iterable of column labels to cast.
        new_type: Target dtype, passed to ``Series.astype``.

    Returns:
        The result of ``df_info(dataframe)`` after the cast.
    """
    for column in columns:
        # Use the requested dtype; the previous version ignored ``new_type``
        # and always cast to int.
        dataframe[column] = dataframe[column].astype(new_type)
    return df_info(dataframe)


In [11]:
# Raw string: '\$' in a normal string literal is an invalid escape sequence.
# The pattern value is unchanged (dollar signs, commas and whitespace).
replace_characters(ks_copy, numeric_columns, r'\$|,|\s', '', True)

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,Film & Video,Science Fiction,London,England,Greater London,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,Film & Video,Fantasy,Los Angeles,CA,Los Angeles,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,Technology,Software,Mexico,Baja California,Tijuana,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,Publishing,Publishing,Columbus,OH,Franklin,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,Art,Illustration,Toronto,ON,Toronto,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed


In [12]:
# Columns in which some values are a single space (each column was checked individually beforehand)

# ks_copy[(ks_copy['Project_Country'] == ' ') | (ks_copy['Project_State'] == ' ') | (ks_copy['City'] == ' ')]

space_columns = ['Project_Country', 'Project_State', 'City']

replace_characters(ks_copy, space_columns, ' ', np.nan, False)

# Note: with regex=False, replace() matches whole cell values only, so only cells that are exactly ' ' become NaN; names that merely contain spaces (e.g. 'Los Angeles') are left untouched.

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,Film & Video,Science Fiction,London,England,Greater London,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,Film & Video,Fantasy,Los Angeles,CA,Los Angeles,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,Technology,Software,Mexico,Baja California,Tijuana,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,Publishing,Publishing,Columbus,OH,Franklin,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,Art,Illustration,Toronto,ON,Toronto,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed


In [13]:
#check to see if spaces/empty still exist
ks_copy[(ks_copy['Project_Country'] == ' ') | (ks_copy['Project_State'] == ' ') | (ks_copy['City'] == ' ')]

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status


In [14]:
# Backers has some empty strings - replace them with NaN (not 0) so they are counted as missing

replace_characters(ks_copy, ['Backers (#)'], '', np.nan, False)

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,Film & Video,Science Fiction,London,England,Greater London,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,Film & Video,Fantasy,Los Angeles,CA,Los Angeles,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,Technology,Software,Mexico,Baja California,Tijuana,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,Publishing,Publishing,Columbus,OH,Franklin,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,Art,Illustration,Toronto,ON,Toronto,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed


In [15]:
check_nulls(ks_copy)

Here are the number of nulls per column: 
Category                0
Subcategory             0
Project_Country      1873
Project_State        2014
City                64082
Launched                0
Deadline                0
Project_Currency        0
Goal                    0
Pledged                 0
Goal (USD)              0
Pledged (USD)           0
Backers (#)         10148
Status                  0
dtype: int64
Here are the percent of nulls per column: 
Category             0.00
Subcategory          0.00
Project_Country      0.37
Project_State        0.40
City                12.66
Launched             0.00
Deadline             0.00
Project_Currency     0.00
Goal                 0.00
Pledged              0.00
Goal (USD)           0.00
Pledged (USD)        0.00
Backers (#)          2.00
Status               0.00
dtype: float64


Nulls are present in Country, State, City, and Backers. City has a null rate of 12.66%, while the others are at 2% or lower. I will decide how to handle them later.

In [16]:
#change Status values to title

ks_copy['Status'] = ks_copy['Status'].str.title()

In [17]:
df_head(ks_copy)

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,Film & Video,Science Fiction,London,England,Greater London,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,Canceled
1,Film & Video,Fantasy,Los Angeles,CA,Los Angeles,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,Canceled
2,Technology,Software,Mexico,Baja California,Tijuana,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,Failed
3,Publishing,Publishing,Columbus,OH,Franklin,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,Canceled
4,Art,Illustration,Toronto,ON,Toronto,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,Failed


In [None]:
#remove subcategory, country, state, city, goal, pledged, caseid. calc days live feature.

In [20]:
ks_copy.sample(10)

Unnamed: 0,Category,Subcategory,Project_Country,Project_State,City,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
256600,Music,Indie Rock,Brooklyn,NY,,2/6/2013,2/19/2013,USD,5000,5260,5000,5261,74,Successful
231819,Film & Video,Shorts,Los Angeles,CA,Los Angeles,9/13/2011,10/13/2011,USD,6500,7000,6500,7000,64,Successful
198035,Technology,3D Printing,Brooklyn,NY,,8/19/2014,9/18/2014,USD,9999,13860,9999,13860,258,Successful
349888,Film & Video,Shorts,Philadelphia,PA,Philadelphia,2/4/2011,5/6/2011,USD,2500,130,2500,130,4,Failed
462085,Fashion,Couture,Norfolk,VA,Norfolk City,7/24/2014,8/31/2014,USD,500,0,500,0,0,Failed
440368,Publishing,Nonfiction,London,England,Greater London,7/30/2014,8/29/2014,GBP,800,25,1327,41,1,Failed
11979,Design,Product Design,Hong Kong,Hong Kong Island,,8/14/2018,9/28/2018,HKD,194500,32275,24860,4125,138,Failed
43131,Games,Live Games,Palm Springs,CA,Riverside,4/16/2015,5/16/2015,USD,50000,0,50000,0,0,Failed
9841,Journalism,Web,Miami Beach,FL,Miami-Dade,3/11/2016,4/25/2016,USD,225000,55,225000,55,1,Failed
186436,Design,Product Design,Reading,PA,Berks,1/24/2018,3/10/2018,USD,10000,1,10000,1,1,Failed


In [15]:
#see unique values per feature
unique(ks_copy)
#There are not over 21k countries...

('Here are the number of unique values per columns:',
 Case_ID             506199
 Category                15
 Subcategory            161
 Project_Country      21201
 Project_State         1550
 City                  5412
 Launched              4256
 Deadline              4208
 Project_Currency        15
 Goal                 10725
 Pledged              52112
 Goal (USD)           41992
 Pledged (USD)        49686
 Backers (#)           1001
 Status                   4
 dtype: int64)

Will do a breakdown of Category, Status and Country.

There are apparently over 21k unique values for countries, when there are far fewer countries than that. The Project_State and City columns are also a complete mess and would take too much time to clean, so I will drop them. I will also drop the Goal and Pledged columns for USD as they are redundant.

From the [Kickstarter website](https://help.kickstarter.com/hc/en-us/articles/115005127954-Can-I-choose-my-project-s-display-currency-#:~:text=Kickstarter%20does%20offer%20backers%20the%20option%20to%20set,currency%2C%20not%20in%20a%20backer%E2%80%99s%20preferred%20display%20currency.), the following is given
 > "It’s not possible to choose a preferred display currency for your project. By default, the currency your project will display and collect funds in is determined by the country of origin* you indicate when building your project. For example, if your banking and identity information are based in, say, Hong Kong, your project goal and reward costs will automatically be displayed in HKD. Once your project ends, the funds collected will be sent to you in HKD. If you’re in the UK, they’ll be in GBP, and so on. Kickstarter does offer backers the option to set a preferred display currency when browsing projects. However, pledges will always be collected in the project’s native currency, not in a backer’s preferred display currency. *For projects launching from Denmark, Norway, Sweden, Switzerland, and Poland, creators now have the option to choose whether to run their project in their country’s native currency, or in Euros.*"

So, I will use the currency codes to replace the values in Project_Country, and drop the Project_State and City columns.

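EUR will be mapped to "Europe", because creators in some European countries can choose between their native currency and euros, so EUR does not identify a single country.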

In [21]:
# Make a dictionary that maps each project currency code to a country; EUR maps to 'Europe' because it is not tied to a single country

currency_codes = {'USD': 'United States of America', 'EUR': 'Europe', 'GBP': 'United Kingdom', 'CAD': 'Canada', 
                  'AUD': 'Australia', 'MXN': 'Mexico', 'SEK': 'Sweden', 'HKD': 'Hong Kong',
                  'NZD': 'New Zealand', 'DKK': 'Denmark', 'SGD': 'Singapore', 'CHF': 'Switzerland',
                  'NOK': 'Norway', 'JPY': 'Japan', 'PLN': 'Poland'}

In [29]:
#check for dups before setting a working copy
check_dups(ks_copy)

'There are 28 duplicates in your dataframe'

In [35]:
#remove duplicates
ks_copy = drop_dups(ks_copy)

In [36]:
check_dups(ks_copy)

'There are 0 duplicates in your dataframe'

In [44]:
#make new df to make further manipulations

working_copy = ks_copy.copy()

In [45]:
# Replace the Project_Country values with the name looked up from each row's
# Project_Currency code in the currency_codes dictionary.
# NOTE: Series.map yields NaN for any code that is not a key of the dictionary.

working_copy['Project_Country'] = working_copy['Project_Currency'].map(currency_codes)

In [46]:
#Drop project_state, city, Goal and pledged (USD)
working_copy = working_copy.drop(
    columns=['Project_State', 'City', 'Goal (USD)', 'Pledged (USD)']
)

In [26]:
df_head(working_copy)

Unnamed: 0,Category,Subcategory,Project_Country,Launched,Deadline,Project_Currency,Goal,Pledged,Backers (#),Status
0,Film & Video,Science Fiction,United States of America,8/11/2016,10/10/2016,USD,100000000,0,0,Canceled
1,Film & Video,Fantasy,United States of America,12/19/2019,2/14/2020,USD,100000000,85,4,Canceled
2,Technology,Software,Mexico,3/1/2017,3/22/2017,MXN,100000000,10,1,Failed
3,Publishing,Publishing,United States of America,6/4/2018,7/5/2018,USD,100000000,1,1,Canceled
4,Art,Illustration,Canada,5/1/2015,6/30/2015,CAD,100000000,0,0,Failed


In [47]:
#change Launched and Deadline to datetime

# The sample rows show month/day/year strings (e.g. '12/19/2019'). An explicit
# format removes day/month ambiguity and makes an unexpected value fail loudly.
working_copy['Launched'] = pd.to_datetime(working_copy['Launched'], format='%m/%d/%Y')
working_copy['Deadline'] = pd.to_datetime(working_copy['Deadline'], format='%m/%d/%Y')

Unnamed: 0,Category,Subcategory,Project_Country,Launched,Deadline,Project_Currency,Goal,Pledged,Backers (#),Status
0,Film & Video,Science Fiction,United States of America,2016-08-11,2016-10-10,USD,100000000,0,0,Canceled
1,Film & Video,Fantasy,United States of America,2019-12-19,2020-02-14,USD,100000000,85,4,Canceled
2,Technology,Software,Mexico,2017-03-01,2017-03-22,MXN,100000000,10,1,Failed
3,Publishing,Publishing,United States of America,2018-06-04,2018-07-05,USD,100000000,1,1,Canceled
4,Art,Illustration,Canada,2015-05-01,2015-06-30,CAD,100000000,0,0,Failed


In [50]:
df_info(working_copy)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506171 entries, 0 to 506198
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Category          506171 non-null  object        
 1   Subcategory       506171 non-null  object        
 2   Project_Country   506171 non-null  object        
 3   Launched          506171 non-null  datetime64[ns]
 4   Deadline          506171 non-null  datetime64[ns]
 5   Project_Currency  506171 non-null  object        
 6   Goal              506171 non-null  object        
 7   Pledged           506171 non-null  object        
 8   Backers (#)       496023 non-null  object        
 9   Status            506171 non-null  object        
dtypes: datetime64[ns](2), object(8)
memory usage: 42.5+ MB


In [51]:
# Calculate the campaign duration in days: Deadline minus Launched gives a
# timedelta per row, and .dt.days keeps its whole-day component.
working_copy['Duration (days)'] = (working_copy['Deadline'] - working_copy['Launched']).dt.days

Unnamed: 0,Category,Subcategory,Project_Country,Launched,Deadline,Project_Currency,Goal,Pledged,Backers (#),Status,Duration (days)
0,Film & Video,Science Fiction,United States of America,2016-08-11,2016-10-10,USD,100000000,0,0,Canceled,60
1,Film & Video,Fantasy,United States of America,2019-12-19,2020-02-14,USD,100000000,85,4,Canceled,57
2,Technology,Software,Mexico,2017-03-01,2017-03-22,MXN,100000000,10,1,Failed,21
3,Publishing,Publishing,United States of America,2018-06-04,2018-07-05,USD,100000000,1,1,Canceled,31
4,Art,Illustration,Canada,2015-05-01,2015-06-30,CAD,100000000,0,0,Failed,60


Next step: start the EDA - explore categories, countries, and status.