In [1]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# My Functions

In [14]:
#get portion of data
def df_sample(dataframe, num_rows):
    """Return ``num_rows`` randomly sampled rows of ``dataframe``.

    When the frame is empty, a message string is returned instead of a frame.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    return dataframe.sample(num_rows)

#get information
def df_info(dataframe):
    """Print the ``DataFrame.info()`` summary for a non-empty frame.

    Returns the result of ``dataframe.info()`` for a non-empty frame, or a
    message string when the frame is empty.
    """
    if dataframe.empty:
        return "There is an issue with your dataframe! Check your dataframe variable."
    return dataframe.info()

#get shape
def df_shape(dataframe):
    """Return ``dataframe.shape``, or a message string when the shape is exactly (0, 0)."""
    dims = dataframe.shape
    if dims != (0, 0):
        return dims
    return "Your dataframe does not exist! Check your dataframe variable."

#describe dataframe
def describe_df(dataframe, categorical=False):
    """Return summary statistics for ``dataframe``.

    With ``categorical`` equal to ``False`` the default ``describe()`` is
    returned; otherwise only object-dtype columns are described
    (``include='O'``). An empty frame yields a message string instead.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    # Equality (not truthiness) is kept on purpose so that values such as None
    # still select the categorical summary.
    include = None if categorical == False else 'O'  # noqa: E712
    return dataframe.describe(include=include)


#get columns
def get_columns(dataframe):
    """Return the column index of ``dataframe``.

    When the frame has no columns at all, a message string is returned
    instead of an (empty) index.
    """
    # A length is never negative, so the previous `>= 0` test could not fail
    # and the message below was unreachable; `> 0` is the intended check.
    if len(dataframe.columns) > 0:
        return dataframe.columns
    return "There are no columns in your dataframe! Check your dataframe variable."

#get column data
def column_data(dataframe, column_name: str):
    """Return the column ``column_name`` of ``dataframe``.

    A message string is returned when the column (or any column) is missing.
    """
    # Test membership on the real column index: get_columns may hand back a
    # message string, and `in` on a string would be a substring search.
    if column_name in dataframe.columns:
        return dataframe[column_name]
    return "Your dataframe or column does not exist! Check your dataframe variable and columns."

#check nulls
def check_nulls(dataframe):
    """Report missing values per column.

    Prints the per-column null counts and percentages (and returns None) when
    any null exists. Otherwise returns a message string: one for "no nulls",
    another for an empty frame.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."

    null_counts = dataframe.isnull().sum()
    if not null_counts.sum() > 0:
        return "There are no nulls in your data."

    null_share = (null_counts / len(dataframe) * 100).round(2)
    print(f"Here are the number of nulls per column: \n{null_counts}")
    print(f"Here are the percent of nulls per column: \n{null_share}")

#check value counts
def unique(dataframe):
    """Return ``(label, per-column unique counts)`` for a non-empty frame, else None."""
    if dataframe.empty:
        return None
    label = "Here are the number of unique values per columns:"
    return label, dataframe.nunique()


#check for dups
def check_dups(dataframe):
    """Return a sentence stating how many fully duplicated rows ``dataframe`` has.

    An empty frame yields a different message string.
    """
    if dataframe.empty:
        return "Your dataframe does not exist! Check your dataframe variable."
    dup_total = dataframe.duplicated().sum()
    return f"There are {dup_total} duplicates in your dataframe"
    
#drop dups
def drop_dups(dataframe):
    """Return ``dataframe`` without fully duplicated rows.

    When no row is duplicated the very same frame object is returned;
    otherwise a new frame from ``drop_duplicates()`` is returned.
    """
    # check_dups returns a message string, so comparing its result with 0 could
    # never be true; inspect the duplicate mask directly instead.
    if not dataframe.duplicated().any():
        return dataframe
    return dataframe.drop_duplicates()


#Visuals 

def make_hist(dataframe, columns, num_rows, num_columns, fig_size):
    """Draw one histogram per entry of ``columns`` on a subplot grid.

    Parameters: ``columns`` is a sequence of column names, ``num_rows`` and
    ``num_columns`` give the grid size, ``fig_size`` is forwarded as
    ``figsize``. The grid is filled row by row. Returns None after drawing,
    or a message string when ``dataframe`` is empty.
    """
    if dataframe.empty:
        return "Your dataframe is empty. Check your dataframe variable"

    # squeeze=False keeps `axs` two-dimensional even for a 1xN, Nx1 or 1x1
    # grid, where the default would return a 1-D array or a single Axes.
    fig, axs = plt.subplots(num_rows, num_columns, figsize=fig_size, squeeze=False)
    flat_axes = axs.ravel()  # row-major, i.e. index i * num_columns + j

    # zip stops at the shorter sequence, so a grid with more cells than
    # columns no longer indexes past the end of `columns`.
    for ax, column in zip(flat_axes, columns):
        ax.hist(dataframe[column])
        ax.set_title(column)

    # Hide grid cells that received no column.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

#make boxplots
def make_boxplot(dataframe, columns, num_rows, num_columns, fig_size):
    """Draw one boxplot per entry of ``columns`` on a subplot grid.

    Parameters: ``columns`` is a sequence of column names, ``num_rows`` and
    ``num_columns`` give the grid size, ``fig_size`` is forwarded as
    ``figsize``. The grid is filled row by row. Returns None after drawing,
    or a message string when ``dataframe`` is empty.
    """
    if dataframe.empty:
        return "Your dataframe is empty. Check your dataframe variable"

    # squeeze=False keeps `axs` two-dimensional even for a 1xN, Nx1 or 1x1
    # grid, where the default would return a 1-D array or a single Axes.
    fig, axs = plt.subplots(num_rows, num_columns, figsize=fig_size, squeeze=False)
    flat_axes = axs.ravel()  # row-major, i.e. index i * num_columns + j

    # zip stops at the shorter sequence, so a grid with more cells than
    # columns no longer indexes past the end of `columns`.
    for ax, column in zip(flat_axes, columns):
        ax.boxplot(dataframe[column])
        ax.set_title(column)

    # Hide grid cells that received no column.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

#make pairplot
def make_pairplot(dataframe):
    """Draw a seaborn pairplot of ``dataframe`` and return what ``sns.pairplot`` returns."""
    pair_grid = sns.pairplot(dataframe)
    return pair_grid

#make heatmap
def make_heatmap(dataframe):
    """Show an annotated heatmap of the numeric-column correlations of ``dataframe``.

    Returns the result of ``plt.show()``.
    """
    sns.heatmap(dataframe.corr(numeric_only=True), annot=True, cmap='coolwarm')
    return plt.show()

#make scatter plot
def make_scatter(dataframe, x_column, y_column, title: str, x_label: str, y_label: str):
    """Scatter ``y_column`` against ``x_column`` with the given title and axis labels.

    Returns the result of ``plt.show()``.
    """
    x_values = dataframe[x_column]
    y_values = dataframe[y_column]
    plt.scatter(x=x_values, y=y_values)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    return plt.show()

# Data Formatting

In [2]:
#load data and make copy
# Read the tab-separated file with every column as a string (dtype=str), so
# values such as "$3,292" stay verbatim for the clean-up cells that follow.
# encoding_errors='ignore' skips bytes that cannot be decoded.

raw_data = pd.read_csv('38050-0001-Data.tsv', sep='\t', dtype=str, encoding_errors='ignore')

# Work on a copy so raw_data stays exactly as loaded.
data = raw_data.copy()

In [4]:
#get samples to see format
# Displays 10 randomly sampled rows of `data` through the df_sample helper.
df_sample(data, 10)

Unnamed: 0,CASEID,NAME,PID,CATEGORY,CATEGORY_ID,SUBCATEGORY,SUBCATEGORY_ID,PROJECT_PAGE_LOCATION_NAME,PROJECT_PAGE_LOCATION_STATE,PROJECT_PAGE_LOCATION_COUNTY,...,LAUNCHED_DATE,DEADLINE_DATE,PROJECT_CURRENCY,GOAL_IN_ORIGINAL_CURRENCY,PLEDGED_IN_ORIGINAL_CURRENCY,GOAL_IN_USD,PLEDGED_IN_USD,BACKERS_COUNT,STATE,URL_NAME
356890,356891,MASKED BY ICPSR,1975112714,Music,14,Electronic Music,38,Wales,England,South Yorkshire,...,10/30/2017,11/29/2017,GBP,2500,118,"$3,292",$155,2.0,canceled,MASKED BY ICPSR
483746,483747,MASKED BY ICPSR,572009476,Music,14,Classical Music,36,Brooklyn,NY,,...,4/30/2018,6/29/2018,USD,300,1,$300,$1,1.0,failed,MASKED BY ICPSR
59495,59496,MASKED BY ICPSR,675088540,Design,7,Product Design,28,Boulder,CO,Boulder,...,11/18/2013,12/19/2013,USD,40000,112906,"$40,000","$112,906",,successful,MASKED BY ICPSR
282354,282355,MASKED BY ICPSR,737674839,Design,7,Product Design,28,Altamonte Springs,FL,Seminole,...,8/15/2013,9/19/2013,USD,5000,6318,"$5,000","$6,318",392.0,successful,MASKED BY ICPSR
130069,130070,MASKED BY ICPSR,1852867844,Music,14,Music,14,Inlet,NY,Hamilton,...,11/4/2011,1/3/2012,USD,15000,195,"$15,000",$195,8.0,failed,MASKED BY ICPSR
172869,172870,MASKED BY ICPSR,1550905956,Publishing,18,Fiction,47,Gainesville,FL,Alachua,...,4/30/2012,5/31/2012,USD,10000,10,"$10,000",$10,1.0,failed,MASKED BY ICPSR
146299,146300,MASKED BY ICPSR,80626055,Music,14,Music,14,Brooklyn,NY,,...,10/15/2013,11/14/2013,USD,14999,15567,"$14,999","$15,567",136.0,successful,MASKED BY ICPSR
249099,249100,MASKED BY ICPSR,2125698556,Games,12,Video Games,35,San Bernardino,CA,San Bernardino,...,6/12/2012,7/17/2012,USD,5500,526,"$5,500",$526,25.0,failed,MASKED BY ICPSR
322686,322687,MASKED BY ICPSR,766951301,Photography,15,Places,279,Glasgow,Scotland,Strathclyde,...,4/28/2014,5/28/2014,GBP,3000,70,"$5,043",$118,1.0,failed,MASKED BY ICPSR
286424,286425,MASKED BY ICPSR,638619314,Publishing,18,Children's Books,46,Shenzhen,Guangdong,Shenzhen,...,9/1/2016,10/1/2016,USD,5000,704,"$5,000",$704,22.0,failed,MASKED BY ICPSR


In [3]:
# drop Name and URL name
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.

data = data.drop(columns=['NAME', 'URL_NAME'], errors='ignore')

In [4]:
#reformat feature names
#let's format and rename the columns so it doesn't look like we are yelling
# str.capitalize() upper-cases the first character and lower-cases the rest,
# which is why the keys below look like 'Caseid' and 'Goal_in_usd'.
# NOTE(review): run this cell once per kernel session. Re-running would
# capitalize the already renamed columns (e.g. 'Case_ID'), and the mapping
# below would then no longer match them.

data.columns = data.columns.str.capitalize()

# Some of the column names are too long
# NOTE(review): in the sample output, Project_page_location_name holds values
# like "Brooklyn" and Project_page_location_county holds values like "Seminole";
# confirm that 'Project_Country' and 'City' are the intended labels.
new_features = {'Caseid': 'Case_ID', 'Project_page_location_name': 'Project_Country',
                 'Project_page_location_state': 'Project_State', 'Project_page_location_county': 'City',
                 'Launched_date': 'Launched', 'Deadline_date': 'Deadline',
                 'Project_currency': 'Project_Currency', 'Goal_in_original_currency': 'Goal',
                 'Pledged_in_original_currency': 'Pledged', 'Goal_in_usd': 'Goal (USD)',
                 'Pledged_in_usd': 'Pledged (USD)', 'Backers_count': 'Backers (#)',
                 'State': 'Status'}

#change column names
data.rename(columns=new_features, inplace=True)

In [5]:
#check for blank (single-space) strings first, given it's a tsv
# Vectorised comparison: DataFrame.applymap is deprecated in recent pandas and
# called a Python lambda once per cell.
empty_strings = data.eq(' ')
count_empty = empty_strings.sum()
print(count_empty)

Case_ID                 0
Pid                     0
Category                0
Category_id             0
Subcategory             0
Subcategory_id          0
Project_Country      1873
Project_State        2014
City                64082
Uid                     0
Launched                0
Deadline                0
Project_Currency        0
Goal                    0
Pledged                 0
Goal (USD)              0
Pledged (USD)           0
Backers (#)         10148
Status                  0
dtype: int64


Blank (single-space) values appear in Project_Country, Project_State, City, and Backers (#).
- Project_Currency can be used to fill in the missing countries.
- Drop Project_State and City; they are too cumbersome to fill in.
- Backers (#) will likely be filled with median imputation.

In [6]:
# Columns holding amounts/counts as text that the clean-up cells operate on.
numeric_columns = [
    'Goal',
    'Pledged',
    'Goal (USD)',
    'Pledged (USD)',
    'Backers (#)',
]


#replace characters
def replace_characters(dataframe, columns, characters, replacement, regex):
    """Replace ``characters`` with ``replacement`` in each of ``columns``.

    ``characters``, ``replacement`` and ``regex`` are forwarded to
    ``Series.replace``. The frame is modified column by column and the same
    frame object is returned.
    """
    for column in columns:
        # Assign the result back. `dataframe[column].replace(..., inplace=True)`
        # works on a temporary selection and does not update the frame when
        # pandas Copy-on-Write is active.
        dataframe[column] = dataframe[column].replace(characters, replacement, regex=regex)
    return dataframe


#change data types
def change_data_type(dataframe, columns, new_type):
    """Cast each of ``columns`` to ``new_type`` in ``dataframe``.

    Returns the result of ``df_info(dataframe)`` after the casts.
    """
    for name in columns:
        converted = dataframe[name].astype(new_type)
        dataframe[name] = converted
    return df_info(dataframe)


In [7]:
#strip '$', ',' and whitespace from the numeric columns (regex replace with '')
# Raw string: '\$' inside a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions. The pattern is unchanged.
replace_characters(data, numeric_columns, r'\$|,|\s', '', True)

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Project_State,City,Uid,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,1,2137925650,Film & Video,11,Science Fiction,301,London,England,Greater London,1076478145,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,2,1501531085,Film & Video,11,Fantasy,296,Los Angeles,CA,Los Angeles,224946798,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,3,953415668,Technology,16,Software,51,Mexico,Baja California,Tijuana,1772203542,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,4,1371386304,Publishing,18,Publishing,18,Columbus,OH,Franklin,1373465389,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,5,1720842777,Art,1,Illustration,22,Toronto,ON,Toronto,1455666383,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506194,506195,932773640,Publishing,18,Children's Books,46,Palo Alto,CA,Santa Clara,1484349690,8/7/2014,9/6/2014,USD,1,0,1,0,0,failed
506195,506196,620302213,Art,1,Conceptual Art,20,Detroit,MI,Wayne,211945026,11/25/2009,12/4/2009,USD,0,100,0,100,6,successful
506196,506197,688564643,Publishing,18,Fiction,47,Lyme,NH,Grafton,388384107,11/7/2011,12/13/2011,USD,0,0,0,0,0,canceled
506197,506198,9572984,Film & Video,11,Shorts,32,New York,NY,,1600537964,1/25/2012,3/16/2012,USD,0,0,0,0,0,failed


In [8]:
#replace '' in numeric columns with np.nan
# After the previous step, values that were only whitespace are '' here.
# NOTE(review): regex=True is passed with an empty pattern. The displayed
# output shows non-empty values untouched, so this appears to act as an exact
# match on ''; confirm against the pandas version in use.
replace_characters(data, numeric_columns, '', np.nan, True)

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Project_State,City,Uid,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,1,2137925650,Film & Video,11,Science Fiction,301,London,England,Greater London,1076478145,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,2,1501531085,Film & Video,11,Fantasy,296,Los Angeles,CA,Los Angeles,224946798,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,3,953415668,Technology,16,Software,51,Mexico,Baja California,Tijuana,1772203542,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,4,1371386304,Publishing,18,Publishing,18,Columbus,OH,Franklin,1373465389,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,5,1720842777,Art,1,Illustration,22,Toronto,ON,Toronto,1455666383,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506194,506195,932773640,Publishing,18,Children's Books,46,Palo Alto,CA,Santa Clara,1484349690,8/7/2014,9/6/2014,USD,1,0,1,0,0,failed
506195,506196,620302213,Art,1,Conceptual Art,20,Detroit,MI,Wayne,211945026,11/25/2009,12/4/2009,USD,0,100,0,100,6,successful
506196,506197,688564643,Publishing,18,Fiction,47,Lyme,NH,Grafton,388384107,11/7/2011,12/13/2011,USD,0,0,0,0,0,canceled
506197,506198,9572984,Film & Video,11,Shorts,32,New York,NY,,1600537964,1/25/2012,3/16/2012,USD,0,0,0,0,0,failed


In [9]:
#replace every value that is exactly a single space ' ' with np.nan, in all columns
# regex=False, so only whole-value matches of ' ' are replaced.
replace_characters(data, data.columns, ' ', np.nan, False)

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Project_State,City,Uid,Launched,Deadline,Project_Currency,Goal,Pledged,Goal (USD),Pledged (USD),Backers (#),Status
0,1,2137925650,Film & Video,11,Science Fiction,301,London,England,Greater London,1076478145,8/11/2016,10/10/2016,USD,100000000,0,100000000,0,0,canceled
1,2,1501531085,Film & Video,11,Fantasy,296,Los Angeles,CA,Los Angeles,224946798,12/19/2019,2/14/2020,USD,100000000,85,100000000,85,4,canceled
2,3,953415668,Technology,16,Software,51,Mexico,Baja California,Tijuana,1772203542,3/1/2017,3/22/2017,MXN,100000000,10,5219374,1,1,failed
3,4,1371386304,Publishing,18,Publishing,18,Columbus,OH,Franklin,1373465389,6/4/2018,7/5/2018,USD,100000000,1,100000000,1,1,canceled
4,5,1720842777,Art,1,Illustration,22,Toronto,ON,Toronto,1455666383,5/1/2015,6/30/2015,CAD,100000000,0,80610122,0,0,failed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506194,506195,932773640,Publishing,18,Children's Books,46,Palo Alto,CA,Santa Clara,1484349690,8/7/2014,9/6/2014,USD,1,0,1,0,0,failed
506195,506196,620302213,Art,1,Conceptual Art,20,Detroit,MI,Wayne,211945026,11/25/2009,12/4/2009,USD,0,100,0,100,6,successful
506196,506197,688564643,Publishing,18,Fiction,47,Lyme,NH,Grafton,388384107,11/7/2011,12/13/2011,USD,0,0,0,0,0,canceled
506197,506198,9572984,Film & Video,11,Shorts,32,New York,NY,,1600537964,1/25/2012,3/16/2012,USD,0,0,0,0,0,failed


In [10]:
#change Status values to title case
# e.g. 'failed' -> 'Failed'; later cells compare against 'Successful' and 'Failed'.

data['Status'] = data['Status'].str.title()

In [13]:
# see if above worked
# Prints per-column null counts and percentages when any null is present.
check_nulls(data)

Here are the number of nulls per column: 
Case_ID                 0
Pid                     0
Category                0
Category_id             0
Subcategory             0
Subcategory_id          0
Project_Country      1873
Project_State        2014
City                64082
Uid                     0
Launched                0
Deadline                0
Project_Currency        0
Goal                    0
Pledged                 0
Goal (USD)              0
Pledged (USD)           0
Backers (#)         10148
Status                  0
dtype: int64
Here are the percent of nulls per column: 
Case_ID              0.00
Pid                  0.00
Category             0.00
Category_id          0.00
Subcategory          0.00
Subcategory_id       0.00
Project_Country      0.37
Project_State        0.40
City                12.66
Uid                  0.00
Launched             0.00
Deadline             0.00
Project_Currency     0.00
Goal                 0.00
Pledged              0.00
Goal (USD)        

Plan:
- Use Project_Currency to fill in Project_Country.
- Drop Project_State and City.

In [11]:
#make dictionary using project currency

currency_codes = {'USD': 'United States of America', 'EUR': 'European Union', 'GBP': 'United Kingdom', 'CAD': 'Canada',
                  'AUD': 'Australia', 'MXN': 'Mexico', 'SEK': 'Sweden', 'HKD': 'Hong Kong',
                  'NZD': 'New Zealand', 'DKK': 'Denmark', 'SGD': 'Singapore', 'CHF': 'Switzerland',
                  'NOK': 'Norway', 'JPY': 'Japan', 'PLN': 'Poland'}

#replace Project_Country values with currency code dictionary
# Every row is overwritten (not only the missing ones); a currency code that is
# absent from currency_codes becomes NaN.
data['Project_Country'] = data['Project_Currency'].map(currency_codes)

#Drop Project_State, City, Goal and Pledged - redundant features; the analysis focuses on the USD numbers
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
data = data.drop(columns=['Project_State', 'City', 'Goal', 'Pledged'], errors='ignore')

In [12]:
#change Launched and Deadline to datetime
# The file stores dates as month/day/year (e.g. 10/30/2017). Stating the format
# avoids per-value inference and any day-first / month-first ambiguity.

for date_column in ['Launched', 'Deadline']:
    data[date_column] = pd.to_datetime(data[date_column], format='%m/%d/%Y')

In [15]:
#some more data cleaning

#fill nan with 0 in Backers (#)
# Assign the result back: fillna(..., inplace=True) on a column selection acts
# on a temporary object and does not update `data` under pandas Copy-on-Write.
data['Backers (#)'] = data['Backers (#)'].fillna(0)

#change data type of Goal (USD), Pledged (USD), Backers (#) to int
change_data_type(data, ['Goal (USD)', 'Pledged (USD)', 'Backers (#)'], int)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506199 entries, 0 to 506198
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Case_ID           506199 non-null  object        
 1   Pid               506199 non-null  object        
 2   Category          506199 non-null  object        
 3   Category_id       506199 non-null  object        
 4   Subcategory       506199 non-null  object        
 5   Subcategory_id    506199 non-null  object        
 6   Project_Country   506199 non-null  object        
 7   Uid               506199 non-null  object        
 8   Launched          506199 non-null  datetime64[ns]
 9   Deadline          506199 non-null  datetime64[ns]
 10  Project_Currency  506199 non-null  object        
 11  Goal (USD)        506199 non-null  int64         
 12  Pledged (USD)     506199 non-null  int64         
 13  Backers (#)       506199 non-null  int64         
 14  Stat

In [16]:
#check for successful campaigns that have 0 backers - 9890 instances will replace these with medians
# Boolean mask over `data`. It also includes rows whose missing Backers (#)
# were filled with 0 in the earlier cleaning cell.
success_no_backers = (data['Backers (#)'] == 0) & (data['Status'] == 'Successful')
success_no_backers.sum()

9890

In [17]:
#get median values for successful campaigns
# Median over successful campaigns that have a non-zero backer count.
has_backers = (data['Status'] == 'Successful') & (data['Backers (#)'] != 0)
median_backers_success = data.loc[has_backers, 'Backers (#)'].median()

# Replace the values that are 0 with the median value
# median() returns a float and can be fractional (x.5). Round and cast it so
# the int64 Backers (#) column is not silently upcast to float.
data.loc[success_no_backers, 'Backers (#)'] = int(round(median_backers_success))

In [18]:
#check to see if above worked - expect an empty frame (no Successful rows left with 0 backers)
data[(data['Backers (#)'] == 0) & (data['Status'] == 'Successful')]

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status


We also need to make sure the classifications make sense: any campaign with Pledged >= Goal should be Successful.

In [19]:
#check if any of the numeric columns have values under 0
# under_0 holds every row where Goal (USD), Pledged (USD) or Backers (#) is negative.

under_0 =  data[(data['Goal (USD)'] < 0) | (data['Pledged (USD)'] < 0) | (data['Backers (#)'] < 0)]

under_0

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status
14428,14429,1317300700,Design,7,Product Design,28,United States of America,924758631,2014-11-19,2015-01-01,USD,150000,0,-2,Failed
117146,117147,1474098850,Design,7,Product Design,28,United States of America,1521927591,2014-12-09,2015-01-13,USD,20000,0,-2,Failed


In [20]:
#replace negative in Backers(#) - they were classified as Failed, so 0 makes sense.
# Mask on the Backers column itself. under_0 also selects rows for a negative
# Goal or Pledged, whose backer counts must not be reset.
data.loc[data['Backers (#)'] < 0, 'Backers (#)'] = 0

In [21]:
#check above - expect an empty frame (no negative Goal, Pledged or Backers values left)
data[(data['Goal (USD)'] < 0) | (data['Pledged (USD)'] < 0) | (data['Backers (#)'] < 0)]

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status


In [22]:
#check for Successful status where Pledged < Goal - these should be Failures
# NOTE(review): the original comment said 6 instances, but the displayed output lists 3 rows.
should_be_fails = data[(data['Status'] == 'Successful') & (data['Pledged (USD)'] < data['Goal (USD)'])]
should_be_fails

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status
63410,63411,1769772834,Film & Video,11,Shorts,32,United States of America,467587104,2015-11-12,2015-12-27,USD,36000,30716,275,Successful
96929,96930,780863434,Design,7,Product Design,28,European Union,1859408531,2016-02-08,2016-03-19,EUR,28174,28053,90,Successful
506024,506025,715247891,Comics,3,Comics,3,European Union,343048188,2020-03-12,2020-03-13,EUR,1,0,70,Successful


In [23]:
# Relabel as 'Failed' only the campaigns marked Successful whose pledges fell short of the goal.
# Without the Status filter, every campaign with Pledged < Goal was overwritten
# with 'Failed', whatever its status was (e.g. Canceled).
data.loc[(data['Status'] == 'Successful') & (data['Pledged (USD)'] < data['Goal (USD)']), 'Status'] = 'Failed'

In [24]:
#check above - expect an empty frame (no Successful rows with Pledged < Goal left)
data[(data['Status'] == 'Successful') & (data['Pledged (USD)'] < data['Goal (USD)'])]

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status


In [25]:
#check for Failed campaigns where Pledged > Goal - these should be Successful
# NOTE(review): the original comment said 2065 instances, but the displayed count is 14.
# The comparison is strict (>), so campaigns with Pledged == Goal are not selected.
should_be_success = (data['Status'] == 'Failed') & (data['Pledged (USD)'] > data['Goal (USD)'])

should_be_success.sum()

14

In [26]:
#replace above values: rows selected by should_be_success get Status 'Successful'
data.loc[should_be_success, 'Status'] = 'Successful'

In [27]:
#check above - expect an empty frame (no Failed rows with Pledged > Goal left)
data[(data['Status'] == 'Failed') & (data['Pledged (USD)'] > data['Goal (USD)'])]

Unnamed: 0,Case_ID,Pid,Category,Category_id,Subcategory,Subcategory_id,Project_Country,Uid,Launched,Deadline,Project_Currency,Goal (USD),Pledged (USD),Backers (#),Status


In [27]:
#save to file for further work
# orient='records' writes the rows as a JSON list of {column: value} objects.
# NOTE(review): Launched and Deadline are datetime columns at this point;
# confirm the serialized date representation suits the downstream reader.
data.to_json('raw_clean_data.json', orient='records')