In [2]:
import pandas as pd

In [6]:
# The path to our CSV file
file = 'Resources/KickstarterData.csv'

# Read our Kickstarter data into pandas
df = pd.read_csv(file)
df.head()
df.shape

(4120, 33)

In [8]:
# Get a list of all of our columns for easy reference
df.columns

Index(['id', 'photo', 'name', 'blurb', 'goal', 'pledged', 'state', 'slug',
       'disable_communication', 'country', 'currency', 'currency_symbol',
       'currency_trailing_code', 'deadline', 'state_changed_at', 'created_at',
       'launched_at', 'staff_pick', 'is_starrable', 'backers_count',
       'static_usd_rate', 'usd_pledged', 'creator', 'location', 'category',
       'profile', 'spotlight', 'urls', 'source_url', 'friends', 'is_starred',
       'is_backing', 'permissions'],
      dtype='object')

In [13]:
# Extract "name", "goal", "pledged", "state", "country", "staff_pick",
# "backers_count", and "spotlight"
good_columns = ["name", "goal", "pledged", "state", "country", "staff_pick", "backers_count", "spotlight"]
reduced_kickstarter_df = df.loc[:,good_columns]
reduced_kickstarter_df = df[good_columns]
reduced_kickstarter_df.head()

Unnamed: 0,name,goal,pledged,state,country,staff_pick,backers_count,spotlight
0,The Class Act Players Theatre Company Presents...,1500.0,2925.0,successful,US,False,17,True
1,MR INCREDIBLE by Camilla Whitehill - VAULT Fes...,2500.0,2936.0,successful,GB,True,15,True
2,RUN,1000.0,1200.0,successful,GB,False,30,True
3,9th International Meeting of Youth Theatre sap...,2000.0,2135.0,successful,IT,False,24,True
4,Get Conti to the Ed Fringe!,1000.0,1250.0,successful,GB,False,28,True


In [12]:
reduced_kickstarter_df.columns
#reduced_kickstarter_df.shape

(4120, 8)

In [16]:
# Remove projects that made no money at all
reduced_kickstarter_df = reduced_kickstarter_df.loc[reduced_kickstarter_df['pledged']!=0,:]
reduced_kickstarter_df.head()
reduced_kickstarter_df['pledged'].min()

1.0

In [18]:
reduced_kickstarter_df['country'].value_counts()

US    2129
GB     631
CA     146
AU     114
DE      43
NL      35
ES      25
MX      24
FR      23
IT      21
DK      20
IE      18
SE      16
NZ      12
CH      12
NO      11
BE      10
AT       7
SG       6
HK       3
LU       1
Name: country, dtype: int64

In [38]:
# Collect only those projects that were hosted in the US
# Create a list of the columns
# Create a new df for "US" with the columns above. 
hosted_in_us = reduced_kickstarter_df.loc[reduced_kickstarter_df['country']=='US']
hosted_in_us.head()

Unnamed: 0,name,goal,pledged,state,country,staff_pick,backers_count,spotlight,average_donation
0,The Class Act Players Theatre Company Presents...,1500.0,2925.0,successful,US,False,17,True,172.058824
8,Forefront Festival 2015,7200.0,7230.0,successful,US,False,68,True,106.323529
11,Hamlet the Hip-Hopera,9747.0,10103.0,successful,US,True,132,True,76.537879
14,Pride Con,15000.0,15110.0,successful,US,False,60,True,251.833333
15,En Garde Arts Emerging Artists Festival BOSSS,10000.0,10306.0,successful,US,True,80,True,128.825


In [39]:
# Create a new column that finds the average amount pledged to a project
average_donation = hosted_in_us['pledged'] / hosted_in_us['backers_count']
hosted_in_us['average_donation'] = average_donation
hosted_in_us.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,goal,pledged,state,country,staff_pick,backers_count,spotlight,average_donation
0,The Class Act Players Theatre Company Presents...,1500.0,2925.0,successful,US,False,17,True,172.058824
8,Forefront Festival 2015,7200.0,7230.0,successful,US,False,68,True,106.323529
11,Hamlet the Hip-Hopera,9747.0,10103.0,successful,US,True,132,True,76.537879
14,Pride Con,15000.0,15110.0,successful,US,False,60,True,251.833333
15,En Garde Arts Emerging Artists Festival BOSSS,10000.0,10306.0,successful,US,True,80,True,128.825


In [40]:
# First convert "average_donation", "goal", and "pledged" columns to float
# Then Format to go to two decimal places, include a dollar sign, and use comma notation
#hosted_in_us.columns
#hosted_in_us[['average_donation','goal', 'pledged']].astype("float")
#hosted_in_us['average_donation'] = hosted_in_us['average_donation'].astype('float')
hosted_in_us['average_donation'] = hosted_in_us['average_donation'].map("${:.2f}".format)
hosted_in_us['goal'] = hosted_in_us['goal'].map("${:.2f}".format)
hosted_in_us['pledged'] = hosted_in_us['pledged'].map("${:.2f}".format)
hosted_in_us.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,name,goal,pledged,state,country,staff_pick,backers_count,spotlight,average_donation
0,The Class Act Players Theatre Company Presents...,$1500.00,$2925.00,successful,US,False,17,True,$172.06
8,Forefront Festival 2015,$7200.00,$7230.00,successful,US,False,68,True,$106.32
11,Hamlet the Hip-Hopera,$9747.00,$10103.00,successful,US,True,132,True,$76.54
14,Pride Con,$15000.00,$15110.00,successful,US,False,60,True,$251.83
15,En Garde Arts Emerging Artists Festival BOSSS,$10000.00,$10306.00,successful,US,True,80,True,$128.82


In [41]:
# Calculate the total number of backers for all US projects
hosted_in_us['backers_count'].sum()

89273

In [42]:
# Calculate the average number of backers for all US projects
hosted_in_us['backers_count'].mean()

41.931892907468296

In [46]:
# Collect only those US campaigns that have been picked as a "Staff Pick"
picked_by_staff = hosted_in_us[hosted_in_us['staff_pick']]

In [48]:
# Group by the state of the campaigns and see if staff picks matter (Seems to matter quite a bit)
picked_by_staff.groupby('state')['name'].count()

state
canceled        6
failed         21
live            2
successful    145
Name: name, dtype: int64

In [49]:
picked_by_staff.groupby('state').count()

Unnamed: 0_level_0,name,goal,pledged,country,staff_pick,backers_count,spotlight,average_donation
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
canceled,6,6,6,6,6,6,6,6
failed,21,21,21,21,21,21,21,21
live,2,2,2,2,2,2,2,2
successful,145,145,145,145,145,145,145,145
