# Data Load

In [1]:
import pandas as pd
RawData = pd.read_csv("FundingProject.csv")
RawData.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1015685046,Organic Tattoo remains 10 days (Self use & cus...,Accessories,Fashion,CAD,1/1/2016,6000,12/7/2015 18:21,100.0,failed,2,CA,74.84,70.89,4253.51
1,1019043170,Handcrafted leather wallet | key holder | card...,Accessories,Fashion,EUR,1/1/2016,2000,12/11/2015 10:52,2102.0,successful,48,BE,2314.01,2282.97,2172.19
2,1036288991,The Liaisons: a new twist on the old standards,Jazz,Music,USD,1/1/2016,5000,11/9/2015 20:12,5630.0,successful,116,US,5630.0,5630.0,5000.0
3,1045749249,The Many Encounters of Bosley Bear,Children's Books,Publishing,USD,1/1/2016,20000,11/24/2015 2:08,101.0,failed,2,US,101.0,101.0,20000.0
4,1048577059,Swift & Co | Innovative Men's Footwear,Footwear,Fashion,GBP,1/1/2016,40000,11/17/2015 12:30,2246.0,failed,19,GB,3416.59,3273.48,58298.84


In [2]:
RawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110815 entries, 0 to 110814
Data columns (total 15 columns):
ID                  110815 non-null int64
name                110814 non-null object
category            110815 non-null object
main_category       110815 non-null object
currency            110815 non-null object
deadline            110815 non-null object
goal                110815 non-null int64
launched            110815 non-null object
pledged             110815 non-null float64
state               110815 non-null object
backers             110815 non-null int64
country             110815 non-null object
usd pledged         109999 non-null float64
usd_pledged_real    110815 non-null float64
usd_goal_real       110815 non-null float64
dtypes: float64(4), int64(3), object(8)
memory usage: 12.7+ MB


# Process Data

In [3]:
import pandas as pd
ProcessedData = pd.DataFrame()

### Project status 

In [4]:
RawData["state"].describe()

count     110815
unique         6
top       failed
freq       58155
Name: state, dtype: object

In [5]:
RawData["state"].value_counts()

failed        58155
successful    38172
canceled      13178
suspended       707
undefined       596
live              7
Name: state, dtype: int64

In [6]:
# Remap status labels to numerical values, bias starting from 2 for failures, scale to range of 1
# At the same time, drop rows with unrelated state values or values which cannot be processed
ProcessedData["Status"] = RawData.loc[RawData["state"].isin(["failed", "successful", "canceled", "suspended"]), "state"].map({
    "canceled" : 2,
    "failed" : 2,
    "suspended" : 2,
    "successful": 3,
})

In [7]:
ProcessedData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110212 entries, 0 to 110814
Data columns (total 1 columns):
Status    110212 non-null int64
dtypes: int64(1)
memory usage: 1.7 MB


In [8]:
ProcessedData["Status"].value_counts()

2    72040
3    38172
Name: Status, dtype: int64

In [9]:
RawData.isnull().sum()

ID                    0
name                  1
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged         816
usd_pledged_real      0
usd_goal_real         0
dtype: int64

# Modeling

In [10]:
RawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110815 entries, 0 to 110814
Data columns (total 15 columns):
ID                  110815 non-null int64
name                110814 non-null object
category            110815 non-null object
main_category       110815 non-null object
currency            110815 non-null object
deadline            110815 non-null object
goal                110815 non-null int64
launched            110815 non-null object
pledged             110815 non-null float64
state               110815 non-null object
backers             110815 non-null int64
country             110815 non-null object
usd pledged         109999 non-null float64
usd_pledged_real    110815 non-null float64
usd_goal_real       110815 non-null float64
dtypes: float64(4), int64(3), object(8)
memory usage: 12.7+ MB


In [11]:
ProcessedData = pd.concat([ProcessedData["Status"], RawData.drop(["state", "name"], axis = 1)], axis = 1)

In [12]:
ProcessedData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110815 entries, 0 to 110814
Data columns (total 14 columns):
Status              110212 non-null float64
ID                  110815 non-null int64
category            110815 non-null object
main_category       110815 non-null object
currency            110815 non-null object
deadline            110815 non-null object
goal                110815 non-null int64
launched            110815 non-null object
pledged             110815 non-null float64
backers             110815 non-null int64
country             110815 non-null object
usd pledged         109999 non-null float64
usd_pledged_real    110815 non-null float64
usd_goal_real       110815 non-null float64
dtypes: float64(5), int64(3), object(6)
memory usage: 12.7+ MB


In [13]:
ProcessedData.dropna(subset = ["Status", "usd pledged"], inplace = True)
ProcessedData.drop_duplicates(["ID"])
ProcessedData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109993 entries, 0 to 110814
Data columns (total 14 columns):
Status              109993 non-null float64
ID                  109993 non-null int64
category            109993 non-null object
main_category       109993 non-null object
currency            109993 non-null object
deadline            109993 non-null object
goal                109993 non-null int64
launched            109993 non-null object
pledged             109993 non-null float64
backers             109993 non-null int64
country             109993 non-null object
usd pledged         109993 non-null float64
usd_pledged_real    109993 non-null float64
usd_goal_real       109993 non-null float64
dtypes: float64(5), int64(3), object(6)
memory usage: 12.6+ MB


In [27]:
ProcessedData["Status"].value_counts()

2.0    71912
3.0    38081
Name: Status, dtype: int64

In [29]:
for column in ProcessedData.columns.to_list():
    print(column, ":", ProcessedData[column].duplicated().sum(), " - " , ProcessedData[column].isnull().sum())

Status : 109991  -  0
ID : 0  -  0
category : 109834  -  0
main_category : 109978  -  0
currency : 109979  -  0
deadline : 109262  -  0
goal : 106352  -  0
launched : 8267  -  0
pledged : 83476  -  0
backers : 107364  -  0
country : 109971  -  0
usd pledged : 74632  -  0
usd_pledged_real : 62405  -  0
usd_goal_real : 85920  -  0


In [14]:
from datetime import datetime
print(datetime.now())

2020-02-05 12:33:17.294210


## Generate baseline model without using title field

In [65]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

def AssessRegressorModels(X_Train, Y_Train, X_Test, Y_Test, Models_List):
    from datetime import datetime
    import pandas as pd
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    
    results_list = pd.DataFrame()
    
    for model in Models_List:
        modelInstance = model()
        startTime = datetime.now()
        print("Starting " + str(model.__name__) + " at " + str(startTime))
        
        modelInstance.fit(X_Train, Y_Train)
        predictions = modelInstance.predict(X_Test)
        
        endTime = datetime.now()
        print("Ending " + str(model.__name__) + " at " + str(endTime))
        results_list = results_list.append(pd.DataFrame(data = {
            "Name" : model.__name__,
            "R2Score" : r2_score(Y_Test, predictions),
            "RMSE" : mean_squared_error(Y_Test, predictions)**0.5,
            "MAE" : mean_absolute_error(Y_Test, predictions),
            "RunTime" : str(endTime - startTime),
            "ModelObject" : [modelInstance],
        }), ignore_index = True)
    
    results_list.sort_values("R2Score", inplace = True, ascending = False)
    
    return results_list

In [66]:
TrainingData = pd.get_dummies(ProcessedData.loc[:10000, ["Status", "category", "main_category", "currency", "deadline", "goal", "launched", "country"]], drop_first = True)

In [67]:
print(datetime.now())

2020-02-05 13:35:18.182751


In [68]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9668 entries, 0 to 10000
Columns: 9394 entries, Status to country_US
dtypes: float64(1), int64(1), uint8(9392)
memory usage: 86.8 MB


In [69]:
print(datetime.now())

2020-02-05 13:35:18.754567


In [70]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(TrainingData.drop("Status", axis = 1), TrainingData["Status"], test_size = 0.2, random_state = 13)

In [71]:
print(datetime.now())

2020-02-05 13:35:19.075465


In [72]:
RegressorResults_List = AssessRegressorModels(Train_X, Train_Y, Test_X, Test_Y, [LinearRegression, Ridge, Lasso, GradientBoostingRegressor, XGBRegressor])

Starting LinearRegression at 2020-02-05 13:35:19.081463
Ending LinearRegression at 2020-02-05 13:37:07.816674
Starting Ridge at 2020-02-05 13:37:07.820673


  overwrite_a=False)


Ending Ridge at 2020-02-05 13:37:19.251015
Starting Lasso at 2020-02-05 13:37:19.255014
Ending Lasso at 2020-02-05 13:37:20.991460
Starting GradientBoostingRegressor at 2020-02-05 13:37:20.995458
Ending GradientBoostingRegressor at 2020-02-05 13:38:21.562080
Starting XGBRegressor at 2020-02-05 13:38:21.565079


  if getattr(data, 'base', None) is not None and \


Ending XGBRegressor at 2020-02-05 13:39:44.016699


In [73]:
with pd.option_context("display.max_columns", None, "display.max_rows", None, "display.max_colwidth", 40):
    display(RegressorResults_List)

Unnamed: 0,Name,R2Score,RMSE,MAE,RunTime,ModelObject
4,XGBRegressor,0.157121,0.408769,0.346258,0:01:22.451620,"XGBRegressor(base_score=0.5, booster..."
3,GradientBoostingRegressor,0.155916,0.409062,0.347093,0:01:00.566622,([DecisionTreeRegressor(ccp_alpha=0....
1,Ridge,0.090972,0.424507,0.355962,0:00:11.430342,"Ridge(alpha=1.0, copy_X=True, fit_in..."
2,Lasso,0.00045,0.445142,0.40018,0:00:01.736446,"Lasso(alpha=1.0, copy_X=True, fit_in..."
0,LinearRegression,-2.73117,0.86004,0.700911,0:01:48.735211,"LinearRegression(copy_X=True, fit_in..."


In [74]:
print(datetime.now())

2020-02-05 13:39:44.103671


In [75]:
pd.DataFrame(data = {
    "Names" : TrainingData.drop("Status", axis = 1).columns.to_list(),
    "Weight": (RegressorResults_List.loc[RegressorResults_List["Name"] == "XGBRegressor", "ModelObject"]).iloc[0].feature_importances_,
}).sort_values(by="Weight", ascending = False)

Unnamed: 0,Names,Weight
0,goal,0.035390
7,category_Apparel,0.030064
133,category_Tabletop Games,0.027678
165,main_category_Music,0.027347
126,category_Shorts,0.023485
...,...,...
3204,launched_1/31/2016 20:06,0.000000
3205,launched_1/31/2016 20:13,0.000000
3206,launched_1/31/2016 20:17,0.000000
3207,launched_1/31/2016 20:24,0.000000


## Generate model using title field

### Tokenize words in title

### Determine parts of speech for each word

### Calculate readability / complexity / length of text
How likely title is glossed over
Flesch Kincaid?

### Calculate sentiment of each title
How positive or optimistic titles feel

### Calculate average TD-IDF of words in each title
How often are similar campaigns started.
Basically reader fatigue.