In [1]:
# Import library

import pandas as pd 
import numpy as np
import math

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the preprocessed dataset from the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' header. If that is a
# real column (e.g. an index written out by an earlier to_csv), it is not
# dropped later and would be used as a feature; confirm, and consider
# index_col=0 if so.
df = pd.read_csv('processed_data.csv')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,duration,goal_usd,status,usd_pledged,US_based,comics,crafts,dance,design,fashion,...,3,4,5,6,7,8,9,10,11,12
0,16.0,2000.0,1,6061.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,30.0,3870.99771,1,3914.50512,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,30.0,1100.0,1,1110.0,1,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45.0,3500.0,1,4807.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,60.0,30000.0,1,40368.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Target: the pledged amount. Features: every remaining column except
# 'status' and 'goal_usd', which are removed together with the target.
y = df['usd_pledged']
X = df.drop(columns=['usd_pledged', 'status', 'goal_usd'])

# 80% train / 20% test. `shuffle` is not passed, so the library default
# applies; the fixed random_state keeps the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [4]:
# Feature scaling: map every feature column into [0, 1].
# The scaler learns its min/max from the training split only and the same
# fitted transform is then applied to the test split.
de_scaler = MinMaxScaler(feature_range=(0, 1))
de_scaler.fit(x_train)
x_train = de_scaler.transform(x_train)
x_test = de_scaler.transform(x_test)


## Linear Regression Model 

In [5]:
# Linear regression fitted on the training split, evaluated on the test split.
goal_reg = LinearRegression().fit(x_train, y_train)
y_pred = goal_reg.predict(x_test)

# Compute the MSE once and derive the RMSE from it.
linear_mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {linear_mse}')
print(f'Root Mean Squared Error: {math.sqrt(linear_mse)}')
# The value labelled "Variance score" is r2_score (coefficient of determination).
print(f'Variance score: {r2_score(y_test, y_pred)}')


Mean Squared Error: 6582195326.697448
Root Mean Squared Error: 81130.72985433725
Variance score: 0.017198281012048122


## Decision Tree Regression 

In [6]:
from sklearn.tree import DecisionTreeRegressor 
# Decision tree with no depth limit set, seeded for repeatability.
tree_reg = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
y_tree_pred = tree_reg.predict(x_test)

# Compute the MSE once and derive the RMSE from it.
tree_mse = mean_squared_error(y_test, y_tree_pred)
print(f'Mean Squared Error: {tree_mse}')
print(f'Root Mean Squared Error: {math.sqrt(tree_mse)}')
# The value labelled "Variance score" is r2_score (coefficient of determination).
print(f'Variance score: {r2_score(y_test, y_tree_pred)}')


Mean Squared Error: 7412540151.925402
Root Mean Squared Error: 86096.10996976228
Variance score: -0.10678228794445466


In [7]:
# Depth reached by the tree fitted without a max_depth setting; shown as the cell output.
tree_reg.get_depth()

38

In [8]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
def gridsearch(model, parameters, X_train, y_train):
    """Run a grid search over ``parameters`` and print a short report.

    Args:
        model: Estimator passed to ``GridSearchCV`` as the base model.
        parameters: Parameter grid passed to ``GridSearchCV``.
        X_train: Training features the search is fitted on.
        y_train: Training targets the search is fitted on.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can read
        ``best_params_`` / ``best_score_`` from it.
    """
    search = GridSearchCV(model, parameters, n_jobs=-1)
    # Fit on the arguments supplied by the caller rather than on a
    # notebook-level variable, so the function works with any dataset.
    search.fit(X_train, y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

In [9]:
# Candidate tree depths 1 through 29 (range end is exclusive).
param = {'max_depth': [depth for depth in range(1, 30)]}
# Last expression of the cell, so the returned search object is displayed.
gridsearch(tree_reg, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}
Best Score : 0.02784207275533266
Best parameters: {'max_depth': 5}


GridSearchCV(estimator=DecisionTreeRegressor(random_state=0), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29]})

In [10]:
## Fine tuning
# Refit the tree at max_depth=5, the depth the grid search reported as best.
# random_state=0 matches the seeded estimator used in the search, so this
# refit gives the same tree on every run.
tree_reg2 = DecisionTreeRegressor(max_depth=5, random_state=0)
tree_reg2.fit(x_train, y_train)
# Note: this rebinds y_tree_pred, replacing the untuned tree's predictions.
y_tree_pred = tree_reg2.predict(x_test)

# Compute the MSE once and derive the RMSE from it.
tuned_tree_mse = mean_squared_error(y_test, y_tree_pred)
print(f'Mean Squared Error: {tuned_tree_mse}')
print(f'Root Mean Squared Error: {math.sqrt(tuned_tree_mse)}')
# The value labelled "Variance score" is r2_score (coefficient of determination).
print(f'Variance score: {r2_score(y_test, y_tree_pred)}')

Mean Squared Error: 6507032931.506234
Root Mean Squared Error: 80666.1820808834
Variance score: 0.02842093964351733


## AdaBoost Regression

In [11]:
from sklearn.ensemble import AdaBoostRegressor

In [12]:
# AdaBoost regressor with default settings apart from the fixed seed.
ada_regr = AdaBoostRegressor(random_state=0).fit(x_train, y_train)
y_ada_pred = ada_regr.predict(x_test)

# Compute the MSE once and derive the RMSE from it.
ada_mse = mean_squared_error(y_test, y_ada_pred)
print(f'Mean Squared Error: {ada_mse}')
print(f'Root Mean Squared Error: {math.sqrt(ada_mse)}')
# The value labelled "Variance score" is r2_score (coefficient of determination).
print(f'Variance score: {r2_score(y_test, y_ada_pred)}')

Mean Squared Error: 37106306247.524605
Root Mean Squared Error: 192629.97234990355
Variance score: -4.5404222687596985
