## Regression Model

We could technically stop at the classification task, as our imaginary client might be satisfied with knowing whether their campaign will succeed given the required inputs. However, can we do more? Perhaps we can help them estimate how much they are likely to raise with the given variables. We can attempt to predict the probability that the campaign succeeds and use it as an additional input to predict how much they are likely to raise. To do this, we first need to find the right model to predict the probability. Next, we need to find out whether there is a correlation between this probability and the amount raised.

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning from here on. This runs before the scikit-learn import
# below, so statement order in this cell matters.
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
# Visual confirmation that the import cell ran to completion.
print("done")

In [None]:
# Data Processing
# Load the modelling table and discard the unnamed column written by the CSV export.
df = pd.read_csv("variables.csv").drop(columns=['Unnamed: 0'])
df_x = df[['Total_Days','currency','category','month_launch']]
df_y = df[['amount_raised_usd_boxcox','state']]

# Cast every predictor to object dtype so get_dummies one-hot encodes all of them.
df_x_onehot = pd.get_dummies(df_x.astype('object'))

# Two feature sets: the one-hot predictors alone, and the same predictors plus text scores.
df_x_noText = df_x_onehot.copy()
df_x_Text = df_x_onehot.copy()

text_df = pd.read_csv("text_results_cleaned.csv")
# Columns are copied by index label.
# NOTE(review): assumes text_df rows are in the same order as variables.csv - confirm.
for text_col in ('compound_tag_vader', 'pos_tag_disBert', 'compound_ti_vader', 'pos_ti_disBert'):
    df_x_Text[text_col] = text_df[text_col]

In [None]:
# splitting data
# Both feature sets share the same split settings (same seed, same stratification),
# so the train/test rows of the two splits correspond one-to-one.
split_options = dict(test_size=0.2, stratify=df_y['state'], random_state=42)

x_train_noText, x_test_noText, y_train_noText, y_test_noText = train_test_split(
    df_x_noText, df_y, **split_options)
x_train_Text, x_test_Text, y_train_Text, y_test_Text = train_test_split(
    df_x_Text, df_y, **split_options)

In [None]:
# Logistic regression on the one-hot predictors only (no text scores).
from sklearn.linear_model import LogisticRegression

log_clf_noText = LogisticRegression().fit(x_train_noText, y_train_noText.loc[:, 'state'])
log_clf_noText  # show the fitted estimator, as the bare .fit() call did

In [None]:
# Calibration curve of the no-text logistic model, computed on the training rows.
from sklearn.calibration import calibration_curve

# Column 1 of predict_proba is the probability of the positive class.
y_train_predict_proba_logNoText = log_clf_noText.predict_proba(x_train_noText)[:, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_train_noText['state'], y_train_predict_proba_logNoText, n_bins=10)

fig, ax = plt.subplots()
ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='model performance')
ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfectly Caliberated')  # diagonal reference
ax.set_xlabel("mean predicted value")
ax.set_ylabel("fraction of positives")
ax.legend()
plt.show()

In [None]:
# Start the table of training-set probabilities, one column per classifier.
df_prob = pd.DataFrame({'log_prob_noText': y_train_predict_proba_logNoText})

In [None]:
# Logistic regression on the one-hot predictors plus the text scores.
log_clf_Text = LogisticRegression().fit(x_train_Text, y_train_Text.loc[:, 'state'])
log_clf_Text  # show the fitted estimator, as the bare .fit() call did

In [None]:
# Calibration curve of the text logistic model, computed on the training rows.
y_train_predict_proba_logText = log_clf_Text.predict_proba(x_train_Text)[:, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_train_Text['state'], y_train_predict_proba_logText, n_bins=10)

fig, ax = plt.subplots()
ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='model performance')
ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfectly Caliberated')  # diagonal reference
ax.set_xlabel("mean predicted value")
ax.set_ylabel("fraction of positives")
ax.legend()
plt.show()

In [None]:
# Record the text logistic model's training probabilities as a new column.
df_prob = df_prob.assign(log_prob_Text=y_train_predict_proba_logText)

In [None]:
# Tuned RBF support-vector classifier on the text feature set.
# probability=True is needed so predict_proba is available later.
from sklearn.svm import SVC

svc_params = {'C': 0.1, 'gamma': 1, 'kernel': 'rbf', 'probability': True}
support_clf = SVC(**svc_params)
support_clf.fit(x_train_Text, y_train_Text['state'])

In [None]:
# Calibration curve of the tuned SVC, computed on the training rows.
y_train_predict_proba_SVC = support_clf.predict_proba(x_train_Text)[:, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_train_Text['state'], y_train_predict_proba_SVC, n_bins=10)

fig, ax = plt.subplots()
ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='model performance')
ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfectly Caliberated')  # diagonal reference
ax.set_xlabel("mean predicted value")
ax.set_ylabel("fraction of positives")
ax.legend()
plt.show()

In [None]:
# Record the tuned SVC's training probabilities as a new column.
df_prob = df_prob.assign(SVC_Tuned_prob=y_train_predict_proba_SVC)

In [None]:
# Tuned XGBoost classifier on the no-text feature set.
from xgboost import XGBClassifier

# Tuned hyper-parameters, kept together so they are easy to compare and tweak.
tuned_xgb_params = {
    'colsample_bytree': 0.5,
    'gamma': 0.1,
    'learning_rate': 0.0001,
    'max_depth': 12,
    'reg_alpha': 1e-05,
    'reg_lambda': 1e-05,
}
xgb_clf = XGBClassifier(seed=0, **tuned_xgb_params)

xgb_clf.fit(x_train_noText, y_train_noText['state'])

In [None]:
# Calibration curve of the tuned XGBoost classifier, computed on the training rows.
y_train_predict_proba_XGB_Tuned = xgb_clf.predict_proba(x_train_noText)[:, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_train_noText['state'], y_train_predict_proba_XGB_Tuned, n_bins=10)

fig, ax = plt.subplots()
ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='model performance')
ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfectly Caliberated')  # diagonal reference
ax.set_xlabel("mean predicted value")
ax.set_ylabel("fraction of positives")
ax.legend()
plt.show()

In [None]:
# appending results into new dataFrame
# The tuned model's probabilities are stored in y_train_predict_proba_XGB_Tuned by
# the calibration cell; the name previously used here was never defined.
df_prob['xgboost_Tuned_prob'] = y_train_predict_proba_XGB_Tuned


In [None]:
# Default-parameter XGBoost classifier on the no-text feature set.
# Rebinding xgb_clf replaces the tuned model fitted earlier.
default_xgb_params = {'seed': 0}
xgb_clf = XGBClassifier(**default_xgb_params)
xgb_clf.fit(x_train_noText, y_train_noText['state'])

In [None]:
# Calibration curve of the default XGBoost classifier, computed on the training rows.
y_train_predict_proba = xgb_clf.predict_proba(x_train_noText)[:, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_train_noText['state'], y_train_predict_proba, n_bins=10)

fig, ax = plt.subplots()
ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='model performance')
ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfectly Caliberated')  # diagonal reference
ax.set_xlabel("mean predicted value")
ax.set_ylabel("fraction of positives")
ax.legend()
plt.show()

In [None]:
# Record the default XGBoost classifier's training probabilities as a new column.
df_prob = df_prob.assign(xgboost_default_prob_noText=y_train_predict_proba)

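We see that, of all the models, the default XGBoost classifier deviates the least from a perfectly calibrated model. We will use its predicted probabilities as our predictor. Note that these calibration curves are computed on the training data, so they show how well each model fits the rows it was trained on.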

In [None]:
# Preview the first five rows of the per-model probability table.
df_prob.head(n=5)

In [None]:
# df_prob has a fresh 0..n-1 index (it was built from prediction arrays), while
# y_train_noText still carries the shuffled row labels of the original frame.
# Assigning the Series directly would align on those labels and scramble or NaN
# the target, so the values are copied positionally instead.
df_prob['amount_raised_usd_boxcox'] = y_train_noText['amount_raised_usd_boxcox'].to_numpy()

In [None]:
from scipy.stats import kendalltau
from scipy.stats import spearmanr


def pearson_corr(x, y, data):
    """Print the Pearson correlation between columns ``x`` and ``y`` of ``data``.

    Only the two requested columns are correlated, so the other columns of
    ``data`` (which may be non-numeric) are never touched and no full
    correlation matrix is built.

    Parameters
    ----------
    x, y : str
        Column names in ``data``.
    data : pandas.DataFrame
        Frame holding both columns.
    """
    corr = data[x].corr(data[y])
    print('Pearson correlation: %.5f' % corr)


def kendall_rank_corr(x, y, data):
    """Print Kendall's tau between columns ``x`` and ``y`` of ``data``.

    The p-value returned by ``scipy.stats.kendalltau`` is discarded.
    """
    corr, _ = kendalltau(data[x], data[y])
    print('Kendall Rank correlation: %.5f' % corr)


def spearman_corr(x, y, data):
    """Print Spearman's rho between columns ``x`` and ``y`` of ``data``.

    The p-value returned by ``scipy.stats.spearmanr`` is discarded.
    """
    rho, _ = spearmanr(data[x], data[y])
    print('Spearman\'s Correlation: %.5f' % rho)

In [None]:
# The default-XGBoost probabilities are stored in df_prob under
# 'xgboost_default_prob_noText'; the shorter name used before is not a column of df_prob.
pearson_corr('xgboost_default_prob_noText','amount_raised_usd_boxcox',df_prob)
kendall_rank_corr('xgboost_default_prob_noText','amount_raised_usd_boxcox',df_prob)
spearman_corr('xgboost_default_prob_noText','amount_raised_usd_boxcox',df_prob)

In [None]:
# Preview the first five rows of the text-augmented training features.
x_train_Text.head(n=5)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Baseline regressor on the text feature set.
# y_train_Text / y_test_Text hold two columns (the amount and the class label
# 'state'); only the amount is the regression target, so it is selected
# explicitly for both fitting and scoring.
xgb_reg = XGBRegressor()
xgb_reg.fit(x_train_Text, y_train_Text['amount_raised_usd_boxcox'])
pred = xgb_reg.predict(x_test_Text)
MSE = mean_squared_error(y_test_Text['amount_raised_usd_boxcox'], pred)
MAE = mean_absolute_error(y_test_Text['amount_raised_usd_boxcox'], pred)
r2 = r2_score(y_test_Text['amount_raised_usd_boxcox'], pred)

In [None]:
# Report the metrics computed by the preceding regression cell.
for metric_label, metric_value in (("MSE =", MSE), ("MAE =", MAE), ("Explained Variance =", r2)):
    print(metric_label, metric_value)

In [None]:
# Baseline regressor on the no-text feature set.
# y_train_noText / y_test_noText hold two columns (the amount and the class
# label 'state'); only the amount is the regression target, so it is selected
# explicitly for both fitting and scoring.
xgb_reg = XGBRegressor()
xgb_reg.fit(x_train_noText, y_train_noText['amount_raised_usd_boxcox'])
pred = xgb_reg.predict(x_test_noText)
MSE = mean_squared_error(y_test_noText['amount_raised_usd_boxcox'], pred)
MAE = mean_absolute_error(y_test_noText['amount_raised_usd_boxcox'], pred)
r2 = r2_score(y_test_noText['amount_raised_usd_boxcox'], pred)

In [None]:
# Report the metrics computed by the preceding regression cell.
for metric_label, metric_value in (("MSE =", MSE), ("MAE =", MAE), ("Explained Variance =", r2)):
    print(metric_label, metric_value)

In [None]:
# Add the default-XGBoost success probability to the training features.
# After reset_index() the frame has a 0..n-1 index, matching df_prob, so the join
# lines the rows up positionally. df_prob stores the probabilities under
# 'xgboost_default_prob_noText'; the column is renamed on the way in so the
# feature keeps the name 'xgboost_default_prob' that the test-set cell also uses.
x_train_Text_reg = x_train_Text.reset_index()
x_train_Text_reg = x_train_Text_reg.join(
    df_prob['xgboost_default_prob_noText'].rename('xgboost_default_prob'))
x_train_Text_reg.tail()

In [None]:
# Remove the old row labels that reset_index() kept as an 'index' column.
x_train_Text_reg = x_train_Text_reg.drop(columns=['index'])

In [None]:
# Regression target for the training rows: renumber the rows, then drop the old
# row labels and the class label so only the amount column is left.
y_train_Text_reg = (
    y_train_Text
    .reset_index()
    .drop(columns=['index', 'state'])
)

In [None]:
# Build the test-set counterpart of the augmented training features.
x_test_Text_reg = x_test_Text.reset_index(drop=True)

# The training-side probability feature comes from the default XGBoost classifier
# fitted on the no-text features, so the test-side feature is produced the same
# way: same feature set, and 'state' alone as the label.
# The noText and Text splits share seed and stratification, so x_test_noText
# rows correspond to x_test_Text rows.
xgb_clf = XGBClassifier(seed=0)
xgb_clf.fit(x_train_noText, y_train_noText['state'])
# predict_proba is called on the fitted estimator itself; column 1 is the
# positive-class probability.
x_test_Text_reg['xgboost_default_prob'] = xgb_clf.predict_proba(x_test_noText)[:, 1]

# Regression target for the test rows: the amount column only.
y_test_Text_reg = y_test_Text.reset_index(drop=True).drop(columns=['state'])

In [None]:
# Regressor on the text features plus the predicted success probability.
# A fresh model is fitted on the augmented training matrix; the xgb_reg left over
# from the previous section was trained on a different feature set and cannot
# score x_test_Text_reg.
# NOTE(review): the training-side probabilities are in-sample predictions of the
# classifier, so this feature may look more informative on train than on test.
xgb_reg = XGBRegressor()
xgb_reg.fit(x_train_Text_reg, y_train_Text_reg['amount_raised_usd_boxcox'])

pred = xgb_reg.predict(x_test_Text_reg)
MSE = mean_squared_error(y_test_Text_reg['amount_raised_usd_boxcox'], pred)
MAE = mean_absolute_error(y_test_Text_reg['amount_raised_usd_boxcox'], pred)
r2 = r2_score(y_test_Text_reg['amount_raised_usd_boxcox'], pred)

print("MSE =", MSE)
print("MAE =",MAE)
print("Explained Variance =",r2)

In [None]:
# Preview the first five rows of the text-augmented test features.
x_test_Text.head(n=5)