In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
import pickle

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [136]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# function to train-test-split data and treat it

def split_and_treat_data(X, y, encode_cats=True, scale_nums=True, randomstate=None):
    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=randomstate)

    # transforming numericals
    if scale_nums:
        transformer = MinMaxScaler().fit(X_train.select_dtypes(np.number))

        cols_nums = X.select_dtypes(np.number).columns
        
        X_train_norm = pd.DataFrame(transformer.transform(X_train.select_dtypes(np.number)), columns=cols_nums)
        X_test_norm = pd.DataFrame(transformer.transform(X_test.select_dtypes(np.number)), columns=cols_nums)
    else:
        X_train_norm = X_train.select_dtypes(np.number)
        X_test_norm = X_test.select_dtypes(np.number)

    # encoding categoricals
    if encode_cats:
        encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(pd.DataFrame(X_train.select_dtypes(object)))

        encoded_train = encoder.transform(pd.DataFrame(X_train.select_dtypes(object))).toarray()
        encoded_test = encoder.transform(pd.DataFrame(X_test.select_dtypes(object))).toarray()

        cols_cats = encoder.get_feature_names_out(input_features=X_train.select_dtypes(object).columns)

        onehot_encoded_cats_train = pd.DataFrame(encoded_train, columns=cols_cats).astype(object)
        onehot_encoded_cats_test = pd.DataFrame(encoded_test, columns=cols_cats).astype(object)
    else:
        onehot_encoded_cats_train = X_train.select_dtypes(object)
        onehot_encoded_cats_test = X_test.select_dtypes(object)


    # concat cats + nums back together
    X_train_treated = pd.concat([X_train_norm, onehot_encoded_cats_train], axis=1)
    X_test_treated = pd.concat([X_test_norm, onehot_encoded_cats_test], axis=1)

    return X_train_treated.reset_index(drop=True), X_test_treated, y_train.reset_index(drop=True), y_test

In [137]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.neural_network import MLPRegressor

In [138]:
# read in data
nums = pd.read_csv('files_for_lab/var_nums.csv')
cats = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')

has_donated = targets['TARGET_B']
y = targets['TARGET_D']
X = pd.concat([cats, nums], axis=1)

In [139]:
# filter donors
X_donors = X[has_donated == 1]
y_donors = y[has_donated == 1]

In [140]:
# scale and encode data, concat back together (no testing here)
X_train, X_test, y_train, y_test = split_and_treat_data(X_donors, y_donors) 

In [141]:
# regr = DecisionTreeRegressor()

# grid = {'max_depth': [3,5,10,20,None],
#         'criterion': ['squared_error','absolute_error'],
#         'min_samples_split': [2,5,10,20],
#         'min_samples_leaf': [2,5,10,20]}

# grid_search = GridSearchCV(estimator=regr, param_grid=grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [142]:
# grid_search.best_params_

In [143]:
# scale and encode data, concat back together (no testing here)
xtr,xte,ytr,yte = split_and_treat_data(X,y) # we only take var_nums here without cats
X_treated = pd.concat([xtr,xte], axis=0)

# load classified data (predictions)
preds = pd.read_csv('files_for_lab/predictions.csv')

In [144]:
from sklearn.model_selection import cross_validate

def fit_predict(model, X_train, y_train, X_test, y_test):
    model_name = str(model).split('(')[0]
    model.fit(X_train, y_train)

    # add prediction to target data
    preds[model_name] = model.predict(X_treated)
    preds[model_name] = preds[model_name].apply(lambda x: round(x, 2))

    print('score', model_name, '>>>', model.score(X_test, y_test))

In [145]:
m = LinearRegression()
m.fit(X_train, y_train)
m.score(X_test, y_test)

0.42323709744694094

In [146]:
models = [
    DecisionTreeRegressor(criterion= 'squared_error', max_depth=3, min_samples_leaf=5, min_samples_split=2),
    LinearRegression(), KNeighborsRegressor(n_neighbors=3), SGDRegressor(), ElasticNetCV(), MLPRegressor(),
    ]


In [147]:
for regressor in models:
    fit_predict(regressor, X_train, y_train, X_test, y_test)

score DecisionTreeRegressor >>> 0.357270569564633
score LinearRegression >>> 0.42323709744694094
score KNeighborsRegressor >>> 0.11025279795429133
score SGDRegressor >>> 0.42536143647238445
score ElasticNetCV >>> 0.4300333204900022
score MLPRegressor >>> 0.3841126949777013


In [148]:
preds[preds['TARGET_B'] == 1]['MLPRegressor'].sum()

90938.69

In [150]:
preds[(preds['TARGET_B'] == 1)&(preds['PREDICT_B_2'] == 1)]

Unnamed: 0,TARGET_B,TARGET_D,PREDICT_B_2,DecisionTreeRegressor,LinearRegression,KNeighborsRegressor,SGDRegressor,ElasticNetCV,MLPRegressor
30,1,7.0,1,26.97,33.71,10.67,31.34,31.63,36.80
273,1,10.0,1,10.64,10.26,13.33,10.68,10.09,12.40
633,1,11.0,1,10.64,9.55,10.67,9.94,9.45,9.92
738,1,4.0,1,18.70,16.85,18.33,16.40,16.11,17.52
794,1,20.0,1,18.70,17.32,14.67,18.00,17.27,21.60
...,...,...,...,...,...,...,...,...,...
93896,1,25.0,1,18.70,18.26,18.67,18.19,18.03,16.35
93973,1,12.0,1,15.83,14.66,25.33,14.86,14.61,18.40
94683,1,26.0,1,15.83,17.57,19.33,17.35,17.45,15.20
94828,1,30.0,1,15.83,16.89,19.67,15.55,15.50,17.87


In [151]:
preds.to_csv('files_for_lab/predictions.csv', index=False)