In [70]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [29]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

## Functions

In [30]:
def create_modeling_data(this_df:[pd.DataFrame], target:[str]=None, scaler_used=StandardScaler(), test_set_size=0.3):
    if target not in this_df.columns:
        print(f"{target} not found in df provided.")
        return
    X = df.drop(target, axis = 1)
    y = df[target]
    X = X.select_dtypes(include=numerics)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size,random_state=42)
    scaler_used.fit(X_train)
    X_train_scaled=scaler_used.transform(X_train)
    X_test_scaled=scaler_used.transform(X_test)

    return (X_train_scaled, X_test_scaled, y_train, y_test)

In [31]:
# def model_performance(data:[tuple] = None, model_lst:[list] = None):
#     if not model_lst:
#         print("No model list provided.")
#         return
#     if not data:
#         print("No data provided.")
#         return
#     train_scores = []
#     test_scores = []
#     model_names = []
#     alpha_val =[] 
#     X_train, X_test, y_train, y_test = data
#     for m in model_lst:
        
#         model=m
#         print(str(m))
#         model.fit(X_train, y_train)
#         # print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
#         model_names.append(str(m).split("(")[0])
#         # if str(m).split("(")[1][0] == ")":
#         if str(m) == "LinearRegression()":
#             alpha_val.append("---")
#         else:
#             if str(m).split("(")[0][0] == ")":
#                 alpha_val.append(1)
#             else:
#                 alpha_val.append(str(m).split("(")[1].split(")")[0].split("=")[1])
#         train_scores.append(model.score(X_train, y_train))
#         test_scores.append(model.score(X_test, y_test))
        
#     return pd.DataFrame(list(zip(model_names,alpha_val,train_scores,test_scores)), columns =['model', 'alpha', 'training_score', 'test_score'])

In [43]:
def split_column_names_by(this_df, include_dtypes:[list]=["int","float"]):
    # __author__ = "Roman Kaltschew"
    """takes in this_df, splits this_df's columns into two lists of column name by dtype of columns (default ["int","float"]) 
       and returns tuple (incl_lst, excl_lst)"""
    incl_lst = [_ for _ in this_df.select_dtypes(include=include_dtypes).dtypes.index]
    excl_lst = [_ for _ in this_df.select_dtypes(exclude=include_dtypes).dtypes.index]
    return (incl_lst, excl_lst)

In [32]:
def rec_feat_select(this_df = None, n_feat_select:[int] = 8):
   

    lm = LinearRegression()

    selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
    selector.fit(X_train, y_train)

    kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
    kept_features = list(X_train.iloc[:,kept_features].columns)

    X_train = selector.transform(X_train)
    X_test  = selector.transform(X_test)

    X_train = pd.DataFrame(X_train, columns=kept_features)
    X_test  = pd.DataFrame(X_test, columns=kept_features)

    print("Final selected features: ")
    display(X_train)

    #---
    selector = RFE(LinearRegression(), step=1)
    selector = selector.fit(X, y)

    X_rfe = X[X.columns[selector.support_]]
    X_rfe.shape

In [72]:
def nan_values_of_all_columns(this_df, return_dict:[bool] = False):
    """prints (defaults) or returns dictionary of all columns in this_df and their respective nan-values"""
    if not return_dict:
        for col in this_df.columns:
            print(col, this_df[col].isna().sum())
        return
    return {col:this_df[col].isna().sum() for col in this_df.columns}

In [68]:
def model_performance(y_train, y_pred_train, y_test, y_pred_test):

    ME_train = np.mean(np.exp(y_train)-np.exp(y_pred_train))
    ME_test  = np.mean(np.exp(y_test)-np.exp(y_pred_test))

    MAE_train = mean_absolute_error(np.exp(y_train),np.exp(y_pred_train))
    MAE_test  = mean_absolute_error(np.exp(y_test),np.exp(y_pred_test))

    MSE_train = mean_squared_error(np.exp(y_train),np.exp(y_pred_train))
    MSE_test  = mean_squared_error(np.exp(y_test),np.exp(y_pred_test))

    RMSE_train = np.sqrt(MSE_train)
    RMSE_test  = np.sqrt(MSE_test)

    MAPE_train = np.mean((np.abs(np.exp(y_train)-np.exp(y_pred_train)) / np.exp(y_train))* 100.)
    MAPE_test  = np.mean((np.abs(np.exp(y_test)-np.exp(y_pred_test)) / np.exp(y_test))* 100.)

    R2_train = r2_score(np.exp(y_train),np.exp(y_pred_train))
    R2_test  = r2_score(np.exp(y_test),np.exp(y_pred_test))

    performance = pd.DataFrame({'Error_metric': ['Mean error','Mean absolute error','Mean squared error',
                                             'Root mean squared error','Mean absolute percentual error',
                                             'R2'],
                            'Train': [ME_train, MAE_train, MSE_train, RMSE_train, MAPE_train, R2_train],
                            'Test' : [ME_test, MAE_test , MSE_test, RMSE_test, MAPE_test, R2_test]})

    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': np.exp(y_train), 'Predicted': np.exp(y_pred_train)})
    df_test  = pd.DataFrame({'Real': np.exp(y_test),  'Predicted': np.exp(y_pred_test)})

    return performance, df_train, df_test

In [33]:
# this_data = create_modeling_data(this_df=df, target="total_claim_amount")

In [34]:
df = pd.read_csv("../data/cleaned_cust_data.csv")

In [12]:
x = model_performance(data=create_modeling_data(this_df=df, target="total_claim_amount"),model_lst = [LinearRegression(),
                                                                                                  Lasso(alpha=1),
                                                                                                  Ridge(alpha=1),
                                                                                                  ElasticNet(alpha=1)])

In [35]:
this_data = data=create_modeling_data(this_df=df, target="total_claim_amount")
results = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=0.1), Ridge(alpha=0.1), ElasticNet(alpha=0.1)])
for _ in range(3,20):
    x = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=_/2), Ridge(alpha=_/2), ElasticNet(alpha=_/2)])
    results = pd.concat([results, x], axis=0)

LinearRegression()
Lasso(alpha=0.1)
Ridge(alpha=0.1)
ElasticNet(alpha=0.1)
3 1.5
LinearRegression()
Lasso(alpha=1.5)
Ridge(alpha=1.5)
ElasticNet(alpha=1.5)
4 2.0
LinearRegression()
Lasso(alpha=2.0)
Ridge(alpha=2.0)
ElasticNet(alpha=2.0)
5 2.5
LinearRegression()
Lasso(alpha=2.5)
Ridge(alpha=2.5)
ElasticNet(alpha=2.5)
6 3.0
LinearRegression()
Lasso(alpha=3.0)
Ridge(alpha=3.0)
ElasticNet(alpha=3.0)
7 3.5
LinearRegression()
Lasso(alpha=3.5)
Ridge(alpha=3.5)
ElasticNet(alpha=3.5)
8 4.0
LinearRegression()
Lasso(alpha=4.0)
Ridge(alpha=4.0)
ElasticNet(alpha=4.0)
9 4.5
LinearRegression()
Lasso(alpha=4.5)
Ridge(alpha=4.5)
ElasticNet(alpha=4.5)
10 5.0
LinearRegression()
Lasso(alpha=5.0)
Ridge(alpha=5.0)
ElasticNet(alpha=5.0)
11 5.5
LinearRegression()
Lasso(alpha=5.5)
Ridge(alpha=5.5)
ElasticNet(alpha=5.5)
12 6.0
LinearRegression()
Lasso(alpha=6.0)
Ridge(alpha=6.0)
ElasticNet(alpha=6.0)
13 6.5
LinearRegression()
Lasso(alpha=6.5)
Ridge(alpha=6.5)
ElasticNet(alpha=6.5)
14 7.0
LinearRegression()
Lass

In [36]:
df = pd.read_csv("../data/cleaned_cust_data.csv")
# df.head()

In [37]:
x

Unnamed: 0,model,alpha,training_score,test_score
0,LinearRegression,---,0.520885,0.518974
1,Lasso,9.5,0.518222,0.516409
2,Ridge,9.5,0.520884,0.51896
3,ElasticNet,9.5,0.171127,0.166552


In [None]:
def rfe1():
    X = data.drop('total_claim_amount', axis=1)
    y = np.log(data.total_claim_amount)

    X = X.select_dtypes(include=np.number)
    X_added_constant = sm.add_constant(X)
    ols_model = sm.OLS(y,X_added_constant).fit()

    ## drop features with pval < 0.05
    pv = ols_model.pvalues
    X_pfiltered = X_added_constant[pv[pv < 0.05].index].drop('const', axis=1).reset_index(drop=True)
    X_pfiltered.shape



In [38]:
results # [results.model == "LinearRegression()"]

Unnamed: 0,model,alpha,training_score,test_score
0,LinearRegression,---,0.520885,0.518974
1,Lasso,0.1,0.520884,0.519006
2,Ridge,0.1,0.520885,0.518974
3,ElasticNet,0.1,0.519517,0.517106
0,LinearRegression,---,0.520885,0.518974
...,...,...,...,...
3,ElasticNet,9.0,0.178118,0.173399
0,LinearRegression,---,0.520885,0.518974
1,Lasso,9.5,0.518222,0.516409
2,Ridge,9.5,0.520884,0.518960


## KNN

In [54]:
df = pd.read_csv("../data/cleaned_cust_data.csv")
# df.head()

In [55]:
df_num = df[split_column_names_by(df)[0]]
df_cat = df[split_column_names_by(df)[1]]

In [56]:
cat_cols=pd.get_dummies(df_cat, drop_first=True)

In [59]:
X=df_num.drop("total_claim_amount", axis=1)
y=df_num["total_claim_amount"]

In [60]:
# X = pd.concat([X,cat_cols],axis=1)

In [61]:
X

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month
0,4809,48029,61,7,52,0.0,9,2
1,2228,0,64,3,26,0.0,1,1
2,14947,22139,100,34,31,0.0,2,2
3,22332,49078,97,10,3,0.0,2,1
4,9025,23675,117,15,31,0.0,7,1
...,...,...,...,...,...,...,...,...
10905,15563,0,253,15,40,0.0,7,1
10906,5259,61146,65,7,68,0.0,6,1
10907,23893,39837,201,11,63,0.0,2,2
10908,11971,64195,158,0,27,4.0,6,2


In [62]:
scaler_used = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
scaler_used.fit(X_train)
X_train_scaled=scaler_used.transform(X_train)
X_test_scaled=scaler_used.transform(X_test)

# training datasets to DataFrame again to manipulate them
X_train = pd.DataFrame(X_train_scaled, columns= X.columns)
X_test = pd.DataFrame(X_test_scaled, columns= X.columns)

In [84]:
# np.unique(y_pred_test_knn, return_counts=True)   # check values of np.array
array_sum = np.sum(y_pred_test_knn)
array_has_nan = np.isnan(array_sum)
array_has_nan

False

In [80]:
neigh = KNeighborsRegressor(n_neighbors=10)

neigh.fit(X_train_scaled, y_train)

# make predictions
y_pred_train_knn = neigh.predict(X_train)
y_pred_test_knn  = neigh.predict(X_test)

performance_knn, _, _ = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn)
performance_knn

full = pd.DataFrame()

models = {'k': [] }

for k in range(2,21):

    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(X_train, y_train)

    models['k'] = [k, neigh]

    y_pred_train_knn = neigh.predict(X_train)
    y_pred_test_knn  = neigh.predict(X_test)

    performance_knn, _, _ = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn)
    temp = pd.DataFrame({'k': [k]*6, 'Error_metric': performance_knn['Error_metric'], 
                         'Train': performance_knn['Train'], 'Test': performance_knn['Test']})
    full = pd.concat([full,temp], axis=0)

full[full['Error_metric'] == 'R2']

full2 = full.melt(id_vars=['k','Error_metric'])
full2

full2["Error_metric"]

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#metrics = ['Mean error',]'Mean absolute error',...]

fig, ax = plt.subplots(2,3, figsize=(20,10))
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean error'], hue = 'variable', ax = ax[0,0])
ax[0,0].set_xticks(range(2,21))
ax[0,0].set_title("Mean error")
ax[0,0].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean absolute error'], hue = 'variable', ax = ax[0,1])
ax[0,1].set_xticks(range(2,21))
ax[0,1].set_title("Mean absolute error")
ax[0,1].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean squared error'], hue = 'variable', ax = ax[0,2])
ax[0,2].set_xticks(range(2,21))
ax[0,2].set_title("Mean squared error")
ax[0,2].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Root mean squared error'], hue = 'variable', ax = ax[1,0])
ax[1,0].set_xticks(range(2,21))
ax[1,0].set_title("Root mean squared error")
ax[1,0].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean absolute percentual error'], hue = 'variable', ax = ax[1,1])
ax[1,1].set_xticks(range(2,21))
ax[1,1].set_title("Mean absolute percentual error")
ax[1,1].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'R2'], hue = 'variable', ax = ax[1,2])
ax[1,2].set_xticks(range(2,21))
ax[1,2].set_title("R2")
ax[1,2].legend(loc='lower right')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
X = df.drop(columns=["total_claim_amount"], axis = 1)
y = df["total_claim_amount"]

In [24]:
yy = "Lasso()"

In [26]:
print(yy.split("(")[1][0])

)


In [80]:
X_added_constant = sm.add_constant(X)
# X_added_constant

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [82]:
# model = sm.OLS(y,X_added_constant).fit()
# model.summary()

In [36]:
model=LinearRegression()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.5208845836696386, Test -> 0.5189738717256577


In [37]:
model=Lasso(alpha=0.1)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.5205035767216227, Test -> 0.5197714491401642


In [38]:
model=Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.5208845836696292, Test -> 0.5189738716178669


In [39]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.5208841472497661, Test -> 0.5189726190526929
