In [34]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

In [2]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

## Functions

In [3]:
def create_modeling_data(this_df:[pd.DataFrame], target:[str]=None, scaler_used=StandardScaler(), test_set_size=0.3):
    if target not in this_df.columns:
        print(f"{target} not found in df provided.")
        return
    X = df.drop(target, axis = 1)
    y = df[target]
    X = X.select_dtypes(include=numerics)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size,random_state=42)
    scaler_used.fit(X_train)
    X_train_scaled=scaler_used.transform(X_train)
    X_test_scaled=scaler_used.transform(X_test)

    return (X_train_scaled, X_test_scaled, y_train, y_test)

In [120]:
def model_performance(data:[tuple] = None, model_lst:[list] = None):
    if not model_lst:
        print("No model list provided.")
        return
    if not data:
        print("No data provided.")
        return
    train_scores = []
    test_scores = []
    model_names = []
    alpha_val =[] 
    X_train, X_test, y_train, y_test = data
    for m in model_lst:
        model=m
        model.fit(X_train, y_train)
        # print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
        model_names.append(str(m).split("(")[0])
        # if str(m).split("(")[1][0] == ")":
        if str(m) == "LinearRegression()":
            alpha_val.append("---")
        else:
            if str(m).split("(")[0][0] == ")":
                print("1")
                alpha_val.append(1)
            else:
                alpha_val.append(str(m).split("(")[1].split(")")[0].split("=")[1].split(")")[0])
        train_scores.append(model.score(X_train, y_train))
        test_scores.append(model.score(X_test, y_test))
        
    return pd.DataFrame(list(zip(model_names,alpha_val,train_scores,test_scores)), columns =['model', 'alpha', 'training_score', 'test_score'])

In [81]:
def rec_feat_select(this_df = None, n_feat_select:[int] = 8):
   

    lm = LinearRegression()

    selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
    selector.fit(X_train, y_train)

    kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
    kept_features = list(X_train.iloc[:,kept_features].columns)

    X_train = selector.transform(X_train)
    X_test  = selector.transform(X_test)

    X_train = pd.DataFrame(X_train, columns=kept_features)
    X_test  = pd.DataFrame(X_test, columns=kept_features)

    print("Final selected features: ")
    display(X_train)

    #---
    selector = RFE(LinearRegression(), step=1)
    selector = selector.fit(X, y)

    X_rfe = X[X.columns[selector.support_]]
    X_rfe.shape

In [41]:
# this_data = create_modeling_data(this_df=df, target="total_claim_amount")

<function __main__.model_performance(data: [<class 'tuple'>] = None, model_lst: [<class 'list'>] = None)>

In [25]:
df = pd.read_csv("../data/cleaned_cust_data.csv")

In [115]:
x = model_performance(data=create_modeling_data(this_df=df, target="total_claim_amount"),model_lst = [LinearRegression(),
                                                                                                  Lasso(alpha=1),
                                                                                                  Ridge(alpha=1),
                                                                                                  ElasticNet(alpha=1)])

In [121]:
this_data = data=create_modeling_data(this_df=df, target="total_claim_amount")
results = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=0.1), Ridge(alpha=0.1), ElasticNet(alpha=0.1)])
for _ in range(0,20):
    x = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=_/2), Ridge(alpha=_/2), ElasticNet(alpha=_/2)])
    results = pd.concat([results, x], axis=0)

IndexError: list index out of range

In [8]:
df = pd.read_csv("../data/cleaned_cust_data.csv")
# df.head()

In [9]:
# df.head()

In [None]:
def rfe1():
    X = data.drop('total_claim_amount', axis=1)
    y = np.log(data.total_claim_amount)

    X = X.select_dtypes(include=np.number)
    X_added_constant = sm.add_constant(X)
    ols_model = sm.OLS(y,X_added_constant).fit()

    ## drop features with pval < 0.05
    pv = ols_model.pvalues
    X_pfiltered = X_added_constant[pv[pv < 0.05].index].drop('const', axis=1).reset_index(drop=True)
    X_pfiltered.shape



In [117]:
results # [results.model == "LinearRegression()"]

Unnamed: 0,model,alpha,training_score,test_score
0,LinearRegression,---,0.520885,0.518974
1,Lasso,alpha=0.1),0.520884,0.519006
2,Ridge,alpha=0.1),0.520885,0.518974
3,ElasticNet,alpha=0.1),0.519517,0.517106
0,LinearRegression,---,0.520885,0.518974
...,...,...,...,...
3,ElasticNet,alpha=9.0),0.178118,0.173399
0,LinearRegression,---,0.520885,0.518974
1,Lasso,alpha=9.5),0.518222,0.516409
2,Ridge,alpha=9.5),0.520884,0.518960


In [129]:
model_lst = [LinearRegression(),
                                                                                                  Lasso(alpha=2),
                                                                                                  Ridge(alpha=2),
                                                                                                  ElasticNet(alpha=2)]

In [125]:
for m in model_lst:
    print(str(m).split("(")[0], end="")
    if str(m) == "LinearRegression()":
        print("---")
    else:
        print(str(m).split("(")[1].split(")")[0].replace("alpha=",""))

LinearRegression---
Lasso1
Ridge1
ElasticNet1


In [130]:
for m in model_lst:
    if str(m) == "LinearRegression()":
            print("---")
    else:
        if str(m).split("(")[0][0] == ")":
            print("1")
            # alpha_val.append(1)
        else:
            print(str(m).split("(")[1].split(")")[0].split("=")[1].split(")")[0])

---
2
2
2


In [96]:
results.sort_values(["model","alpha"])

Unnamed: 0,model,alpha,training_score,test_score
3,ElasticNet,---,0.462048,0.456518
3,ElasticNet,0.0,0.520885,0.518974
3,ElasticNet,0.1,0.519517,0.517106
3,ElasticNet,0.5,0.498705,0.494546
3,ElasticNet,1.5,0.425750,0.419493
...,...,...,...,...
2,Ridge,7.5,0.520884,0.518963
2,Ridge,8.0,0.520884,0.518962
2,Ridge,8.5,0.520884,0.518962
2,Ridge,9.0,0.520884,0.518961


In [104]:
elast = results[results.model == "Ridge"]

In [105]:
elast

Unnamed: 0,model,alpha,training_score,test_score
2,Ridge,0.1,0.520885,0.518974
2,Ridge,0.0,0.520885,0.518974
2,Ridge,0.5,0.520885,0.518973
2,Ridge,---,0.520885,0.518973
2,Ridge,1.5,0.520885,0.518972
2,Ridge,2.0,0.520885,0.518971
2,Ridge,2.5,0.520885,0.51897
2,Ridge,3.0,0.520884,0.51897
2,Ridge,3.5,0.520884,0.518969
2,Ridge,4.0,0.520884,0.518968


In [80]:
X_added_constant = sm.add_constant(X)
# X_added_constant

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [82]:
# model = sm.OLS(y,X_added_constant).fit()
# model.summary()

In [36]:
model=LinearRegression()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.5208845836696386, Test -> 0.5189738717256577


In [37]:
model=Lasso(alpha=0.1)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.5205035767216227, Test -> 0.5197714491401642


In [38]:
model=Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.5208845836696292, Test -> 0.5189738716178669


In [39]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.5208841472497661, Test -> 0.5189726190526929
