In [34]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

In [2]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

## Functions

In [3]:
def create_modeling_data(this_df:[pd.DataFrame], target:[str]=None, scaler_used=StandardScaler(), test_set_size=0.3):
    if target not in this_df.columns:
        print(f"{target} not found in df provided.")
        return
    X = df.drop(target, axis = 1)
    y = df[target]
    X = X.select_dtypes(include=numerics)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size,random_state=42)
    scaler_used.fit(X_train)
    X_train_scaled=scaler_used.transform(X_train)
    X_test_scaled=scaler_used.transform(X_test)

    return (X_train_scaled, X_test_scaled, y_train, y_test)

In [52]:
def model_performance(data:[tuple] = None, model_lst:[list] = None):
    if not model_lst:
        print("No model list provided.")
        return
    if not data:
        print("No data provided.")
        return
    train_scores = []
    test_scores = []
    X_train, X_test, y_train, y_test = data
    for m in model_lst:
        model=m
        model.fit(X_train, y_train)
        print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
        train_scores.append(model.score(X_train, y_train))
        test_scores.append(model.score(X_test, y_test))
        
    return pd.DataFrame(list(zip(model_lst,train_scores,test_scores)), columns =['model', 'training_score', 'test_score'])

In [40]:
def rec_feat_select(this_df = None, n_feat_select:[int] = 8):
   

    lm = LinearRegression()

    selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
    selector.fit(X_train, y_train)

    kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
    kept_features = list(X_train.iloc[:,kept_features].columns)

    X_train = selector.transform(X_train)
    X_test  = selector.transform(X_test)

    X_train = pd.DataFrame(X_train, columns=kept_features)
    X_test  = pd.DataFrame(X_test, columns=kept_features)

    print("Final selected features: ")
    display(X_train)


In [41]:
# this_data = create_modeling_data(this_df=df, target="total_claim_amount")

<function __main__.model_performance(data: [<class 'tuple'>] = None, model_lst: [<class 'list'>] = None)>

In [25]:
df = pd.read_csv("../data/cleaned_cust_data.csv")

In [55]:
x = model_performance(data=create_modeling_data(this_df=df, target="total_claim_amount"),model_lst = [LinearRegression(),
                                                                                                  Lasso(alpha=1),
                                                                                                  Ridge(alpha=1),
                                                                                                  ElasticNet(alpha=1)])

LinearRegression: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Lasso: Train -> 0.5207980591212458, Test -> 0.5192311459022065
Ridge: Train -> 0.5208845731786435, Test -> 0.5189725104445987
ElasticNet: Train -> 0.4620478080311433, Test -> 0.4565177782373415


In [62]:
this_data = data=create_modeling_data(this_df=df, target="total_claim_amount")
results = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=0.1), Ridge(alpha=0.1), ElasticNet(alpha=0.1)])
for _ in range(0,20):
    x = model_performance(data=this_data,model_lst = [LinearRegression(),
                                                      Lasso(alpha=_/2), Ridge(alpha=_/2), ElasticNet(alpha=_/2)])
    results = pd.concat([results, x], axis=0)

LinearRegression: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Lasso: Train -> 0.5208835190885917, Test -> 0.5190064228853604
Ridge: Train -> 0.5208845835646956, Test -> 0.5189737365443109
ElasticNet: Train -> 0.5195169309945639, Test -> 0.5171060288167005
LinearRegression: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Lasso: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Ridge: Train -> 0.5208845836696385, Test -> 0.5189738717256576
ElasticNet: Train -> 0.5208845836696385, Test -> 0.5189738717256576
LinearRegression: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Lasso: Train -> 0.5208615913188281, Test -> 0.5191198631732168
Ridge: Train -> 0.5208845810464295, Test -> 0.518973193714635
ElasticNet: Train -> 0.49870499829737636, Test -> 0.4945463972393207
LinearRegression: Train -> 0.5208845836696385, Test -> 0.5189738717256576
Lasso: Train -> 0.5207980591212458, Test -> 0.5192311459022065
Ridge: Train -> 0.5208845731786435, Test -> 0.5189725104445

In [8]:
df = pd.read_csv("../data/cleaned_cust_data.csv")
# df.head()

In [9]:
# df.head()

In [69]:
results.model.sort_values()

TypeError: '<' not supported between instances of 'Lasso' and 'LinearRegression'

In [67]:
results[results.model == "LinearRegression()"]

Unnamed: 0,model,training_score,test_score


In [80]:
X_added_constant = sm.add_constant(X)
# X_added_constant

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [82]:
# model = sm.OLS(y,X_added_constant).fit()
# model.summary()

In [36]:
model=LinearRegression()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.5208845836696386, Test -> 0.5189738717256577


In [37]:
model=Lasso(alpha=0.1)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.5205035767216227, Test -> 0.5197714491401642


In [38]:
model=Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.5208845836696292, Test -> 0.5189738716178669


In [39]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.5208841472497661, Test -> 0.5189726190526929
