
# Linear Regression
- X-y split (y is the target variable, in this case, "total claim amount")
- Train-test split.
- Standardize the data (after the data split!) *on the training set*.
- Apply linear regression.
- Model Interpretation.
- make a dataframe to show real vs perdicted


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
file = 'https://raw.githubusercontent.com/Shacham-R/Shacham.R.Labs-Ironhack/main/Week4/Labs/marketing_customer_analysis_clean.csv'
df = pd.read_csv(file)
df = df.dropna()

In [3]:
num = df.select_dtypes('number')
num.drop(columns=['unnamed:_0'],inplace=True)
num.drop(axis=1,columns=['total_claim_amount'],inplace=True)

In [4]:
num.head(1)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month
0,4809.21696,48029,61,7.0,52,0.0,9,2


## Preproccessing  - categorical variables encoding

In [5]:
cat = df.select_dtypes(exclude='number')
cat.drop(columns=['customer','state','effective_to_date'],inplace=True,errors='Ignore')
cat_encoded = pd.get_dummies(cat,drop_first=True)
cat_encoded.head(1)

Unnamed: 0,response_Yes,coverage_Extended,coverage_Premium,education_College,education_Doctor,education_High School or Below,education_Master,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,False,False,False,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False


In [6]:
final_df = pd.concat([num, cat_encoded],axis=1)

### Xy split

In [7]:
#splitting the data

X = final_df #the numericals columns, apart from 'total_claim_amount'
y = df['total_claim_amount']

In [8]:
X.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month,response_Yes,coverage_Extended,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,4809.21696,48029,61,7.0,52,0.0,9,2,False,False,...,False,False,False,False,False,False,False,False,True,False
1,2228.525238,0,64,3.0,26,0.0,1,1,False,False,...,False,True,False,False,False,False,False,False,True,False
2,14947.9173,22139,100,34.0,31,0.0,2,2,False,False,...,False,True,False,False,False,True,False,False,True,False
3,22332.43946,49078,97,10.0,3,0.0,2,1,True,True,...,True,False,False,False,False,False,False,False,True,False
4,9025.067525,23675,117,15.149071,31,0.384256,7,1,False,False,...,True,False,False,False,False,False,False,False,True,False


# Creating the model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 100)

### Scaling

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit(X_train) #fitting for the training data
X_train = pd.DataFrame(scaler.transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))

In [12]:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
y_traina = pd.array(y_train).reshape(-1,1)
pt.fit(y_traina)
y_train_t = pt.transform(y_traina)
y_test_t = pt.transform(pd.array(y_test).reshape(-1,1))
y_test = y_test_t
y_train = y_train_t

## Modeling

In [13]:
lm = LinearRegression()

In [14]:
model = lm.fit(X_train, y_train)

# Real vs. Predicted dataframe

In [17]:
y_pred = np.array(model.predict(X_test))

In [18]:
#y_test.reset_index(drop=True)

In [19]:
real_vs_pred = pd.DataFrame({'y_test': y_test, 'y_pred':y_pred})

ValueError: Per-column arrays must each be 1-dimensional

In [None]:
real_vs_pred

In [None]:
real_vs_pred['residuals'] = real_vs_pred.apply(lambda row: row.y_test - row.y_pred, axis=1)
real_vs_pred.head()

In [20]:
from sklearn.metrics import mean_squared_error as mse, r2_score as r2

In [21]:
model_mse = mse(y_test, y_pred)
model_r2 = r2(y_test, y_pred)
model_rmse = np.sqrt(model_mse)

In [22]:
model_r2, model_mse, model_rmse

(0.8320905409975645, 0.1720729586509249, 0.4148167772052197)

### Feature importence

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
fi_df = pd.DataFrame({'var':X_train.columns, 'coef':model.coef_})

In [None]:
plt.bar(x=fi_df["var"],height=fi_df["coef"])
plt.xticks(rotation=45)

In [None]:
plt.scatter(x=y_test,y=y_pred)

In [None]:
#pt.inverse_transform(predicted) # to get the actual, non transformed value

# LAB | 27.11.23 Instructions
- Fit the models LinearRegression, Lasso and Ridge and compare the model performances.
- Define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code.
- Use feature selection techniques (P-Value, RFE) to select a subset of features to train the model with.
- (optional) Refit the models with the selected features.

In [37]:
from sklearn.linear_model import Lasso,Ridge,LinearRegression

def models_to_test(X_train, y_train,X_test, y_test):
    linearregression(X_train, y_train,X_test, y_test)
    ridge(X_train, y_train,X_test, y_test)
    lasso(X_train, y_train,X_test, y_test)
    
def model_metrics(model, y_test, y_pred):
    model_mse = mse(y_test, y_pred)
    model_r2 = r2(y_test, y_pred)
    model_rmse = np.sqrt(model_mse)
    return model_mse,model_r2,model_rmse


def linearregression(X_train, y_train,X_test, y_test):
    model=LinearRegression()
    
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
    y_pred = model.predict(X_test)
    metrics = model_metrics(model, y_test, y_pred)
    print(f"MSE:{metrics[0]}, R^2:{metrics[1]}, RMSE:{metrics[2]}")


def lasso(X_train, y_train,X_test, y_test):
    model=Lasso(alpha=1)
    
    model.fit(X_train, y_train,X_test, y_test)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
    y_pred = model.predict(X_test)
    metrics = model_metrics(model, y_test, y_pred)
    print(f"MSE:{metrics[0]}, R^2:{metrics[1]}, RMSE:{metrics[2]}")

def ridge(X_train, y_train,X_test, y_test):
    model=Ridge(alpha=1)
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")
    y_pred = model.predict(X_test)
    metrics = model_metrics(model, y_test, y_pred)
    print(f"MSE:{metrics[0]}, R^2:{metrics[1]}, RMSE:{metrics[2]}")


In [38]:
models_to_test(X_train, y_train,X_test, y_test)

LinearRegression: Train -> 0.832673673081055, Test -> 0.8320905409975645
MSE:0.1720729586509249, R^2:0.8320905409975645, RMSE:0.4148167772052197
Ridge: Train -> 0.8328920074018028, Test -> 0.8322565358841862
MSE:0.17190284773858183, R^2:0.8322565358841862, RMSE:0.4146116830705351


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()