# Lab | Customer Analysis Round 7

In [None]:
# To improve the linear regression model

Let's start by loading and cleaning the data. We'll create a function for data cleaning that includes steps from previous rounds.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the raw customer data from a CSV file; the path is relative, so it is
# resolved against the notebook's current working directory.
file_path = 'marketing_customer_analysis.csv'
df = pd.read_csv(file_path)

def clean_data(df):
    """Return a cleaned copy of the customer DataFrame.

    Steps, in order:
      1. Normalise column names to lower case with underscores instead of spaces.
      2. Drop the identifier/date columns ``customer`` and ``effective_to_date``.
      3. Coerce ``number_of_open_complaints`` to numeric (unparseable -> NaN).
      4. Cast ``vehicle_class`` to the pandas ``category`` dtype.
      5. Drop every row that still contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Column names may be in their original form
        (e.g. ``'Effective To Date'``) or already normalised.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is left untouched.

    Raises
    ------
    KeyError
        If, after name normalisation, any of the columns used above is absent.
    """
    # Work on a copy so renaming the columns never leaks back to the caller.
    cleaned = df.copy()

    # Normalise names BEFORE dropping: the drop below refers to the normalised
    # names, so doing it first makes raw headers such as 'Customer' work too.
    cleaned.columns = cleaned.columns.str.lower().str.replace(' ', '_')

    # Dropping irrelevant columns
    cleaned = cleaned.drop(columns=['customer', 'effective_to_date'])

    # Converting data types; values that cannot be parsed become NaN and are
    # removed together with the other missing values below.
    cleaned['number_of_open_complaints'] = pd.to_numeric(
        cleaned['number_of_open_complaints'], errors='coerce'
    )
    cleaned['vehicle_class'] = cleaned['vehicle_class'].astype('category')

    # Handling missing values
    return cleaned.dropna()

# Rebind df to the cleaned frame; the raw data is no longer reachable under this name.
df = clean_data(df)


We'll check for multicollinearity, apply transformations to numerical features, and prepare for encoding categorical features.

In [None]:
# Checking for multicollinearity
def calculate_vif(df, features):
    """Compute the variance inflation factor (VIF) of each listed feature.

    A constant ``intercept`` column is appended to the design matrix before the
    VIFs are computed and its own row is removed from the returned table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``features``.
    features : list of str
        Names of the numeric columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variable`` and ``VIF``, one row per entry of ``features``.
    """
    # .assign returns a new frame, so no column is ever written into a slice of
    # the caller's data (the original pattern triggered SettingWithCopyWarning).
    X = df[features].assign(intercept=1)  # intercept term for VIF calculation

    # Build the design matrix once instead of on every loop iteration.
    design = X.to_numpy(dtype=float)

    vif = pd.DataFrame({
        'variable': X.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })
    return vif.drop(index=X.columns.get_loc('intercept'))

# Numeric columns whose collinearity is inspected through their VIF
numerical_features = [
    'customer_lifetime_value',
    'income',
    'monthly_premium_auto',
    'months_since_last_claim',
    'months_since_policy_inception',
    'number_of_policies',
]

vif_df = calculate_vif(df, numerical_features)
print(vif_df)

# customer_lifetime_value is removed unconditionally: the VIF table printed
# above is for inspection only and does not drive this drop.
# (VIF > 10 is the usual rule of thumb for high multicollinearity.)
df = df.drop(columns=['customer_lifetime_value'])

# The remaining numeric columns are power-transformed to look more normally distributed.
# NOTE(review): the transformer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.
numerical_features = [
    column for column in numerical_features if column != 'customer_lifetime_value'
]
pt = PowerTransformer()
df[numerical_features] = pt.fit_transform(df[numerical_features])

# Dummy-encode every object/category column, dropping the first level of each
categorical_features = list(df.select_dtypes(include=['object', 'category']).columns)
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)


In [None]:
# Target vector and feature matrix
y = df['total_claim_amount']
X = df.drop(columns=['total_claim_amount'])

# Hold-out fractions to compare
splits = [0.3, 0.2, 0.1]

# One record (test_size, MSE, R2) per evaluated split
results = []

for test_size in splits:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    # Numeric columns: median imputation followed by standard scaling
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Only the numeric columns are transformed; every other column is passed through as-is
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numerical_features)],
        remainder='passthrough',
    )

    # Preprocessing and linear regression chained into one estimator
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])

    # Fit on the training part, then predict the held-out part
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions on the held-out data
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append(dict(test_size=test_size, MSE=mse, R2=r2))

results_df = pd.DataFrame(results)
print(results_df)


In [None]:
Summary
This code:

1. Loads and cleans the data.
2. Checks for multicollinearity using VIF and drops `customer_lifetime_value`.
3. Applies a power transformation to the numerical features.
4. One-hot encodes categorical features using `pd.get_dummies`.
5. Builds and evaluates linear regression models with different train-test splits.