# Lab | Customer Analysis Final Round

1. Problem (case study)


Data Description:
The dataset marketing_customer_analysis.csv contains various features related to customer demographics, insurance policy details, and customer interactions.

Goal:
The goal is to predict the Total Claim Amount using a linear regression model and validate the model using various metrics such as R², MSE, RMSE, and MAE.

In [4]:
# 2. Getting Data
# Read the .csv file into a DataFrame.

import pandas as pd

# Placeholder path: replace it with the actual location of the dataset before running.
file_path = 'path_to_your_file/marketing_customer_analysis.csv'
data = pd.read_csv(file_path)
 

In [None]:
# 3 - Cleaning/Wrangling/EDA
# Standardise the header names: spaces become underscores and every
# letter is lower-cased, giving snake_case column labels.

data.columns = data.columns.str.replace(' ', '_', regex=False).str.lower()


In [None]:
# Deal with NaN values:

# Fill numerical columns with the median.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on `data[col]` is a chained in-place update, which
# pandas warns about and which does not modify `data` under Copy-on-Write.
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col] = data[col].fillna(data[col].median())

# Fill categorical columns with the most frequent value.
for col in data.select_dtypes(include=['object']).columns:
    frequencies = data[col].value_counts()
    if frequencies.empty:
        # No non-null value exists, so there is no "most frequent" one to use.
        continue
    data[col] = data[col].fillna(frequencies.idxmax())



In [None]:
# Categorical Features and Numerical Features:
# Object-typed columns are treated as categorical, every other column as numerical.

categorical_features = list(data.select_dtypes(include='object').columns)
numerical_features = list(data.select_dtypes(exclude='object').columns)

# The target must not appear among the model's input features.
numerical_features.remove('total_claim_amount')


In [None]:
# Exploration:

# info() prints its report directly, so it can run first.
data.info()  # Data types and missing values

# describe() only returns a table; it has to be the last expression of the
# cell for the notebook to render it (previously its result was discarded).
data.describe()  # Summary statistics


In [None]:
# 4 - Processing Data
# Dealing with outliers:

# IQR rule: for each numerical feature keep only the rows whose value lies
# within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The filter is applied one column at
# a time, so the quartiles of later columns are computed on the rows that
# survived the earlier columns.
for col in numerical_features:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[data[col].between(lower_bound, upper_bound)]

# Detach the filtered rows from the frame they were sliced from, so that later
# column assignments on `data` do not raise pandas' SettingWithCopyWarning.
data = data.copy()


In [None]:
# Normalization:
# Standardise the numerical feature columns in place.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed further down, so test rows contribute to the
# scaling statistics.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])


In [None]:
# Encoding Categorical Data:

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical columns pass through unchanged; categorical columns are one-hot
# encoded with the first level of each feature dropped.
# handle_unknown='ignore' makes transform() encode a category that was not
# present when the encoder was fitted as all zeros instead of raising, so
# predicting on held-out rows cannot fail on an unseen label.
# (Combining it with drop= presumably needs scikit-learn >= 1.0 - confirm
# against the installed version.)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])


In [None]:
# Splitting into train set and test set:

from sklearn.model_selection import train_test_split

# Target column on one side, every remaining column on the other.
y = data['total_claim_amount']
X = data.drop('total_claim_amount', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# 5 - Modeling
# Chain the column preprocessor with a linear regression, fit the chain on the
# training split and predict the test split.

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# fit() returns the fitted pipeline itself, so predict() can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_test)



In [None]:
# 6 - Model Validation
# Score the test-set predictions against the true targets: R2, MSE, RMSE, MAE.

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed just above

print(
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R-squared (R²) score: {r2}",
    sep="\n",
)


7 - Reporting
Present results:

The results are summarised using the metrics calculated above:

1. Mean Squared Error (MSE): Measures the average of the squares of the errors.
2. Root Mean Squared Error (RMSE): The square root of the MSE, providing error in the same units as the target variable.
3. Mean Absolute Error (MAE): The average of the absolute differences between the predicted and actual values.
4. R-squared (R²): Indicates how well the model explains the variability of the target variable.

In [8]:
# Complete Script
# A self-contained end-to-end run in a single cell:
# sample data -> train/test split -> preprocessing -> model -> metrics.
# NOTE(review): unlike the cells above, this cell works on randomly generated
# sample data and fits a RandomForestRegressor. It also rebinds names used by
# the earlier cells (`data` becomes a dict; `X`, `y`, `preprocessor`, `model`,
# `y_pred`, ... are overwritten).

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Sample data: 1000 synthetic rows drawn from a seeded generator.
# The columns are drawn in this exact order so the random stream is reproducible.
np.random.seed(0)
data = {
    'Age': np.random.randint(20, 70, size=1000),
    'Gender': np.random.choice(['Male', 'Female'], size=1000),
    'Income': np.random.randint(20000, 80000, size=1000),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=1000),
    'Purchase_Amount': np.random.randint(100, 1000, size=1000)
}
df = pd.DataFrame(data)

# Target vs. features, followed by an 80/20 hold-out split.
y = df['Purchase_Amount']
X = df.drop(columns='Purchase_Amount')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Column groups handled by the two preprocessing branches.
numeric_features = ['Age', 'Income']
categorical_features = ['Gender', 'Region']

# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# tolerates categories unseen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a seeded random forest.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0)),
])

# Train on the training split and predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model validation on the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)



Mean Squared Error: 78784.0914895
R^2 Score: -0.1642047460942666
