In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE

In [None]:
# Location of the raw marketing customer analysis CSV.
DATA_URL = "https://raw.githubusercontent.com/TobyyR/lab-customer-analysis-round-2/master/files_for_lab/csv_files/marketing_customer_analysis.csv"

# Load the raw data and display it.
df = pd.read_csv(DATA_URL)
df

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [None]:
# Normalise the headers to snake_case: lower-case, spaces turned into underscores.
df.columns = [header.lower().replace(" ", "_") for header in df.columns]
df

In [None]:
# Remove the "unnamed:_0" column. `columns=` already selects the column axis,
# so no `axis` argument is passed (an `axis=0` here would read as "rows" and
# contradict the intent).
df = df.drop(columns=["unnamed:_0"])
df

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Numeric columns only, selected through the public API rather than the
# private `DataFrame._get_numeric_data()` helper.
# NOTE(review): `include=np.number` does not select boolean columns — confirm
# none are expected here.
df.select_dtypes(include=np.number)

In [None]:
# Names of the object-dtype columns.
list(df.select_dtypes(include="object").columns)

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Rows whose `state` value is missing.
state_is_missing = df["state"].isna()
dfs = df.loc[state_is_missing]
dfs

In [None]:
# Rows whose `months_since_last_claim` value is missing.
last_claim_is_missing = df["months_since_last_claim"].isna()
dfm = df.loc[last_claim_is_missing]
dfm

In [None]:
# Rows whose `number_of_open_complaints` value is missing.
complaints_is_missing = df["number_of_open_complaints"].isna()
dfn = df.loc[complaints_is_missing]
dfn

In [None]:
# Rows whose `vehicle_class` value is missing.
vehicle_class_is_missing = df["vehicle_class"].isna()
dfvc = df.loc[vehicle_class_is_missing]
dfvc

In [None]:
# Rows whose `vehicle_size` value is missing.
vehicle_size_is_missing = df["vehicle_size"].isna()
dfvs = df.loc[vehicle_size_is_missing]
dfvs

In [None]:
# Rows whose `vehicle_type` value is missing.
vehicle_type_is_missing = df["vehicle_type"].isna()
dfvt = df.loc[vehicle_type_is_missing]
dfvt

In [None]:
# Columns that must be populated for a row to be kept.
required_columns = [
    "vehicle_type",
    "vehicle_size",
    "vehicle_class",
    "number_of_open_complaints",
    "months_since_last_claim",
    "response",
    "state",
]

# Drop rows with a missing value in any required column. `.copy()` makes the
# result an independent frame: later cells add/overwrite columns on `df`, and
# on older pandas versions doing that on a row-filtered result can raise
# SettingWithCopyWarning.
df = df.dropna(subset=required_columns).copy()
df

In [None]:
# Parse `effective_to_date` into datetimes; values that cannot be parsed
# become NaT because of errors="coerce".
parsed_dates = pd.to_datetime(df["effective_to_date"], errors="coerce")
df = df.assign(effective_to_date=parsed_dates)
df

In [None]:
# Calendar month of each effective date. The column is read through `.dt`,
# i.e. it is already datetime, so it is not parsed a second time.
df['month'] = df['effective_to_date'].dt.month

# Rows whose effective date falls in January-March (inclusive on both ends).
# Rows with a missing date have a NaN month and are excluded.
filtered_data = df[df['month'].between(1, 3)]

filtered_data


In [None]:
# Number of rows per month, least frequent month first.
month_counts = df["month"].value_counts(ascending=True)
month_counts

In [None]:
# Re-print the dtype / non-null summary for the current state of `df`.
df.info()

In [None]:
# Descriptive statistics of `df`.
df.describe()

In [None]:
# Bar chart of how many rows fall under each `response` value.
response_counts = df['response'].value_counts()
ax = response_counts.plot.bar(color=['skyblue', 'orange'])
ax.set_title('total Number of Responses')
ax.set_xlabel('Response')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Row counts per sales channel, split by response.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='sales_channel', hue='response', ax=ax)
ax.set_title('Response Rate by Sales Channel')
ax.set_xlabel('Sales Channel')
ax.set_ylabel('Count')
ax.legend(title='Response')
plt.show()

In [None]:
# Share (in percent) of each response value within every sales channel.
# The result is a Series indexed by (sales_channel, response).
response_by_channel = df.groupby('sales_channel')['response'].value_counts(normalize=True) * 100

# Pivot the response level into columns so every channel gets one group of
# bars, each response keeps a single colour across channels, and a legend
# identifies them. Plotting the MultiIndex Series directly yields one flat run
# of bars whose colours are not tied to a response value.
fig, ax = plt.subplots(figsize=(8, 6))
response_by_channel.unstack(fill_value=0).plot(kind='bar', color=['blue', 'orange'], ax=ax)
ax.set_title('Response Rate by Sales Channel')
ax.set_xlabel('Sales Channel')
ax.set_ylabel('Percentage')
ax.legend(title='Response')
plt.show()

In [None]:
# Distribution of `total_claim_amount` for each response value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='response', y='total_claim_amount', ax=ax)
ax.set_title('Response Rate by Total Claim Amount')
ax.set_xlabel('Response')
ax.set_ylabel('Total Claim Amount')
plt.show()

In [None]:
# Distribution of `income` for each response value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='response', y='income', ax=ax)
ax.set_title('Response Rate by Income')
ax.set_xlabel('Response')
ax.set_ylabel('Income')
plt.show()

In [None]:
## Lab customer analysis round 4

In [None]:
# Split the column names by dtype: numeric first, then object (text).
numerical, categoricals = (
    df.select_dtypes(include=[kind]).columns.tolist()
    for kind in (np.number, object)
)

print("\nNumerical Columns:", numerical)
print("\nCategorical Columns:", categoricals)

In [None]:
# One histogram (with KDE overlay) per numeric column.
# `list(...)` yields the column names whether `numerical` is a list of names
# or a DataFrame (a later cell rebinds it to a DataFrame).
hist_columns = list(numerical)

# Size the grid from the number of columns instead of a fixed 3x3 layout,
# which raises as soon as there are more than nine numeric columns.
GRID_COLS = 3
grid_rows = max(1, int(np.ceil(len(hist_columns) / GRID_COLS)))
fig, axes = plt.subplots(
    grid_rows,
    GRID_COLS,
    figsize=(12, 8 * grid_rows / 3),  # (12, 8) for the three-row case
    squeeze=False,
)
panels = axes.ravel()

for ax, column in zip(panels, hist_columns):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(column)

# Hide grid cells that have no column to show.
for unused_ax in panels[len(hist_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# From here on `numerical` / `categoricals` are sub-frames of `df`
# (numeric columns / object columns).
numerical = df.select_dtypes(include=[np.number])
categoricals = df.select_dtypes(include=[object])

# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
correlation_matrix = numerical.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()

In [None]:
# Positional (row, column) pairs above the diagonal of `correlation_matrix`
# whose absolute correlation exceeds 0.9, listed in row-major order.
strong_upper = np.triu(correlation_matrix.abs().to_numpy() > 0.9, k=1)
row_positions, col_positions = np.nonzero(strong_upper)
high_corr_pairs = [(int(r), int(c)) for r, c in zip(row_positions, col_positions)]
high_corr_pairs

In [None]:
# For every highly correlated pair, drop the feature whose mean absolute
# correlation with the other features is lower (ties drop the second feature).
if high_corr_pairs:
    # The positions in `high_corr_pairs` index into `correlation_matrix`, so
    # names are resolved from its columns. `numerical.columns` shrinks as
    # features are removed and would map the same positions to other features.
    corr_columns = correlation_matrix.columns
    features_to_drop = []

    for row_pos, col_pos in high_corr_pairs:
        feature1 = corr_columns[row_pos]
        feature2 = corr_columns[col_pos]

        # A feature already marked for removal resolves this pair as well.
        if feature1 in features_to_drop or feature2 in features_to_drop:
            continue

        print(f"Features '{feature1}' and '{feature2}' have high correlation of {correlation_matrix.iloc[row_pos, col_pos]}")

        mean_corr_feature1 = correlation_matrix[feature1].drop(feature1).abs().mean()
        mean_corr_feature2 = correlation_matrix[feature2].drop(feature2).abs().mean()

        weaker_feature = feature1 if mean_corr_feature1 < mean_corr_feature2 else feature2
        features_to_drop.append(weaker_feature)
        print(f"Dropping '{weaker_feature}'")

    # Single, non-inplace drop; errors="ignore" keeps the cell re-runnable
    # after the columns have already been removed from `numerical`.
    numerical = numerical.drop(columns=features_to_drop, errors="ignore")

    print("\nUpdated Numerical DataFrame after dropping highly correlated features:")
    print(numerical.head())
else:
    print("No pair of features have high correlation (> 0.9), so no features are dropped.")

In [None]:
## Lab customer analysis round 5 + 6 + 7
# Display the current state of `df`.
df

In [35]:
# Linear regression on `total_claim_amount`: a baseline model followed by a
# comparison of alternative preprocessing / feature-selection variants.
# Every transformer is fitted on the training rows only, and every model is
# scored on the same feature matrices it was trained on.

TEST_SIZE = 0.2
RANDOM_STATE = 42


def _fit_on_train(transformer, train_part, test_part):
    """Fit `transformer` on the training rows only, then transform both parts.

    Returns a `(train_transformed, test_transformed)` tuple. Fitting on the
    training rows alone keeps test-set statistics out of the preprocessing.
    """
    return transformer.fit_transform(train_part), transformer.transform(test_part)


# X-y split
# NOTE(review): `customer` is assumed to be a per-row identifier; one-hot
# encoding it would add a dummy column for nearly every row — confirm.
# errors='ignore' keeps the cell working when no such column exists.
X = df.drop(columns=['total_claim_amount']).drop(columns=['customer'], errors='ignore')  # Features
y = df['total_claim_amount']  # Target

# Only numeric and object columns are modelled; columns of any other dtype
# are selected by neither filter and are therefore left out.
X_numerical = X.select_dtypes(include=np.number)

# One Hot Encoding for categorical features (cast to float so the matrix
# stacked with the scaled numeric block is purely numeric).
X_categorical_encoded = pd.get_dummies(X.select_dtypes(include=object), drop_first=True).astype(float)

# Train-test split, done once for all inputs so the rows of every feature set
# stay aligned with y_train / y_test.
(X_num_train, X_num_test,
 X_cat_train, X_cat_test,
 y_train, y_test) = train_test_split(
    X_numerical, X_categorical_encoded, y,
    test_size=TEST_SIZE, random_state=RANDOM_STATE,
)

# Normalize numerical features, then append the encoded categorical features
scaler = StandardScaler()
X_num_train_std, X_num_test_std = _fit_on_train(scaler, X_num_train, X_num_test)
X_train = np.hstack([X_num_train_std, X_cat_train.to_numpy()])
X_test = np.hstack([X_num_test_std, X_cat_test.to_numpy()])

# Apply linear regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred_train_lr = lr_model.predict(X_train)
y_pred_test_lr = lr_model.predict(X_test)

# Model Validation for Linear Regression
r2_train_lr = r2_score(y_train, y_pred_train_lr)
r2_test_lr = r2_score(y_test, y_pred_test_lr)
mse_train_lr = mean_squared_error(y_train, y_pred_train_lr)
mse_test_lr = mean_squared_error(y_test, y_pred_test_lr)
rmse_train_lr = np.sqrt(mse_train_lr)
rmse_test_lr = np.sqrt(mse_test_lr)
mae_train_lr = mean_absolute_error(y_train, y_pred_train_lr)
mae_test_lr = mean_absolute_error(y_test, y_pred_test_lr)

print("Linear Regression Model Validation Results:")
print("Train R2:", r2_train_lr)
print("Test R2:", r2_test_lr)
print("Train MSE:", mse_train_lr)
print("Test MSE:", mse_test_lr)
print("Train RMSE:", rmse_train_lr)
print("Test RMSE:", rmse_test_lr)
print("Train MAE:", mae_train_lr)
print("Test MAE:", mae_test_lr)

# Feature Scaling variants (numerical features only)
scaler_minmax = MinMaxScaler()
X_train_minmax, X_test_minmax = _fit_on_train(scaler_minmax, X_num_train, X_num_test)

scaler_robust = RobustScaler()
X_train_robust, X_test_robust = _fit_on_train(scaler_robust, X_num_train, X_num_test)

# Polynomial Features, built on the MinMax scaled numerical features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly, X_test_poly = _fit_on_train(poly, X_train_minmax, X_test_minmax)

# Feature Selection using RFE (top 10 features of the baseline matrix),
# with the selection made from the training rows only
lr_model_rfe = LinearRegression()
rfe = RFE(lr_model_rfe, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Each variant keeps its own (train, test) matrices so that a model is always
# scored on inputs with the feature layout it was fitted on.
feature_sets = {
    "Original": (X_train, X_test),
    "MinMax Scaling": (X_train_minmax, X_test_minmax),
    "Robust Scaling": (X_train_robust, X_test_robust),
    "Polynomial Features": (X_train_poly, X_test_poly),
    "Feature Selection (RFE)": (X_train_rfe, X_test_rfe),
}

# Train models and evaluate them; the baseline model fitted above is reused.
lr_models = {"Original": lr_model}
evaluation_results = {}
for name, (X_variant_train, X_variant_test) in feature_sets.items():
    if name not in lr_models:
        lr_models[name] = LinearRegression().fit(X_variant_train, y_train)
    model = lr_models[name]
    evaluation_results[name] = {
        "Train R^2": model.score(X_variant_train, y_train),
        "Test R^2": model.score(X_variant_test, y_test),
    }

evaluation_results