In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LassoCV
import category_encoders as ce
from sklearn.metrics import mean_absolute_error, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
%matplotlib inline

  import pandas_profiling


# Phase 1: Data Cleaning + EDA

In [None]:
# Load the raw training data from the working directory and preview the first rows.
df = pd.read_csv('train_data.csv')
df.head()

In [None]:
# The customer identifier is dropped before analysis.
df = df.drop(columns='Customer Id')

# NumberOfWindows uses the literal placeholder '   .' (three spaces and a dot);
# turn that placeholder into a proper NaN.
df['NumberOfWindows'] = df['NumberOfWindows'].replace({'   .': np.nan})

In [None]:
# Rows in rural settlements ('R'), then the subset of those whose Garden flag is 'O'.
rural_settlement = df[df['Settlement'] == 'R']
rural_settlement[rural_settlement['Garden'] == 'O']

In [None]:
# Build a profiling report over the whole frame; the bare expression on the
# last line renders the report in the notebook output.
df_profile = pandas_profiling.ProfileReport(df)
df_profile

In [None]:
def plot_feature_vs_target(feature, target='Claim', data=None):
    """Plot, for each value of `feature`, how many rows fall in each `target` class.

    Draws a grouped (side-by-side, not stacked) bar chart with one group per
    distinct `feature` value and one bar per `target` class.

    Parameters
    ----------
    feature : str
        Column whose distinct values become the x-axis categories.
    target : str, default 'Claim'
        Column whose classes become the bar series.
    data : pandas.DataFrame, optional
        Frame to plot from. Defaults to the notebook-level `df`.

    Returns
    -------
    None
        The figure is shown via `plt.show()`.
    """
    frame = df if data is None else data

    # Count rows per (feature, target) pair and spread the target classes into
    # columns; combinations that never occur get a count of 0.
    counts = frame.groupby([feature, target]).size().unstack(fill_value=0)

    # Explicit Axes so the labels are attached to this chart rather than to
    # whatever figure pyplot currently considers active.
    fig, ax = plt.subplots()
    counts.plot(kind='bar', stacked=False, ax=ax)

    ax.set_title(f'Number of Claims by {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')

    plt.show()

In [None]:
plot_feature_vs_target('YearOfObservation')

In [None]:
plot_feature_vs_target('Settlement')

In [None]:
plot_feature_vs_target('Building_Type')

In [None]:
plot_feature_vs_target('NumberOfWindows')

In [None]:
df.info()

## Insights
1. All Urban settlements seem to have gardens
2. Only 1 instance of rural settlement has a garden
3. 99.6% of painted houses are in urban settlements
4. Every row where Settlement is Urban has NaN for NumberOfWindows
5. Target data is imbalanced

In [None]:
#investigate relation between year of observation and target
#investigate relation between date of occupancy and target

## NaN Values

### Number of windows

In [None]:
# Number of windows imputation
# highly correlated with Settlement & Garden
df.NumberOfWindows.unique()

In [None]:
df.loc[(df.Settlement == 'U') & (df.NumberOfWindows.isna())]

In [None]:
df = df.drop('NumberOfWindows', axis=1)

### Garden

In [None]:
print(df.Garden.unique())
df.loc[df.Garden.isnull()]

In [None]:
# Impute Garden only where it is missing, using the row's Settlement:
# urban ('U') rows get 'V', every other row gets 'O'.
# Recorded Garden values are left untouched.
garden_missing = df['Garden'].isna()
is_urban = df['Settlement'] == 'U'
df.loc[garden_missing & is_urban, 'Garden'] = 'V'
df.loc[garden_missing & ~is_urban, 'Garden'] = 'O'
df.head()

### Building Dimension

In [None]:
df[df['Building Dimension'].isna()]

In [None]:
df.groupby('Settlement')['Building Dimension'].mean().reset_index()

In [None]:
# Split the frame by settlement type, then take the mean Building Dimension of
# each subset (pandas skips NaN when averaging).
rural_dim = df.loc[df['Settlement'] == 'R']
urban_dim = df.loc[df['Settlement'] == 'U']
rural_dim_mean = rural_dim['Building Dimension'].mean()
urban_dim_mean = urban_dim['Building Dimension'].mean()

Replace missing `Building Dimension` values with `rural_dim_mean` when the row's Settlement is Rural ('R').  
Replace missing `Building Dimension` values with `urban_dim_mean` when the row's Settlement is Urban ('U').

In [None]:
# Fill missing Building Dimension values with the mean of the row's settlement
# type. Boolean masks replace the row-by-row loop, so this does not depend on
# the index being 0..n-1. Rows whose Settlement is neither 'U' nor 'R' are left as-is.
dimension_missing = df['Building Dimension'].isna()
df.loc[dimension_missing & (df['Settlement'] == 'U'), 'Building Dimension'] = urban_dim_mean
df.loc[dimension_missing & (df['Settlement'] == 'R'), 'Building Dimension'] = rural_dim_mean

### Date of Occupancy

In [None]:
# Replace missing occupancy years with the most frequent year, then store the
# column as integers. The result is assigned back explicitly: calling
# fillna(..., inplace=True) on `df.Date_of_Occupancy` is chained assignment,
# which newer pandas warns about and which does not update `df` under copy-on-write.
date_mode = df['Date_of_Occupancy'].mode()[0]
df['Date_of_Occupancy'] = df['Date_of_Occupancy'].fillna(date_mode).astype(int)

### Geo Code

In [None]:
df.groupby('Settlement')['Geo_Code'].apply(lambda x: x.mode()[0])

In [None]:
# Most frequent Geo_Code within each settlement type. Dedicated names are used
# so the Building Dimension means computed earlier are not overwritten.
rural_geo_mode = df.loc[df['Settlement'] == 'R', 'Geo_Code'].mode()[0]
urban_geo_mode = df.loc[df['Settlement'] == 'U', 'Geo_Code'].mode()[0]

# Fill missing Geo_Code values with the mode of the row's settlement type.
# Rows whose Settlement is neither 'U' nor 'R' are left as-is.
geo_missing = df['Geo_Code'].isna()
df.loc[geo_missing & (df['Settlement'] == 'U'), 'Geo_Code'] = urban_geo_mode
df.loc[geo_missing & (df['Settlement'] == 'R'), 'Geo_Code'] = rural_geo_mode

- Geo code has high cardinality  
- One possible way to resolve this would be to engineer the location/state that each code refers to into a separate column
- For now Geo_Code is being dropped


In [None]:
# drop Geo_Code
df = df.drop('Geo_Code', axis=1)

In [None]:
df_profile = pandas_profiling.ProfileReport(df)
df_profile

In [None]:
# save df to train_clean.csv
df.to_csv('train_clean.csv', index=False)

# Phase 2: Feature Extraction & Engineering

In [None]:
# Relabel the numeric building-type codes 1-4 as the strings 'type1'..'type4'
# so the column is treated as categorical. Codes outside 1-4 become NaN.
mapping = {code: f'type{code}' for code in range(1, 5)}
df['Building_Type'] = df['Building_Type'].map(mapping)
df.head()

### Train Test Split


In [None]:
# Hold out 20% of the rows for evaluation. The target is imbalanced, so the
# split is stratified on Claim to keep the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Claim'),
    df['Claim'],
    test_size=0.2,
    random_state=42,
    stratify=df['Claim'],
)
X_train

### Encode Categorical Cols

In [None]:
cols_to_encode = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Building_Type']

# Perform one-hot encoding on cols
X_train = pd.get_dummies(X_train, columns=cols_to_encode, prefix=cols_to_encode)
X_test = pd.get_dummies(X_test, columns=cols_to_encode, prefix=cols_to_encode)

# The two frames are encoded independently, so a category present in only one
# split would give them different columns. Align the test frame to the training
# columns: missing dummies are added as 0, test-only dummies are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_train


### Baseline Model

In [None]:
def train_model(X_train, y_train, X_eval=None, y_eval=None):
    """Tune a random forest with a grid search and report its hold-out performance.

    Runs a 5-fold `GridSearchCV` (scored by ROC AUC) over a fixed
    hyperparameter grid, prints the best parameters and score, then prints a
    classification report for the best model on the evaluation data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to the grid search.
    X_eval, y_eval : optional
        Features and labels used for the classification report. When omitted,
        the notebook-level `X_test` / `y_test` are used.

    Returns
    -------
    RandomForestClassifier
        The best estimator found by the search, fitted on all of `X_train`.
    """
    # Fall back to the notebook's hold-out split when no evaluation data is given.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    # Define the parameter grid to search over
    param_grid = {'n_estimators': [50, 100, 200],
                'max_depth': [3, 5, 10, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None]}

    # Create a RandomForestClassifier
    rfc = RandomForestClassifier(random_state=42)

    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='roc_auc')

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # GridSearchCV refits the best configuration on the full training data by
    # default, so the fitted model is reused instead of being trained again.
    final_model = grid_search.best_estimator_

    # Predict the labels of the evaluation set
    y_pred = final_model.predict(X_eval)

    # Print the classification report
    print(classification_report(y_eval, y_pred))

    return final_model
    

In [None]:
train_model(X_train, y_train)

### Create Building Occupancy Period (Years) Col

Occupancy period represents how long a building had been occupied, measured from its first recorded occupancy to the year in which the insurance policy was observed.  
The new column is computed as `YearOfObservation - Date_of_Occupancy`.  
The assumption made here is that each policy covers a unique building and that `YearOfObservation` is the first year in which that building was insured.

In [None]:
# Years between first occupancy and the observation year, for both splits.
X_train['Occupancy_Period'] = X_train['YearOfObservation'].sub(X_train['Date_of_Occupancy'])
X_test['Occupancy_Period'] = X_test['YearOfObservation'].sub(X_test['Date_of_Occupancy'])

# Both source columns are removed from each split once the difference is stored.
X_train = X_train.drop(columns=['YearOfObservation', 'Date_of_Occupancy'])
X_test = X_test.drop(columns=['YearOfObservation', 'Date_of_Occupancy'])
X_train

### Scale numerical features

In [None]:
# Continuous columns to standardise.
cols_to_scale = ['Insured_Period', 'Building Dimension', 'Occupancy_Period']

# Learn the mean and standard deviation from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])

# Apply the training-set statistics to both splits.
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
X_train

In [None]:
train_model(X_train, y_train)

### Fixing Data Imbalance

#### Smote

In [None]:
# Instantiate SMOTE with a fixed seed so the resampling is reproducible
sm = SMOTE(random_state=42)

# Resample the training split only; X_test / y_test are not touched here,
# so evaluation still uses the original hold-out rows.
X_train_sm_resampled, y_train_sm_resampled = sm.fit_resample(X_train, y_train)

In [None]:
train_model(X_train_sm_resampled, y_train_sm_resampled)

#### Class Weight Balancing

In [None]:
# Get unique class labels in target variable
classes = np.unique(y_train)

# Compute 'balanced' class weights from the training labels
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Map each class LABEL to its weight. Keying by position (enumerate) is only
# correct when the labels happen to be 0..k-1.
class_weights_dict = dict(zip(classes, class_weights))

# Define the parameter grid to search over
param_grid = {'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]}

# Create a RandomForestClassifier. class_weight is a constructor argument;
# fit() does not accept it, so passing it there raises a TypeError.
# Setting it here also means the grid search tunes under the same weighting.
rfc = RandomForestClassifier(random_state=42, class_weight=class_weights_dict)

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='roc_auc')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# GridSearchCV refits the best configuration on all of X_train by default, so
# best_estimator_ is already the class-weighted final model.
final_model = grid_search.best_estimator_

# Predict the labels of the test set
y_pred = final_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

### Utilizing LASSO for feature Extraction

In [None]:
# # perform LassoCV to find the optimal alpha
# lasso = LassoCV(cv=5, random_state=42)
# lasso.fit(X_train, y_train)

# # extract the coefficients of the non-zero features
# coef = pd.Series(lasso.coef_, index=X_train.columns)
# selected_features = coef[coef != 0].index.tolist()
# selected_features

In [None]:
# X_train, X_test = X_train[selected_features], X_test[selected_features]

#### Tree Based Method for Feature Extraction

In [None]:
# # Initialize the random forest classifier with default parameters
# rfc = RandomForestClassifier()

# # Fit the random forest classifier on the training data
# rfc.fit(X_train, y_train)

# # Get feature importances
# importances = rfc.feature_importances_

# # Sort feature importances in descending order
# indices = np.argsort(importances)[::-1]

# # Print the feature ranking
# print("Feature ranking:")

# for f in range(X_train.shape[1]):
#     print("%d. %s (%f)" % (f + 1, X_train.columns[indices[f]], importances[indices[f]]))