In [21]:
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

%matplotlib inline

In [22]:
# Load the raw training and test sets; `df` holds the labelled training rows
# and `df2` the test rows.
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data_files/train_data.csv', '../data_files/test_data.csv')
)

In [23]:
# Keep the test-set customer identifiers aside (preprocess() drops the
# 'Customer Id' column), then preview the first rows of the raw training frame.
ids = df2.loc[:, 'Customer Id']
df.head(5)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [24]:
def replace_values(val):
    """Bucket a raw 'NumberOfWindows' entry into a text category.

    Parameters
    ----------
    val : str, int, float or None
        Raw cell value. Strings may carry surrounding whitespace; the file
        encodes a missing count as a lone '.' (seen padded as '   .').

    Returns
    -------
    str
        'greater than 10' for the '>=10' marker,
        'no data' for the '.' marker or a null value,
        'less than 5' for counts below 5,
        'greater than 5' for every other count (this includes exactly 5).

    Raises
    ------
    ValueError
        If the value is neither a known marker nor convertible to an integer.
    """
    if isinstance(val, str):
        # Compare on the stripped token so the markers are recognised no
        # matter how much padding the CSV carries around them.
        val = val.strip()
        if val == '>=10':
            return 'greater than 10'
        if val == '.':
            return 'no data'
    elif pd.isnull(val):
        # NaN / None cannot go through int(); treat them as missing.
        return 'no data'

    if int(val) < 5:
        return 'less than 5'
    return 'greater than 5'

In [25]:
def preprocess(df, filename):
    """Clean a raw data frame and write the result to ``filename`` as CSV.

    Steps, in order:
      1. Drop the 'Customer Id' column.
      2. Bucket 'NumberOfWindows' into text labels via ``replace_values``.
      3. Fill missing 'Garden' values from 'Settlement'
         ('U' -> 'V', anything else -> 'O'); observed values are kept.
      4. Fill missing 'Building Dimension' values with the mean of the row's
         settlement group ('U' or 'R').
      5. Fill missing 'Date_of_Occupancy' values with the column mode and
         cast the column to int.
      6. Map 'Building_Type' codes 1-4 to 'type1'..'type4'.
      7. Drop 'Geo_Code'.
      8. Add 'Occupancy_Period' = 'YearOfObservation' - 'Date_of_Occupancy'
         and drop both source columns.

    All imputation statistics (group means, mode) are computed from the frame
    that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above. The caller's object is
        not modified; the function works on the copy returned by ``drop``.
    filename : str or path-like
        Destination CSV path; the frame is written without its index.

    Returns
    -------
    None
    """
    # drop ID col (drop returns a new frame, so later writes stay local)
    df = df.drop(columns='Customer Id')

    # encode number of windows
    df['NumberOfWindows'] = df['NumberOfWindows'].apply(replace_values)

    # impute garden based on settlement -- only where Garden is missing, so
    # observed values are not overwritten
    garden_missing = df['Garden'].isnull()
    is_urban = df['Settlement'] == 'U'
    df.loc[garden_missing & is_urban, 'Garden'] = 'V'
    df.loc[garden_missing & ~is_urban, 'Garden'] = 'O'

    # impute building dimension with the mean of the matching settlement
    # group; boolean masks keep this independent of the frame's index labels
    dim_missing = df['Building Dimension'].isnull()
    for settlement in ('U', 'R'):
        in_group = df['Settlement'] == settlement
        group_mean = df.loc[in_group, 'Building Dimension'].mean()
        df.loc[in_group & dim_missing, 'Building Dimension'] = group_mean

    # replace occupancy NaN values with mode date; assign the result back
    # instead of relying on a chained in-place fillna
    date_mode = df['Date_of_Occupancy'].mode()[0]
    df['Date_of_Occupancy'] = df['Date_of_Occupancy'].fillna(date_mode).astype(int)

    # Convert the numerical values to string labels for building type
    mapping = {1: 'type1', 2: 'type2', 3: 'type3', 4: 'type4'}
    df['Building_Type'] = df['Building_Type'].map(mapping)

    # drop geo code
    df = df.drop(columns='Geo_Code')

    # create occupancy period
    df['Occupancy_Period'] = df['YearOfObservation'] - df['Date_of_Occupancy']
    df = df.drop(columns=['YearOfObservation', 'Date_of_Occupancy'])

    # save df
    df.to_csv(filename, index=False)

In [26]:
# Run both raw frames through preprocess(), which writes the cleaned CSVs,
# then read those files back in place of the raw frames.
# NOTE: this rebinds df/df2 to frames without 'Customer Id', so re-running
# this cell alone fails inside preprocess(); re-run the load cell first.
train_clean_path = '../data_files/train_clean2.csv'
test_clean_path = '../data_files/test_clean2.csv'

preprocess(df, train_clean_path)
preprocess(df2, test_clean_path)

df, df2 = pd.read_csv(train_clean_path), pd.read_csv(test_clean_path)

In [27]:
# Hold out 20% of the cleaned training rows for validation; the fixed seed
# keeps the split repeatable. 'Claim' is the target, every other column a feature.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns='Claim'),
    df.loc[:, 'Claim'],
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Occupancy_Period
5266,0.997268,1,N,V,V,U,850.000000,type4,no data,56
5441,1.000000,0,N,V,V,U,2135.000000,type2,no data,55
7025,1.000000,0,V,N,O,R,2700.000000,type3,greater than 5,43
7130,1.000000,1,V,V,V,U,1575.211611,type1,no data,314
4454,1.000000,0,V,N,O,R,1080.000000,type2,greater than 5,53
...,...,...,...,...,...,...,...,...,...,...
3772,1.000000,0,V,N,O,R,750.000000,type2,greater than 5,25
5191,1.000000,1,V,N,O,R,2670.000000,type4,greater than 5,63
5226,1.000000,1,V,N,O,R,2200.000000,type4,greater than 5,110
5390,1.000000,1,N,V,V,U,634.000000,type2,no data,52


In [28]:
cols_to_encode = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Building_Type', 'NumberOfWindows']

# Create a OneHotEncoder that returns dense arrays and drops the first level
# of every feature. The dense-output keyword was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases, so try the new spelling
# first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')

# Fit the encoder on the train data only, then transform train, validation and test data
X_train_encoded = ohe.fit_transform(X_train[cols_to_encode])
X_val_encoded = ohe.transform(X_val[cols_to_encode])
df2_encoded = ohe.transform(df2[cols_to_encode])

# Create new column names for the one-hot encoded columns
# (`get_feature_names` was replaced by `get_feature_names_out`)
if hasattr(ohe, 'get_feature_names_out'):
    new_col_names = ohe.get_feature_names_out(cols_to_encode)
else:
    new_col_names = ohe.get_feature_names(cols_to_encode)


def _swap_in_encoded(frame, encoded):
    """Return a new frame with `cols_to_encode` removed and the one-hot columns appended."""
    encoded_frame = pd.DataFrame(encoded, columns=new_col_names, index=frame.index)
    return pd.concat([frame.drop(columns=cols_to_encode), encoded_frame], axis=1)


# Replace the original columns with the one-hot encoded columns. Building new
# frames (instead of assigning into the split results and dropping in place)
# avoids the SettingWithCopyWarning that writing into train_test_split output
# can raise.
X_train = _swap_in_encoded(X_train, X_train_encoded)
X_val = _swap_in_encoded(X_val, X_val_encoded)
df2 = _swap_in_encoded(df2, df2_encoded)



In [29]:
# Standardise the continuous features. The scaler learns its statistics from
# the training split only; that same fitted scaler is then applied to the
# training, validation and test frames.
cols_to_scale = ['Building Dimension', 'Occupancy_Period']
scaler = StandardScaler().fit(X_train[cols_to_scale])

X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_val[cols_to_scale] = scaler.transform(X_val[cols_to_scale])
df2[cols_to_scale] = scaler.transform(df2[cols_to_scale])

X_train

Unnamed: 0,Insured_Period,Residential,Building Dimension,Occupancy_Period,Building_Painted_V,Building_Fenced_V,Garden_V,Settlement_U,Building_Type_type2,Building_Type_type3,Building_Type_type4,NumberOfWindows_greater than 5,NumberOfWindows_less than 5,NumberOfWindows_no data
5266,0.997268,1,-0.452685,0.182678,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
5441,1.000000,0,0.115754,0.154069,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
7025,1.000000,0,0.365690,-0.189241,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7130,1.000000,1,-0.131877,7.563840,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4454,1.000000,0,-0.350941,0.096851,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1.000000,0,-0.496922,-0.704206,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5191,1.000000,1,0.352419,0.382942,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5226,1.000000,1,0.144508,1.727572,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5390,1.000000,1,-0.548236,0.068241,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Define the MLP model and hyperparameters to tune.
# A fixed random_state makes the network's random initialisation repeatable,
# so the search result can be reproduced after a kernel restart.
mlp = MLPClassifier(random_state=42)
param_grid = {
    'hidden_layer_sizes': [(32,), (64,), (128,)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'batch_size': [16, 32, 64]
}

# Perform grid search (3-fold CV, accuracy, all cores). With the default
# refit=True the best configuration is retrained on all of X_train and
# exposed as `best_estimator_`.
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:")
print(grid_search.best_params_)

# Evaluate the model with the best hyperparameters. Reuse the estimator the
# search already refitted instead of training a second, separately
# initialised network with the same settings.
best_params = grid_search.best_params_
mlp = grid_search.best_estimator_
y_pred = mlp.predict(X_val)
print("Validation accuracy:", accuracy_score(y_val, y_pred))

In [None]:
# Persist the fitted classifier to a file in the current working directory.
joblib.dump(value=mlp, filename="mlp_model.pkl")

In [None]:
# Load the model and make predictions on new data
new_data = pd.read_csv("new_data.csv") # Replace with your new data file
new_X = new_data.drop(columns=['target']) # Replace 'target' with your target column name

# Load from the same path the save cell writes to ("mlp_model.pkl"); the
# previous "../models/..." path pointed at a file this notebook never creates.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code -- only load model files you created or trust.
loaded_model = joblib.load("mlp_model.pkl")

# NOTE: the model was fitted on the one-hot encoded, scaled feature frame, so
# new_X must have the same columns in the same order (apply preprocess(), the
# fitted `ohe` and the fitted `scaler` to raw data before predicting).
new_y_pred = loaded_model.predict(new_X)