In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score, mean_absolute_error, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import RidgeClassifier

%matplotlib inline

  import pandas_profiling


In [34]:
# read data
df = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [37]:
ids = df_test['Customer Id']

- Customer ID dropped
- Number of windows dropped
- Garden imputed with V if settlement is urban and O otherwise
- Building dimension of urban and rural settlements is assumed to be diff. 
    - Impute mean val for urban and rural settlements for missing data
- Date of occupancy imputed with mean
- Geo Code col dropped

In [38]:
def preprocess(df, filename):
    # drop ID col 
    df = df.drop('Customer Id', axis=1)

    
    # drop number of windows
    df = df.drop('NumberOfWindows', axis=1)
    
    
    # impute garden based on settlement
    df.loc[df.Settlement == 'U', 'Garden'] = 'V'
    df.loc[df.Settlement != 'U', 'Garden'] = 'O'
    
    
    # impute building dimension
    rural_dim = df[df['Settlement'] == 'R']
    rural_dim_mean = rural_dim["Building Dimension"].mean()
    urban_dim = df[df['Settlement'] == 'U']
    urban_dim_mean = urban_dim["Building Dimension"].mean()
    for i in range(len(df.Settlement)):
        if (df.Settlement[i] == 'U') and pd.isnull(df['Building Dimension'][i]):
            df.at[i, 'Building Dimension'] = urban_dim_mean
        elif (df.Settlement[i] == 'R') and pd.isnull(df['Building Dimension'][i]):
            df.at[i, 'Building Dimension'] = rural_dim_mean
        
        
    #replace occupancy NaN values with mode date
    date_mode = df.Date_of_Occupancy.mode()[0]
    df.Date_of_Occupancy.fillna(date_mode, inplace=True)
    df['Date_of_Occupancy'] = df['Date_of_Occupancy'].astype(int)
    
    
    # Convert the numerical values to string labels for building type
    mapping = {1: 'type1', 2: 'type2', 3: 'type3', 4: 'type4'}
    df['Building_Type'] = df['Building_Type'].map(mapping)
    
    # drop geo code
    df = df.drop('Geo_Code', axis=1)
    
    # save df
    df.to_csv(filename, index=False)

In [39]:
preprocess(df, 'train_clean.csv')
preprocess(df_test, 'test_clean.csv')

In [40]:
df = pd.read_csv('train_clean.csv')
df_test = pd.read_csv('test_clean.csv')

In [41]:
cols_to_encode = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Building_Type']
# Perform one-hot encoding on cols
df_test = pd.get_dummies(df_test, columns=cols_to_encode, prefix=cols_to_encode, drop_first=True)


df_test['Occupancy_Period'] = df_test['YearOfObservation'] - df_test['Date_of_Occupancy']
df_test = df_test.drop(columns=['YearOfObservation', 'Date_of_Occupancy'], axis=1)


# instantiate the scaler
scaler = StandardScaler()

# define columns to scale
cols_to_scale = ['Insured_Period', 'Building Dimension', 'Occupancy_Period']

# fit and transform the training set
df_test[cols_to_scale] = scaler.fit_transform(df_test[cols_to_scale])
df_test

Unnamed: 0,Insured_Period,Residential,Building Dimension,Building_Painted_V,Building_Fenced_V,Garden_V,Settlement_U,Building_Type_type2,Building_Type_type3,Building_Type_type4,Occupancy_Period
0,0.352438,0,-0.608342,1,0,0,0,0,0,0,0.205468
1,0.339964,0,-0.608342,1,0,0,0,0,0,0,0.340073
2,-2.524508,0,-0.390181,1,1,1,1,0,0,0,0.205468
3,0.352438,0,-0.116367,1,0,0,0,0,0,0,-1.723872
4,0.352438,0,-0.116367,1,0,0,0,0,0,0,-1.634135
...,...,...,...,...,...,...,...,...,...,...,...
3064,0.352438,0,-0.044457,1,1,1,1,0,0,1,2.987306
3065,0.352438,0,-0.044457,1,1,1,1,1,0,0,0.699019
3066,0.352438,0,-0.044457,1,1,1,1,1,0,0,-1.320057
3067,0.352438,0,-0.044457,1,1,1,1,0,0,0,7.384404


In [42]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Claim', axis=1), df['Claim'], test_size=0.2, random_state=42) 

In [43]:
cols_to_encode = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Building_Type']

# Perform one-hot encoding on cols
X_train = pd.get_dummies(X_train, columns=cols_to_encode, prefix=cols_to_encode, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cols_to_encode, prefix=cols_to_encode, drop_first=True)


X_train['Occupancy_Period'] = X_train['YearOfObservation'] - X_train['Date_of_Occupancy']
X_test['Occupancy_Period'] = X_test['YearOfObservation'] - X_test['Date_of_Occupancy']
X_train = X_train.drop(columns=['YearOfObservation', 'Date_of_Occupancy'], axis=1)
X_test = X_test.drop(columns=['YearOfObservation', 'Date_of_Occupancy'], axis=1)


# instantiate the scaler
scaler = StandardScaler()

# define columns to scale
cols_to_scale = ['Insured_Period', 'Building Dimension', 'Occupancy_Period']

# fit and transform the training set
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# transform the test set using the fitted scaler from the training set
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

In [45]:
def analyze_results(final_model, X_test, y_pred):
    # Predict the probabilities of the test set
    y_prob = final_model.predict_proba(X_test)[:, 1]

    # Compute the confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Plot the confusion matrix
    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.colorbar()
    
    # Add the number of values in each cell of the confusion matrix
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, cm[i, j],
                     horizontalalignment='center',
                     verticalalignment='center')
            
    plt.title('Confusion matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.yticks([0, 1], ['Negative', 'Positive'])
    plt.show()

    # Compute the ROC curve and ROC AUC score
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.plot(fpr, tpr, lw=1, label='ROC (AUC = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], '--', color='gray', label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

In [63]:
def train_xgboost_model(X_train, y_train, X_test, output_file, df_test):
    # Define the XGBoost model
    xgb_model = XGBClassifier()

    # Define the parameter grid to search over
    params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.5]
    }

    # Define the grid search object
    grid_search = GridSearchCV(
        estimator=xgb_model, 
        param_grid=params, 
        scoring='roc_auc', 
        cv=5, 
#         refit='roc_auc', 
        n_jobs=-1
    )

    # Fit the grid search object to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Train an XGBoost model with the best parameters
    xgb_model_best = XGBClassifier(**best_params)
    xgb_model_best.fit(X_train, y_train)

    # Predict the test set
    y_pred = xgb_model_best.predict(X_test)
    submission_pred = xgb_model_best.predict(df_test) 
    results = pd.DataFrame(list(zip(ids, submission_pred)), columns=['Customer Id', 'Claim'])
    results.to_csv(output_file, index=False)
    
    # Print the best parameters and best score
    print('Best parameters: ', best_params)
    print('Best score: ', best_score)
    
    # Print the classification report
    print(classification_report(y_test, y_pred))

    # Print the roc_score report   
    print(f'ROC_AUC_SCORE: {roc_auc_score(y_test, y_pred)}')
    
#     analyze_results(xgb_model_best, X_test, y_pred)

In [49]:
# Instantiate SMOTE
sm = SMOTE(random_state=42)

# Fit and transform the data
X_train_sm_resampled, y_train_sm_resampled = sm.fit_resample(X_train, y_train)

In [53]:
train_xgboost_model(X_train_sm_resampled, y_train_sm_resampled, X_test, 'results.txt')

Best parameters:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best score:  0.8794892449414189
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1098
           1       0.46      0.37      0.41       334

    accuracy                           0.75      1432
   macro avg       0.64      0.62      0.63      1432
weighted avg       0.74      0.75      0.74      1432

ROC_AUC_SCORE: 0.619013339441336
Submission Results: 
 [1 0 0 ... 1 1 1]


In [55]:
# perform LassoCV to find the optimal alpha
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

# extract the coefficients of the non-zero features
coef = pd.Series(lasso.coef_, index=X_train.columns)
selected_features = coef[coef != 0].index.tolist()

In [56]:
X_train_lasso, X_test_lasso = X_train[selected_features], X_test[selected_features]

In [65]:
train_xgboost_model(X_train_lasso, y_train, X_test_lasso, 'lasso_results.txt', df_test[selected_features])

Best parameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best score:  0.7170229005349524
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      1098
           1       0.62      0.17      0.27       334

    accuracy                           0.78      1432
   macro avg       0.71      0.57      0.57      1432
weighted avg       0.75      0.78      0.73      1432

ROC_AUC_SCORE: 0.5704329046824383
