In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import Markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

raw_data = pd.read_csv('customer.csv')
masked_df = pd.read_csv('customer_test_masked.csv')

In [2]:
label_features = ['state_of_res']
categorical_features = ['sex', 'marital_status', 'housing_type', 'gas_category', 'recent_move_b', 'is_employed']
numerical_features = ['income', 'num_vehicles', 'age', 'gas_usage', 'rooms', 'age_income']
target_feature = 'health_ins'
id_feature = 'custid'

# Define a preprocessor globally so it can be reused
preprocessor = ColumnTransformer(
    transformers=[
        ('label', OrdinalEncoder(), label_features),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'
)

def preprocess_dataframe(df, fit_preprocessor=True):
    global preprocessor  # Use the same preprocessor instance
    
    # Drop the target feature for feature processing
    df_features = df.drop(target_feature, axis=1, errors='ignore')
    
    if fit_preprocessor:
        # Fit and transform the preprocessor on training data
        features_processed = preprocessor.fit_transform(df_features)
    else:
        # Only transform using an already fitted preprocessor
        features_processed = preprocessor.transform(df_features)
    
    # Extract feature names from the preprocessor
    feature_names = (
        label_features +
        preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features).tolist() +
        numerical_features +
        [id_feature]
    )
    
    # Create a DataFrame for the processed features
    df_processed = pd.DataFrame(features_processed, columns=feature_names)
    
    # Ensure indices align
    df_processed = df_processed.reset_index(drop=True)
    
    # Add the target feature back if it exists in the original DataFrame
    if target_feature in df.columns:
        df_target = df[target_feature].reset_index(drop=True)
        df_processed[target_feature] = df_target
    
    return df_processed

In [3]:
def smote_data(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

In [4]:
def classify_sample_then_split(df, sample_function, params, classifier):
    # Preprocess data
    preprocessed_df = preprocess_dataframe(df)

    X = preprocessed_df.drop(target_feature, axis=1)  
    y = preprocessed_df[target_feature]
        
    # Sample data
    X_resampled, y_resampled = sample_function(X, y)
    
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
    
    # Train model with GridSearchCV
    grid_search = GridSearchCV(estimator=classifier, param_grid=params, cv=5)
    
    grid_search.fit(X_train, y_train)
    
    # Predict
    y_pred = grid_search.predict(X_test)
    
    # Evaluate
    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Best Parameters: {grid_search.best_params_}')
    print(classification_report(y_test, y_pred))

In [5]:
def handle_gas_feature(df):
    
    df_copy = df.copy()
    
    gas_median = df_copy[df_copy['gas_usage'] > 3].gas_usage.median()

    non_bill_cases_values = {
        1: gas_median,
        2: gas_median,
        3: 0
    }
    non_bill_cases = {
        1: 'Included in rent', 
        2: 'Included in electricity', 
        3: 'No charge'
    }

    # New feature for gas usage category
    df_copy['gas_category'] = df_copy['gas_usage'].replace(non_bill_cases).where(
        df_copy['gas_usage'].isin(non_bill_cases.keys()), 'Actual Bill'
    )
    df_copy.loc[pd.isna(df_copy['gas_usage']), 'gas_category'] = 'Unknown'

    # Replace non-bill cases' values or maintain if not included
    df_copy['gas_usage'] = df_copy['gas_usage'].replace(non_bill_cases_values)
    df_copy['gas_usage'] = df_copy['gas_usage'].fillna(gas_median)
    
    return df_copy

In [6]:
def apply_data_engineering(df):
    
    df_filtered = df.copy()
    
    df_filtered.drop(columns=['Unnamed: 0', 'code_column'], inplace=True)

    rows_with_missing = df_filtered.isnull().sum(axis=1) > 2
    df_filtered.drop(df_filtered[rows_with_missing].index, inplace=True)
    
    df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
    
    df_filtered['num_vehicles'] = df_filtered['num_vehicles'].fillna(round(df['num_vehicles'].mean(),0))
    
    df_filtered['recent_move_b'] = df_filtered['recent_move_b'].fillna(df_filtered['recent_move_b'].mode()[0])
    df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})
    
    df_filtered['age'] = df_filtered['age'].replace(0, df_filtered['age'].median())
    rows_with_age_120 = df_filtered['age'] == 120
    df_filtered = df_filtered.drop(df_filtered[rows_with_age_120].index)
    
    df_filtered = handle_gas_feature(df_filtered)
    
    df_filtered['age_income'] = df_filtered.age * df_filtered.income
    
    df_missing = df[rows_with_missing | rows_with_age_120]
    
    return df_filtered, df_missing

In [7]:
def apply_data_engineering_replace(df):
    
    df_replaced = df.copy()
    
    df_replaced.drop(columns=['Unnamed: 0', 'code_column'], inplace=True)

    df_replaced['housing_type'] = df_replaced['housing_type'].fillna(df_replaced['housing_type'].mode()[0])
    
    df_replaced['is_employed'] = df_replaced['is_employed'].fillna(False)
    
    df_replaced['num_vehicles'] = df_replaced['num_vehicles'].fillna(round(df['num_vehicles'].mean(),0))
    
    df_replaced['recent_move_b'] = df_replaced['recent_move_b'].fillna(df_replaced['recent_move_b'].mode()[0])
    df_replaced['recent_move_b'] = df_replaced['recent_move_b'].replace({'T': True, 'F': False})
    
    df_replaced['age'] = df_replaced['age'].replace(0, df_replaced['age'].median())
    df_replaced['age'] = df_replaced['age'].replace(120, df_replaced['age'].median())
    
    df_replaced = handle_gas_feature(df_replaced)
    
    df_replaced['age_income'] = df_replaced.age * df_replaced.income
    
    return df_replaced

# SUBMISSION

## XGBoost

In [168]:
df_filtered, df_missing = apply_data_engineering(raw_data)
sub_filtered, sub_missing = apply_data_engineering(masked_df)

  df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
  df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})
  df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
  df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})


In [169]:
def convert_feature_types(df):
    for col in df.select_dtypes(include=['object']).columns:
        if col in categorical_features or col in label_features:
            df[col] = df[col].astype('category')
        elif col in numerical_features:
            df[col] = df[col].astype('float')

In [170]:
xgb_train_df = pd.concat([df_filtered, df_missing], axis=0, ignore_index=True)

xgb_train_df.drop(columns=['custid'], inplace=True)

convert_feature_types(xgb_train_df)

In [171]:
#### THIS CODE IS FOR THE XGBOOST MODEL ####
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming df is your DataFrame and 'target' is the column you want to predict
X = xgb_train_df.drop(target_feature, axis=1)
y = xgb_train_df[target_feature]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

params = {
    'objective': 'binary:logistic',  # For binary classification
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'logloss'
}

num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

preds = bst.predict(dtest)
predictions = [1 if pred > 0.5 else 0 for pred in preds]

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9049130554788849


In [172]:
X_train = xgb_train_df.drop(columns=target_feature)
y_train = xgb_train_df[target_feature]

masked_df = pd.concat([sub_filtered, sub_missing], axis=0, ignore_index=True)

X_submission = masked_df.drop(columns=['custid', 'health_ins'])

convert_feature_types(X_submission)

X_sub_matrix = xgb.DMatrix(X_submission, enable_categorical=True)

y_pred = bst.predict(X_sub_matrix)

# Compute the median of the predictions
median_threshold = np.median(y_pred)

print("median_threshold: ", median_threshold)

# Create a DataFrame for the predictions and the custid
xgb_submission_df = pd.DataFrame({
    'custid': masked_df['custid'],  # Retrieve the 'custid' column from the original DataFrame
    'health_ins': y_pred           # Add the predicted values
})

# Convert the numeric values in 'predicted_y' to 'TRUE' or 'FALSE' based on the median threshold
xgb_submission_df['health_ins'] = xgb_submission_df['health_ins'].apply(lambda x: 'TRUE' if x > median_threshold else 'FALSE')

# Store the DataFrame without column names
xgb_submission_df.to_csv('xgb_submission.csv', index=False)


median_threshold:  0.9317484


## Random Forest

In [151]:
rf_train_data = apply_data_engineering_replace(raw_data)
rf_submission_data = apply_data_engineering_replace(masked_df)

rf_train_data = preprocess_dataframe(rf_train_data, fit_preprocessor=True)
rf_submission_data = preprocess_dataframe(rf_submission_data, fit_preprocessor=False)
rf_submission_data['health_ins'] = rf_submission_data['health_ins'].astype(bool)

  df_replaced['is_employed'] = df_replaced['is_employed'].fillna(False)
  df_replaced['recent_move_b'] = df_replaced['recent_move_b'].replace({'T': True, 'F': False})
  df_replaced['is_employed'] = df_replaced['is_employed'].fillna(False)
  df_replaced['recent_move_b'] = df_replaced['recent_move_b'].fillna(df_replaced['recent_move_b'].mode()[0])


In [160]:
# separate features
X = rf_train_data.drop([target_feature, id_feature], axis=1)
y = rf_train_data[target_feature]

# define model random forest model following the best parameters {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'n_estimators': 300}
rf_model = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=20, n_estimators=300)

# resample data
X_resampled, y_resampled = smote_data(X, y)

# fit model
rf_model.fit(X_resampled, y_resampled)

rf_submission_features = rf_submission_data.drop(columns=[target_feature, id_feature])

# predict
y_pred = rf_model.predict(rf_submission_features)

# Create a DataFrame for the predictions and the custid
rf_submission_df = pd.DataFrame({
    'custid': rf_submission_data['custid'],  # Retrieve the 'custid' column from the original DataFrame
    'health_ins': y_pred           # Add the predicted values
})

# Store the DataFrame without column names
rf_submission_df.to_csv('rf_submission.csv', index=False)



In [158]:
# BALANCED PREDICTIONS (50-50 split)

# Predict probabilities (if available) for more nuanced control
y_proba = rf_model.predict_proba(rf_submission_features)[:, 1]  # Probabilities for the positive class (True)

# Sort indices by predicted probability for a deterministic split
sorted_indices = np.argsort(y_proba)

# Determine the split point
half_count = len(y_proba) // 2

# Initialize an array of predictions
y_pred_balanced = np.zeros_like(y_proba, dtype=bool)

# Set the top half to True and the bottom half to False
y_pred_balanced[sorted_indices[:half_count]] = False
y_pred_balanced[sorted_indices[half_count:]] = True

# Create a DataFrame for the balanced predictions
rf_submission_balanced_df = pd.DataFrame({
    'custid': rf_submission_data['custid'],  # Retrieve the 'custid' column
    'health_ins': y_pred_balanced             # Balanced predictions
})

# Store the DataFrame without column names
rf_submission_balanced_df.to_csv('rf_submission_balanced.csv', index=False)


## Decision Tree

In [8]:
dt_train_data = apply_data_engineering_replace(raw_data)
dt_submission_data = apply_data_engineering_replace(masked_df)

dt_train_data = preprocess_dataframe(dt_train_data, fit_preprocessor=True)
dt_submission_data = preprocess_dataframe(dt_submission_data, fit_preprocessor=False)
dt_submission_data['health_ins'] = dt_submission_data['health_ins'].astype(bool)

  df_replaced['is_employed'] = df_replaced['is_employed'].fillna(False)
  df_replaced['recent_move_b'] = df_replaced['recent_move_b'].replace({'T': True, 'F': False})
  df_replaced['is_employed'] = df_replaced['is_employed'].fillna(False)
  df_replaced['recent_move_b'] = df_replaced['recent_move_b'].replace({'T': True, 'F': False})


In [9]:
# separate features
X = dt_train_data.drop([target_feature, id_feature], axis=1)
y = dt_train_data[target_feature]

# define model random forest model following the best parameters {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 29, 'splitter': 'best'}
dt_model = DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=29, splitter='best')

# resample data
X_resampled, y_resampled = smote_data(X, y)

# fit model
dt_model.fit(X_resampled, y_resampled)

dt_submission_features = dt_submission_data.drop(columns=[target_feature, id_feature])

# predict
y_pred = dt_model.predict(dt_submission_features)

# Create a DataFrame for the predictions and the custid
dt_submission_df = pd.DataFrame({
    'custid': dt_submission_data['custid'],  # Retrieve the 'custid' column from the original DataFrame
    'health_ins': y_pred           # Add the predicted values
})

# Store the DataFrame without column names
dt_submission_df.to_csv('dt_submission.csv', index=False)

In [10]:
# BALANCED PREDICTIONS (50-50 split)

# Predict probabilities (if available) for more nuanced control
y_proba = dt_model.predict_proba(dt_submission_features)[:, 1]  # Probabilities for the positive class (True)

# Sort indices by predicted probability for a deterministic split
sorted_indices = np.argsort(y_proba)

# Determine the split point
half_count = len(y_proba) // 2

# Initialize an array of predictions
y_pred_balanced = np.zeros_like(y_proba, dtype=bool)

# Set the top half to True and the bottom half to False
y_pred_balanced[sorted_indices[:half_count]] = False
y_pred_balanced[sorted_indices[half_count:]] = True

# Create a DataFrame for the balanced predictions
dt_submission_balanced_df = pd.DataFrame({
    'custid': dt_submission_data['custid'],  # Retrieve the 'custid' column
    'health_ins': y_pred_balanced             # Balanced predictions
})

# Store the DataFrame without column names
dt_submission_balanced_df.to_csv('dt_submission_balanced.csv', index=False)