In [83]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import Markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

raw_data = pd.read_csv('customer.csv')
masked_df = pd.read_csv('customer_test_masked.csv')

In [84]:
def handle_gas_feature(df):
    gas_median = df[df['gas_usage'] > 3].gas_usage.median()

    non_bill_cases_values = {
        1: gas_median,
        2: gas_median,
        3: 0
    }
    non_bill_cases = {
        1: 'Included in rent', 
        2: 'Included in electricity', 
        3: 'No charge'
    }

    # New feature for gas usage category
    df['gas_category'] = df['gas_usage'].replace(non_bill_cases).where(
        df['gas_usage'].isin(non_bill_cases.keys()), 'Actual Bill'
    )
    df.loc[pd.isna(df['gas_usage']), 'gas_category'] = 'Unknown'

    # Replace non-bill cases' values or maintain if not included
    df['gas_usage'] = df['gas_usage'].replace(non_bill_cases_values)
    df['gas_usage'] = df['gas_usage'].fillna(gas_median)

In [85]:
label_features = ['state_of_res']
categorical_features = ['sex','marital_status', 'housing_type', 'gas_category', 'recent_move_b', 'is_employed'] 
numerical_features = ['income', 'num_vehicles', 'age', 'gas_usage', 'rooms' ,'age_income']  
target_feature = 'health_ins'

def preprocess_dataframe(df):
    
    df_features = df.drop(target_feature, axis=1)
    
    preprocessor = ColumnTransformer(
    transformers=[
            ('label', OrdinalEncoder(), label_features),
            ('onehot', OneHotEncoder(drop='first'), categorical_features),
            ('scaler', StandardScaler(), numerical_features)        
        ],
        remainder='passthrough'
    )
    
    features_processed = preprocessor.fit_transform(df_features)
    
    feature_names = label_features + preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features).tolist() + numerical_features
    
    df_processed = pd.DataFrame(features_processed, columns=feature_names)

    # Ensure indices align
    df_processed = df_processed.reset_index(drop=True)
    df_target = df[target_feature].reset_index(drop=True)
    
    df_processed[target_feature] = df_target
    
    return df_processed

In [86]:
def smote_data(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

In [87]:
def classify_sample_then_split(df, sample_function, params, classifier):
    # Preprocess data
    preprocessed_df = preprocess_dataframe(df)

    X = preprocessed_df.drop(target_feature, axis=1)  
    y = preprocessed_df[target_feature]
        
    # Sample data
    X_resampled, y_resampled = sample_function(X, y)
    
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
    
    # Train model with GridSearchCV
    grid_search = GridSearchCV(estimator=classifier, param_grid=params, cv=5)
    
    grid_search.fit(X_train, y_train)
    
    # Predict
    y_pred = grid_search.predict(X_test)
    
    # Evaluate
    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Best Parameters: {grid_search.best_params_}')
    print(classification_report(y_test, y_pred))

In [88]:
def apply_data_engineering(df):
    
    df.drop(columns=['Unnamed: 0', 'code_column'], inplace=True)
    df_filtered = df.copy()

    rows_with_missing = df_filtered.isnull().sum(axis=1) > 2
    df_filtered.drop(df_filtered[rows_with_missing].index, inplace=True)
    
    df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
    
    df_filtered['num_vehicles'] = df_filtered['num_vehicles'].fillna(round(df['num_vehicles'].mean(),0))
    
    df_filtered['recent_move_b'] = df_filtered['recent_move_b'].fillna(df_filtered['recent_move_b'].mode()[0])
    df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})
    
    df_filtered['age'] = df_filtered['age'].replace(0, df_filtered['age'].median())
    rows_with_age_120 = df_filtered['age'] == 120
    df_filtered = df_filtered.drop(df_filtered[rows_with_age_120].index)
    
    #df_filtered['health_ins'] = df_filtered['health_ins'].replace({True: 1, False: 0})
    
    handle_gas_feature(df_filtered)
    
    df_filtered['age_income'] = df_filtered.age * df_filtered.income
    
    df_missing = df[rows_with_missing | rows_with_age_120]
    
    return df_filtered, df_missing

In [89]:
df_filtered, df_missing = apply_data_engineering(raw_data)
sub_filtered, sub_missing = apply_data_engineering(masked_df)

  df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
  df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})
  df_filtered['is_employed'] = df_filtered['is_employed'].fillna(False)
  df_filtered['recent_move_b'] = df_filtered['recent_move_b'].replace({'T': True, 'F': False})


In [90]:
len(sub_missing)

34

In [91]:
#classify_sample_then_split(train_df, smote_data, {'n_neighbors': [3]}, KNeighborsClassifier())

In [92]:
def convert_feature_types(df):
    for col in df.select_dtypes(include=['object']).columns:
        if col in categorical_features or col in label_features:
            df[col] = df[col].astype('category')
        elif col in numerical_features:
            df[col] = df[col].astype('float')

In [93]:
xgb_train_df = pd.concat([df_filtered, df_missing], axis=0, ignore_index=True)

xgb_train_df.drop(columns=['custid'], inplace=True)

convert_feature_types(xgb_train_df)

In [94]:
#### THIS CODE IS FOR THE XGBOOST MODEL ####
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming df is your DataFrame and 'target' is the column you want to predict
X = xgb_train_df.drop(target_feature, axis=1)
y = xgb_train_df[target_feature]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

params = {
    'objective': 'binary:logistic',  # For binary classification
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'logloss'
}

num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

preds = bst.predict(dtest)
predictions = [1 if pred > 0.5 else 0 for pred in preds]

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9050050602631337


# SUBMISSION

In [95]:
X_train = xgb_train_df.drop(columns=target_feature)
y_train = xgb_train_df[target_feature]

masked_df = pd.concat([sub_filtered, sub_missing], axis=0, ignore_index=True)

X_submission = masked_df.drop(columns=['custid', 'health_ins'])

convert_feature_types(X_submission)

X_sub_matrix = xgb.DMatrix(X_submission, enable_categorical=True)

y_pred = bst.predict(X_sub_matrix)

# Compute the median of the predictions
median_threshold = np.median(y_pred)

print("median_threshold: ", median_threshold)

# Create a DataFrame for the predictions and the custid
submission_df = pd.DataFrame({
    'custid': masked_df['custid'],  # Retrieve the 'custid' column from the original DataFrame
    'health_ins': y_pred           # Add the predicted values
})

# Convert the numeric values in 'predicted_y' to 'TRUE' or 'FALSE' based on the median threshold
submission_df['health_ins'] = submission_df['health_ins'].apply(lambda x: 'TRUE' if x > median_threshold else 'FALSE')

# Store the DataFrame without column names
submission_df.to_csv('xgb_submission.csv', index=False)


median_threshold:  0.9325274


In [41]:
def submission_df_filtered(sample_function, params, classifier):
    # Preprocess data
    train_df = df_filtered.drop('custid', axis=1)
    test_df = sub_filtered.drop('custid', axis=1)
    
    preprocessed_train_df = preprocess_dataframe(train_df)
    preprocessed_test_df = preprocess_dataframe(test_df)

    X_train = preprocessed_train_df.drop(target_feature, axis=1)  
    y_train = preprocessed_train_df[target_feature]
    
    # Sample data
    X_resampled, y_resampled = sample_function(X_train, y_train)

        
    X_test = preprocessed_test_df.drop(target_feature, axis=1)  
        
    # Train model with GridSearchCV
    grid_search = GridSearchCV(estimator=classifier, param_grid=params, cv=5)
    grid_search.fit(X_resampled, y_resampled)
    # grid_search.fit(X_train, y)
    
    # Predict
    y_pred = grid_search.predict(X_test)
    
    submission_df = pd.DataFrame({
        'custid': sub_filtered['custid'],  
        'health_ins': y_pred        
    })
        
    return submission_df

In [42]:
def submission_df_missing():
    
    convert_feature_types(df_missing)
    convert_feature_types(sub_missing)

    train_df = df_missing.drop('custid', axis=1)
    test_df = sub_missing.drop('custid', axis=1)
    
    X_train = train_df.drop(target_feature, axis=1)  
    y_train = train_df[target_feature]
    
    X_test = test_df.drop(target_feature, axis=1)  
        
    dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
    dtest = xgb.DMatrix(X_test, enable_categorical=True)

    params = {
        'objective': 'binary:logistic',  # For binary classification
        'max_depth': 6,
        'eta': 0.3,
        'eval_metric': 'logloss'
    }

    num_rounds = 100
    bst = xgb.train(params, dtrain, num_rounds)

    preds = bst.predict(dtest)
    predictions = [1 if pred > 0.5 else 0 for pred in preds]
    
    submission_df = pd.DataFrame({
        'custid': sub_missing['custid'],  
        'health_ins': predictions        
    })
        
    return submission_df

In [43]:
submission_df_filtered = submission_df_filtered(smote_data, {'n_neighbors': [3]}, KNeighborsClassifier())

KeyboardInterrupt: 

In [None]:
submission_df_missing = submission_df_missing()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

ValueError: feature_names mismatch: ['sex', 'is_employed', 'income', 'marital_status', 'housing_type', 'num_vehicles', 'age', 'state_of_res', 'gas_usage', 'rooms', 'recent_move_b', 'gas_category', 'age_income'] ['sex', 'is_employed', 'income', 'marital_status', 'housing_type', 'num_vehicles', 'age', 'state_of_res', 'gas_usage', 'rooms', 'recent_move_b']
expected gas_category, age_income in input data

submission_df = pd.concat([submission_df_filtered, submission_df_missing])

submission_df['health_ins'] = submission_df['health_ins'].map({1.0: 'TRUE', 0.0: 'FALSE'})

submission_df.to_csv('submission-try.csv', index=False)