In [12]:
import pandas as pd
import numpy as np
import html

In [2]:
# Load the training split; the path is relative to the notebook's working directory.
# If the file cannot be read, check the working directory first:
#   import os
#   print("Current Working Directory:", os.getcwd())
train_df = pd.read_csv(filepath_or_buffer='./Kaggle Data/train_df.csv')
display(train_df)

Unnamed: 0,income,credit_risk_score,customer_age,payment_type,employment_status,current_address_months_count,fraud_bool
0,0.2,223,30,AB,CA,80,0
1,0.2,172,60,AB,CB,72,1
2,0.8,-90,60,AB,CC,147,0
3,0.7,280,40,AC,CB,248,1
4,0.6,-18,40,AD,CB,8,0
...,...,...,...,...,...,...,...
745,0.2,282,60,AD,CA,111,1
746,0.4,85,30,AC,CA,8,1
747,0.3,128,60,AA,CC,79,0
748,0.3,29,40,AC,CA,14,0


In [3]:
# Split the columns by dtype, then summarise each group with one row per feature.
numerical_features_train = train_df.select_dtypes(include=np.number)
categorical_features_train = train_df.select_dtypes(include=object)

# describe() returns statistics as rows; transpose so that features are the rows.
num_summary_stats_train = numerical_features_train.describe().transpose()
cat_summary_stats_train = categorical_features_train.describe().transpose()

print("Numerical Features Summary Statistics:")
display(num_summary_stats_train)
print("Categorical Features Summary Statistics:")
display(cat_summary_stats_train)

Numerical Features Summary Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
income,750.0,0.6244,0.282189,0.1,0.4,0.7,0.9,0.9
credit_risk_score,750.0,152.518667,82.755348,-110.0,91.25,144.0,210.75,357.0
customer_age,750.0,37.6,12.902013,10.0,30.0,40.0,50.0,80.0
current_address_months_count,750.0,103.866667,93.722816,-1.0,34.25,71.0,151.0,386.0
fraud_bool,750.0,0.505333,0.500305,0.0,0.0,1.0,1.0,1.0


Categorical Features Summary Statistics:


Unnamed: 0,count,unique,top,freq
payment_type,750,4,AB,277
employment_status,750,6,CA,572


# Data Cleaning

In [4]:
# According to the provided metadata, -1 marks a missing value in
# "current_address_months_count"; we impute with the median, so compute the
# median over the rows that are not flagged as missing.
median_current_address = train_df.loc[
    train_df['current_address_months_count'] != -1,
    'current_address_months_count',
].median()
median_current_address

71.0

# Preprocessing

In [5]:
# Hand-entered numeric encodings for the two categorical columns.
# NOTE(review): these values are neither integer labels nor the category
# frequencies that DataPreprocessor.fit derives itself, and DataPreprocessor
# does not read these module-level dicts -- confirm where they come from and
# whether they are still needed.
payment_type_map = dict(
    AA=0.353846,
    AB=0.485255,
    AC=0.614198,
    AD=0.472222,
)
employment_status_map = dict(
    CA=0.526868,
    CB=0.333333,
    CC=0.728814,
    CD=0.266667,
    CE=0.333333,
    CF=0.181818,
)

class DataPreprocessor:
    """Learn preprocessing parameters from one frame and apply them to any frame.

    Three steps are applied by ``transform``:

    1. Frequency encoding of ``payment_type`` and ``employment_status``: each
       category is replaced by its relative frequency in the fitted frame.
       Categories that were not present during ``fit`` become NaN.
    2. Imputation of ``current_address_months_count``: the sentinel
       ``MISSING_SENTINEL`` is replaced by the median of the non-sentinel
       values seen during ``fit``.
    3. Standard scaling ``(x - mean) / std`` of the columns listed in
       ``SCALED_FEATURES``, with mean/std learned during ``fit`` (pandas'
       default sample standard deviation).

    All learned values are stored on the instance, so ``fit`` must be called
    (directly or through ``fit_transform``) before ``transform``.
    """

    # Only these numeric features are standardised; the others are assumed to be
    # already on a suitable scale.
    SCALED_FEATURES = ('credit_risk_score', 'customer_age', 'current_address_months_count')
    # Column whose missing values are encoded with a sentinel instead of NaN.
    IMPUTED_FEATURE = 'current_address_months_count'
    MISSING_SENTINEL = -1

    def __init__(self):
        self.payment_type_map = {}
        self.employment_status_map = {}
        self.means = {}
        self.std_devs = {}
        # Median used for imputation; learned in fit().
        self.address_median = None

    def _impute_address(self, series):
        """Return ``series`` with the missing-value sentinel replaced by the learned median."""
        return series.replace(self.MISSING_SENTINEL, self.address_median)

    def fit(self, df):
        """Learn encoding maps, the imputation median and the scaling statistics.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``payment_type``, ``employment_status`` and every
            column in ``SCALED_FEATURES``.
        """
        # Frequency encoding: category -> share of rows with that category.
        self.payment_type_map = df['payment_type'].value_counts(normalize=True).to_dict()
        self.employment_status_map = df['employment_status'].value_counts(normalize=True).to_dict()

        # Learn the imputation value from the observed (non-sentinel) entries
        # rather than relying on a constant copied from a notebook output.
        address = df[self.IMPUTED_FEATURE]
        self.address_median = address[address != self.MISSING_SENTINEL].median()

        # The scaling statistics must describe the data that transform() will
        # actually scale, i.e. the column *after* imputation; otherwise the
        # sentinel values bias the mean and inflate the standard deviation.
        imputed_address = self._impute_address(address)
        for feature in self.SCALED_FEATURES:
            column = imputed_address if feature == self.IMPUTED_FEATURE else df[feature]
            self.means[feature] = column.mean()
            self.std_devs[feature] = column.std()

    def transform(self, df):
        """Return a preprocessed copy of ``df``; the input frame is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the same columns that were used in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with encoded categoricals, imputed address months
            and standardised numeric features. Other columns are untouched.
        """
        preprocessed_df = df.copy()

        # Frequency encoding with the maps learned in fit().
        preprocessed_df['payment_type'] = preprocessed_df['payment_type'].map(self.payment_type_map).astype(float)
        preprocessed_df['employment_status'] = preprocessed_df['employment_status'].map(self.employment_status_map).astype(float)

        # Impute missing values with the median learned in fit().
        preprocessed_df[self.IMPUTED_FEATURE] = self._impute_address(preprocessed_df[self.IMPUTED_FEATURE])

        # Standard scaling with the statistics learned in fit().
        for feature in self.SCALED_FEATURES:
            preprocessed_df[feature] = (preprocessed_df[feature] - self.means[feature]) / self.std_devs[feature]
        return preprocessed_df

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        self.fit(df)
        return self.transform(df)

In [9]:
# Learn the preprocessing parameters from the training frame, then apply them to it.
preprocessor = DataPreprocessor()
preprocessor.fit(train_df)
preprocessed_train_df = preprocessor.transform(train_df)

# Modeling

In [13]:
class RBF:
    """Radial-basis-function network with fixed centroids and a linear output layer.

    The hidden layer applies a Gaussian basis function to the Euclidean distance
    between each sample and each centroid. The output weights (plus a bias term,
    stored as the last row of ``weights``) are solved in closed form with the
    Moore-Penrose pseudo-inverse.

    Parameters
    ----------
    spread : float
        Width of the Gaussian basis function.
    centroids : array-like of shape (n_centroids, n_features)
        Centres of the hidden units.
    debug : bool, default True
        When True, intermediate matrices are rendered on every ``fit`` and
        ``predict`` call (as HTML tables when IPython is available, otherwise as
        plain text). Set to False for anything but tiny inputs.
    """

    def __init__(self, spread, centroids, debug=True):
        self.spread = spread
        self.centroids = np.array(centroids)
        self.debug = debug
        self.weights = None
        # Not read anywhere in this class: the learned bias is the last entry of
        # ``weights``. Kept so existing code that inspects the attribute still works.
        self.bias = 1

    def _basis_function(self, r):
        """Radial basis function: exp(-r^2 / (2 * spread^2))"""
        return np.exp(-r ** 2 / (2 * self.spread ** 2))

    def _format_matrix_html(self, matrix, title):
        """Format a 2-D matrix as an HTML table with values shown to 4 decimals."""
        cell_style = "border:1px solid black; padding:5px;"
        body = "".join(
            "<tr>"
            + "".join(f"<td style='{cell_style}'>{val:.4f}</td>" for val in row)
            + "</tr>"
            for row in matrix
        )
        return f"<b>{title}:</b><br><table style='border:1px solid black;'>{body}</table><br>"

    def _show_matrix(self, matrix, title):
        """Render ``matrix`` for debugging.

        ``HTML`` and ``display`` are imported here because the notebook's import
        cell only brings in the stdlib ``html`` module, which left ``HTML``
        undefined. If IPython is not installed, fall back to plain-text output.
        """
        try:
            from IPython.display import HTML, display
        except ImportError:
            print(f"{title}:")
            print(np.asarray(matrix))
            return
        display(HTML(self._format_matrix_html(matrix, title)))

    @staticmethod
    def _append_bias_column(activations):
        """Append a column of ones so the last weight acts as the bias."""
        return np.hstack((activations, np.ones((activations.shape[0], 1))))

    def _calculate_outputs(self, X):
        """Calculate the outputs of the hidden layer.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            NumPy array or pandas DataFrame.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_centroids)
        """
        # Coerce so that DataFrames work with the positional/broadcast indexing below.
        X = np.asarray(X, dtype=float)
        # (n_samples, 1, n_features) - (n_centroids, n_features) broadcasts to
        # (n_samples, n_centroids, n_features); the norm collapses the last axis.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        if self.debug:
            self._show_matrix(distances, "Distances")

        outputs = self._basis_function(distances)
        if self.debug:
            self._show_matrix(outputs, "RBF Outputs")
        return outputs

    def fit(self, X, y):
        """Train the RBF network using the pseudo-inverse.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)
            NumPy array, list or pandas Series of targets.
        """
        rbf_outputs = self._append_bias_column(self._calculate_outputs(X))
        # pandas Series has no reshape(); go through NumPy first.
        y = np.asarray(y, dtype=float).reshape(-1, 1)

        # Least-squares solution of rbf_outputs @ weights = y.
        self.weights = np.linalg.pinv(rbf_outputs) @ y

        if self.debug:
            self._show_matrix(self.weights, "Calculated Weights (including bias)")

    def predict(self, X):
        """Predict using the trained RBF network.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 1)
        """
        rbf_outputs = self._append_bias_column(self._calculate_outputs(X))
        return rbf_outputs @ self.weights