In [157]:
import pandas as pd
import numpy as np
import random
from IPython.display import display, HTML

In [158]:
train_df=pd.read_csv('./Kaggle Data/train_df.csv')
test_df =pd.read_csv('./Kaggle Data/test_df.csv')
# Cek Directory jika tidak terbaca
#import os
#print("Current Working Directory:", os.getcwd())
display(train_df)

Unnamed: 0,income,credit_risk_score,customer_age,payment_type,employment_status,current_address_months_count,fraud_bool
0,0.2,223,30,AB,CA,80,0
1,0.2,172,60,AB,CB,72,1
2,0.8,-90,60,AB,CC,147,0
3,0.7,280,40,AC,CB,248,1
4,0.6,-18,40,AD,CB,8,0
...,...,...,...,...,...,...,...
745,0.2,282,60,AD,CA,111,1
746,0.4,85,30,AC,CA,8,1
747,0.3,128,60,AA,CC,79,0
748,0.3,29,40,AC,CA,14,0


In [159]:
numerical_features_train = train_df.select_dtypes(include=[np.number])
num_summary_stats_train = numerical_features_train.describe().T
categorical_features_train = train_df.select_dtypes(include=[object])
cat_summary_stats_train = categorical_features_train.describe().T
print("Numerical Features Summary Statistics:")
display(num_summary_stats_train)
print("Categorical Features Summary Statistics:")
display(cat_summary_stats_train)

Numerical Features Summary Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
income,750.0,0.6244,0.282189,0.1,0.4,0.7,0.9,0.9
credit_risk_score,750.0,152.518667,82.755348,-110.0,91.25,144.0,210.75,357.0
customer_age,750.0,37.6,12.902013,10.0,30.0,40.0,50.0,80.0
current_address_months_count,750.0,103.866667,93.722816,-1.0,34.25,71.0,151.0,386.0
fraud_bool,750.0,0.505333,0.500305,0.0,0.0,1.0,1.0,1.0


Categorical Features Summary Statistics:


Unnamed: 0,count,unique,top,freq
payment_type,750,4,AB,277
employment_status,750,6,CA,572


# Data Cleaning

In [160]:
# Berdasarkan metadata yang diberikan -1 berarti missing value untuk "current_address_months_count", kita gunakan imputation median
median_current_address = train_df['current_address_months_count'][train_df['current_address_months_count'] != -1].median()
median_current_address

71.0

# Preprocessing

In [161]:
# Label Encoding
payment_type_map = {
    'AA': 0.353846,
    'AB': 0.485255,
    'AC': 0.614198,
    'AD': 0.472222
}
employment_status_map = {
    'CA': 0.526868,
    'CB': 0.333333,
    'CC': 0.728814,
    'CD': 0.266667,
    'CE': 0.333333,
    'CF': 0.181818
}

class DataPreprocessor:
    def __init__(self):
        self.payment_type_map = {}
        self.employment_status_map = {}
        self.means = {}
        self.std_devs = {}

    def fit(self, df):
        # Label Encoding
        self.payment_type_map = df['payment_type'].value_counts(normalize=True).to_dict()
        self.employment_status_map = df['employment_status'].value_counts(normalize=True).to_dict()

        # Hanya fitur numerik tertentu yang belum terscale akan di standardisasi
        for feature in ['credit_risk_score', 'customer_age', 'current_address_months_count']:
            self.means[feature] = df[feature].mean()
            self.std_devs[feature] = df[feature].std()

    def transform(self, df):
        preprocessed_df = df.copy()

        # Label Encoding
        preprocessed_df['payment_type'] = preprocessed_df['payment_type'].map(self.payment_type_map).astype(float)
        preprocessed_df['employment_status'] = preprocessed_df['employment_status'].map(self.employment_status_map).astype(float)

        # Impute Missing Value
        preprocessed_df['current_address_months_count'] = preprocessed_df['current_address_months_count'].replace(-1, 71)  # 71 is the median

        # Standard Scaling
        for feature in ['credit_risk_score', 'customer_age', 'current_address_months_count']:
            preprocessed_df[feature] = (preprocessed_df[feature] - self.means[feature]) / self.std_devs[feature]
        return preprocessed_df

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

In [162]:
preprocessor = DataPreprocessor()
preprocessed_train_df = preprocessor.fit_transform(train_df)
preprocessed_test_df = preprocessor.transform(test_df)

In [163]:
X_train = preprocessed_train_df.drop(columns=["fraud_bool"]).values
y_train = preprocessed_train_df["fraud_bool"].values

X_test = preprocessed_test_df.drop(columns=["fraud_bool"]).values
y_test = preprocessed_test_df["fraud_bool"].values

# Modeling

In [164]:
class RBF:
    def __init__(self, spread, centroids, debug=False):
        self.spread = spread
        self.centroids = np.array(centroids)  # Ensure centroids are of proper shape
        self.debug = debug
        self.weights = None  
        self.bias = 1 

    def _basis_function(self, r):
        """Radial basis function: exp(-r^2 / (2 * spread^2))"""
        return np.exp(-r ** 2 / (2 * self.spread ** 2))

    def _format_matrix_html(self, matrix, title):
        """Format a matrix as an HTML table (for debugging purposes)."""
        table_html = f"<b>{title}:</b><br><table style='border:1px solid black;'>"
        for row in matrix:
            table_html += "<tr>"
            for val in row:
                table_html += f"<td style='border:1px solid black; padding:5px;'>{val:.4f}</td>"
            table_html += "</tr>"
        table_html += "</table><br>"
        return table_html

    def _calculate_outputs(self, X):
        """Calculate the outputs of the hidden layer (RBF layer)."""
        # Ensure the centroids match the input dimensionality
        assert X.shape[1] == self.centroids.shape[1], f"Input dimensionality ({X.shape[1]}) must match centroid dimensionality ({self.centroids.shape[1]})"
        
        # Calculate distances from each input point to each centroid
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        if self.debug:
            display(HTML(self._format_matrix_html(distances, "Distances")))
        
        # Apply the RBF function to the distances
        outputs = self._basis_function(distances)
        if self.debug:
            display(HTML(self._format_matrix_html(outputs, "RBF Outputs")))
        return outputs

    def fit(self, X, y):
        """Train the RBF network using the pseudo-inverse."""
        rbf_outputs = self._calculate_outputs(X)
        y = y.reshape(-1, 1)
        rbf_outputs = np.hstack((rbf_outputs, np.ones((rbf_outputs.shape[0], 1))))  # Add bias +1

        # Calculate weights using pseudo-inverse
        self.weights = np.linalg.pinv(rbf_outputs) @ y

        if self.debug:
            display(HTML(self._format_matrix_html(self.weights, "Calculated Weights (including bias)")))

    def predict(self, X):
        """Predict using the trained RBF network."""
        rbf_outputs = self._calculate_outputs(X)
        rbf_outputs = np.hstack((rbf_outputs, np.ones((rbf_outputs.shape[0], 1))))  # Add bias
        return rbf_outputs @ self.weights

In [165]:
class GeneticAlgorithm:
    def __init__(self, population_size, num_selection, mutation_rate, exposure_rate, max_iterations, seed, train_dataset, test_dataset, debug=True):
        self.population_size = population_size
        self.num_selection = num_selection
        self.mutation_rate = mutation_rate
        self.exposure_rate = exposure_rate
        self.max_iterations = max_iterations
        self.dataset_X_train, self.dataset_y_train = train_dataset
        self.dataset_X_test, self.dataset_y_test = test_dataset
        self.population = []
        np.random.seed(seed)
        self.debug = debug  # Enable debug mode
        self.initialize_population()

    def calculate_fitness(self):
        fitness_values = []
        for centroids, spread in self.population:
            rbf = RBF(spread, centroids, debug=False)
            rbf.fit(self.dataset_X_train, self.dataset_y_train)  
            predictions = rbf.predict(self.dataset_X_test)

            predicted_classes = (predictions >= 0.5).astype(int)  
            correct_predictions = sum(predicted_classes.flatten() == self.dataset_y_test)
            accuracy = correct_predictions / len(self.dataset_y_test)

            # Calculate precision and recall for F1 score
            true_positive = sum((predicted_classes.flatten() == 1) & (self.dataset_y_test == 1))
            false_positive = sum((predicted_classes.flatten() == 1) & (self.dataset_y_test == 0))
            false_negative = sum((predicted_classes.flatten() == 0) & (self.dataset_y_test == 1))
            
            precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
            recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
            
            # Calculate F1 score
            f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
            fitness_values.append(f1_score)  
        
        return fitness_values

    def initialize_population(self):
        dataset_X = self.dataset_X_train  # Use your training dataset features for centroid selection
        for _ in range(self.population_size):
            num_centroids = np.random.randint(6, 500)  # Number of centroids from 6 to 500
            centroids = dataset_X[np.random.choice(dataset_X.shape[0], num_centroids, replace=False)]
            spread = np.random.uniform(0.01, 2)  # Random spread from 0.01 to 2
            self.population.append((centroids, spread))

        # Debug output for initial population
        if self.debug:
            print("Initial Population:")
            for i, (centroids, spread) in enumerate(self.population):
                print(f"Individual {i + 1}: Centroids={centroids}, Spread={spread:.4f}")


    def select(self, fitness_values):
        # Calculate the total fitness
        total_fitness = sum(fitness_values)
    
        # Calculate selection probabilities
        selection_probabilities = [f / total_fitness for f in fitness_values]

        # Cumulative probabilities
        cumulative_probabilities = np.cumsum(selection_probabilities)

        # Select individuals using roulette wheel method
        selected_population = []
        for _ in range(self.num_selection):
            random_value = np.random.rand()  # Generate a random number between 0 and 1
            selected_index = np.searchsorted(cumulative_probabilities, random_value)
            selected_population.append(self.population[selected_index])

        # Debug output for selected population
        if self.debug:
            print("Selected Population:")
            for i, (centroids, spread) in enumerate(selected_population):
                print(f"Selected Individual {i + 1}: Centroids={centroids}, Spread={spread:.4f}")
        return selected_population

    def crossover(self, parent1, parent2):
        x = np.random.uniform(0.2, 0.8)
        centroids1, spread1 = parent1
        centroids2, spread2 = parent2

        num_centroids1 = centroids1.shape[0]
        num_centroids2 = centroids2.shape[0]

        # Child 1
        num_child_centroids1_from_parent1 = round(x * num_centroids1)
        num_child_centroids1_from_parent2 = round((1 - x) * num_centroids2)

        selected_indices1 = np.random.choice(num_centroids1, num_child_centroids1_from_parent1, replace=False)
        selected_centroids1 = centroids1[selected_indices1]  
        selected_indices2 = np.random.choice(num_centroids2, num_child_centroids1_from_parent2, replace=False)
        selected_centroids2 = centroids2[selected_indices2]  
        child1_centroids = np.vstack((selected_centroids1, selected_centroids2))

        # Child 2
        num_child_centroids2_from_parent1 = round((1 - x) * num_centroids1)
        num_child_centroids2_from_parent2 = round(x * num_centroids2)

        selected_indices3 = np.random.choice(num_centroids1, num_child_centroids2_from_parent1, replace=False)
        selected_centroids3 = centroids1[selected_indices3] 
        selected_indices4 = np.random.choice(num_centroids2, num_child_centroids2_from_parent2, replace=False)
        selected_centroids4 = centroids2[selected_indices4]  
        child2_centroids = np.vstack((selected_centroids3, selected_centroids4))

        # Debug output for crossover
        if self.debug:
            print(f"Crossover: x={x:.4f}")
            print(f"Parent 1 Centroids: {centroids1}")
            print(f"Parent 2 Centroids: {centroids2}")
            print(f"Child 1 Centroids: {child1_centroids}")
            print(f"Child 2 Centroids: {child2_centroids}")

        return (child1_centroids, spread1), (child2_centroids, spread2)

    def mutate(self, chromosome):
        centroids, spread = chromosome
        if np.random.rand() < self.exposure_rate:  
            for i in range(len(centroids)):  
                if np.random.rand() < self.mutation_rate: 
                    mutation_factor = np.random.uniform(-self.mutation_rate, self.mutation_rate)
                    centroids[i] = centroids[i] * (1 + mutation_factor)  

            spread += np.random.uniform(-self.mutation_rate, self.mutation_rate)
            spread = np.clip(spread, 0.1, 3)  

            # Debug output for mutation
            if self.debug:
                print(f"Mutating: Centroids={centroids}, Spread={spread:.4f}")

        return centroids, spread

    def run(self):
        best_accuracy = 0  
        best_parameters = None 

        for iteration in range(self.max_iterations):
            fitness_values = self.calculate_fitness()
            print(f"Iteration {iteration + 1}, Best Fitness: {max(fitness_values)}")  

            # Find the index of the best individual in the current population
            best_index = np.argmax(fitness_values)
            best_fitness = fitness_values[best_index]
            best_centroids, best_spread = self.population[best_index]

            # Train the RBF using the best individual's parameters
            rbf = RBF(best_spread, best_centroids, debug=False)
            rbf.fit(self.dataset_X_train, self.dataset_y_train)  # Fit on training data
            predictions = rbf.predict(self.dataset_X_test)  # Predict on test data

            # Calculate accuracy for the best individual
            predicted_classes = (predictions >= 0.5).astype(int)
            correct_predictions = sum(predicted_classes.flatten() == self.dataset_y_test)
            accuracy = correct_predictions / len(self.dataset_y_test)

            # Print only accuracy and check if it's the best so far
            print(f"Iteration {iteration + 1}, Test Accuracy: {accuracy:.4f}")

            # If this accuracy is the best so far, save the parameters and accuracy
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_parameters = {
                "centroids": best_centroids,
                "spread": best_spread
            }

            # Select individuals for the next generation
            selected_population = self.select(fitness_values)
            new_population = []

            # Create new population through crossover and mutation
            while len(new_population) < self.population_size:
                parent1, parent2 = random.sample(selected_population, 2)
                child1, child2 = self.crossover(parent1, parent2)
                mutated_child1 = self.mutate(child1)
                mutated_child2 = self.mutate(child2)
                new_population.extend([mutated_child1, mutated_child2])

            # Update population for the next iteration
            self.population = new_population

        # After all iterations, print the best centroids and spread
        print(f"Best Test Accuracy: {best_accuracy:.4f}")
        print(f"Best Parameters: Centroids = {best_parameters['centroids']}, Spread = {best_parameters['spread']}")

In [170]:
# Parameters for the Genetic Algorithm
population_size = 250  # jumlah populasi awal
num_selection = 50    # Jumalah seleksi populasi, rekomendasi 50% populasi awal
mutation_rate = 0.4    # Probabilitas centroid mutasi
exposure_rate = 0.05   # Probabilitas chromosome terkena mutasi
max_iterations = 5     # Iterasi
seed = 2024           # Seed Random

# Create an instance of the Genetic Algorithm
ga = GeneticAlgorithm(
    population_size=population_size,
    num_selection=num_selection,
    mutation_rate=mutation_rate,
    exposure_rate=exposure_rate,
    max_iterations=max_iterations,
    seed=seed,
    train_dataset=(X_train, y_train),
    test_dataset=(X_test, y_test),
    debug=False
)

In [169]:
ga.run()

Iteration 1, Best Fitness: 0.9421052631578947
Iteration 1, Test Accuracy: 0.9413
Iteration 2, Best Fitness: 0.8973684210526316
Iteration 2, Test Accuracy: 0.8960
Iteration 3, Best Fitness: 0.8821989528795812
Iteration 3, Test Accuracy: 0.8800
Iteration 4, Best Fitness: 0.8672798948751641
Iteration 4, Test Accuracy: 0.8653
Iteration 5, Best Fitness: 0.8563899868247695
Iteration 5, Test Accuracy: 0.8547
Best Test Accuracy: 0.9413
Best Parameters: Centroids = [[ 0.1        -1.1179781  -1.36412818  0.11333333  0.01066667 -0.26532138]
 [ 0.9         2.47091383  0.96109031  0.31333333  0.76266667  0.67361755]
 [ 0.5        -0.97297237  0.18601748  0.204       0.76266667  0.8229942 ]
 ...
 [ 0.9         2.44674621  0.18601748  0.36933333  0.76266667 -0.77746988]
 [ 0.8         1.38337082  1.73616314  0.31333333  0.76266667  1.53786815]
 [ 0.7        -0.84005044 -1.36412818  0.31333333  0.76266667  2.40211876]], Spread = 0.9613968690603087
