# Gini

In [8]:
import numpy as np
from numba import jit

In [9]:
@jit
def gini(y_true, y_prob):
    """Normalized Gini coefficient of a set of scores against binary labels.

    The labels are reordered by ascending score, and the function counts the
    (positive, negative) pairs in which the negative received the higher
    score. With ``n_pos`` positives and ``n - n_pos`` negatives the result is
    ``1 - 2 * misordered / (n_pos * (n - n_pos))``, which for untied scores is
    equivalent to ``2 * AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. The arithmetic (``1 - y_i``) assumes each label
        is 0 or 1; other values are not validated.
    y_prob : array-like
        Predicted scores, one per label. Only their ordering matters.

    Returns
    -------
    float
        The normalized Gini coefficient, or ``nan`` when it is undefined
        because the input is empty or contains a single class.

    Notes
    -----
    Tied scores are ordered however ``np.argsort``'s default sort leaves them,
    so the result for inputs with ties depends on that ordering.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    # Labels arranged from the lowest-scored sample to the highest-scored one.
    y_sorted = y_true[np.argsort(y_prob)]
    n = len(y_sorted)

    n_pos = 0          # positives seen so far
    n_neg_above = 0    # negatives seen so far, i.e. scored above the current sample
    misordered = 0     # (positive, negative) pairs where the negative is scored higher

    # Walk from the highest score down so n_neg_above is always "negatives ranked above i".
    for i in range(n - 1, -1, -1):
        y_i = y_sorted[i]
        n_pos += y_i
        misordered += y_i * n_neg_above
        n_neg_above += 1 - y_i

    n_pairs = n_pos * (n - n_pos)
    if n_pairs == 0:
        # No positive/negative pair exists, so the ratio is 0/0. Return nan
        # explicitly instead of dividing by zero.
        return np.nan
    return 1 - 2 * misordered / n_pairs

# What is the Gini coefficient?

The Gini coefficient is a measure of inequality that can be calculated using the following steps:

1. Rank the values of the variable being measured in ascending order.

2. Calculate the cumulative share of the total at each rank, which is the sum of the values up to and including that rank divided by the sum of all values.

3. Plot the Lorenz curve, which shows this cumulative share of the total on the y-axis against the cumulative share of observations (the rank divided by the number of observations) on the x-axis.

4. Calculate the Gini coefficient by dividing the area between the Lorenz curve and the line of perfect equality (where the cumulative share of the total is equal to the cumulative share of observations) by the total area under the line of perfect equality.

In [10]:
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
# Load the model's predicted scores and the matching ground-truth targets.
boosting_preds = pd.read_csv("Results/lgbm_bayesian_opt.csv")['target']
targets = pd.read_csv("Dataset/target_test.csv")['target']

# The two files are paired row by row, so a length mismatch means they do not
# describe the same test set.
if len(boosting_preds) != len(targets):
    raise ValueError(
        f"predictions ({len(boosting_preds)}) and targets ({len(targets)}) differ in length"
    )

# Work on plain NumPy arrays: numba's compiled mode does not accept pandas
# objects, and positional indexing below must ignore the Series index.
preds_arr = boosting_preds.to_numpy()
targets_arr = targets.to_numpy()

# Calculate the normalized Gini coefficient.
# Stored under its own name so the `gini` function is not overwritten and the
# cell can be re-run.
gini_score = gini(targets_arr, preds_arr)

# Order the true targets from the lowest predicted score to the highest.
targets_ranked = targets_arr[np.argsort(preds_arr)]

# Lorenz curve: cumulative share of observations (x) against the cumulative
# share of positive targets captured up to that rank (y).
n_obs = len(targets_ranked)
population_share = np.arange(1, n_obs + 1) / n_obs
target_share = np.cumsum(targets_ranked) / targets_ranked.sum()

fig, ax = plt.subplots()

# Lorenz curve of the model's ranking
ax.plot(population_share, target_share,
        label=f"Lorenz curve (normalized Gini = {gini_score:.4f})")

# Line of perfect equality
ax.plot([0, 1], [0, 1], linestyle="--", label="Line of perfect equality")

ax.set_xlabel("Cumulative share of observations (ascending predicted score)")
ax.set_ylabel("Cumulative share of positive targets")
ax.set_title("Lorenz curve of LightGBM predictions")
ax.legend()

# Show the plot
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset/target_test.csv'