In [1]:
# Import libraries and packages
import os
import warnings

import numpy as np
import pandas as pd
from feature_engine.discretisation import DecisionTreeDiscretiser
from sklearn.preprocessing import KBinsDiscretizer

# The .py files with the helper functions used in this script live in
# ../src/features. Move the working directory there so `tracking` can be
# imported; the relative data path defined later in this notebook is also
# resolved from that folder.
# The guard keeps this cell re-runnable: the target is a relative path, so a
# second unconditional chdir would be resolved against the already-changed
# working directory and either fail or end up in the wrong folder.
FEATURES_DIR_SUFFIX = os.path.join(os.sep, 'src', 'features')
if not os.path.normpath(os.getcwd()).endswith(FEATURES_DIR_SUFFIX):
    os.chdir( '../src/features')

# Import useful functions for this script
from tracking import track

track("-"*25 + "DATA DISCRETIZATION" + "-"*25)

# Reading data

#### Define path to data files

In [2]:
track("Defining path to data files")

# Folder that holds the data files (relative to the current working directory)
path = '../../temp_data/'

# Location of the table that will be used in this project
path_preprocess_data = os.path.join(path, 'model_data.csv')

# Stop right away if the input file does not exist
assert os.path.isfile(path_preprocess_data), f'{path_preprocess_data} not found. Is it a file?'

#### Read the files

In [3]:
# Load the model_data table, logging a message before and after the read
track("Reading files")
preprocess_data = pd.read_csv(filepath_or_buffer=path_preprocess_data)
track("Finished reading files")

In [4]:
# Keep every column except the first one (the author name)
data = preprocess_data.iloc[:, 1:]

In [5]:
# Defining the columns to discretize: every column of `data` except its first one.
# NOTE(review): the first column is presumably the label/target and is left
# untouched for that reason — confirm against model_data.csv.
cont_cols = data.columns[1:]

# Discretization methods 

In [6]:
def discretize_data(data,columns,method,n_bins=4):
    '''
    Objective:
        - Turn continuous variables of the dataset into discrete ones (ordinal
          bin codes) using different methods. The input dataset is not modified.
    Input:
        - data: Dataset (pandas DataFrame) to be discretized.
        - columns: Name of the columns that contain continuous data.
        - method: Discretization method to apply:
            1 -> K-Means binning (KBinsDiscretizer with strategy='kmeans').
            2 -> Equal-frequency binning (pd.qcut). Duplicated bin edges are
                 dropped, so a column can end up with fewer than n_bins bins.
            3 -> Equal-width binning (pd.cut).
            4 -> Supervised decision-tree binning. Currently disabled: a
                 UserWarning is issued and the data is returned unchanged.
        - n_bins: Number of bins requested for every column (default 4).
    Output:
        - Discretized copy of the dataset.
    Raises:
        - ValueError: if method is not one of 1, 2, 3 or 4.
    '''
    df = data.copy()
    # Unsupervised approaches.
    if method == 1:
        # K-Means: the discretizer is re-fitted for every column, so each
        # column gets its own bin edges.
        disc = KBinsDiscretizer(n_bins = n_bins, encode='ordinal', strategy='kmeans')
        for col in columns:
            df[col] = disc.fit_transform(df[[col]])
    elif method == 2:
        # Equal-Frequency
        for col in columns:
            df[col] = pd.qcut(df[col],n_bins,labels = False, duplicates = "drop")
    elif method == 3:
        # Equal-Width
        for col in columns:
            df[col] = pd.cut(df[col],n_bins,labels = False)
    elif method == 4:
        # Supervised approach (decision tree classifier) is disabled.
        # TODO: the disabled draft fitted a DecisionTreeDiscretiser (cv=10,
        # scoring='accuracy', regression=False) on the features against a
        # "target" column. It also called df.concat, which DataFrame does not
        # provide; pd.concat would be needed when this is re-enabled.
        # Warn instead of silently handing back data that was not discretized.
        warnings.warn(
            "Discretization method 4 (decision tree) is not implemented; "
            "returning the data without discretizing it.",
            UserWarning,
            stacklevel=2,
        )
    else:
        # An unknown code used to fall through and return the data untouched.
        raise ValueError(
            f"Unknown discretization method {method!r}; expected 1, 2, 3 or 4."
        )
    return df

In [None]:
# Apply equal-frequency binning (method 2) to the continuous columns
disc = discretize_data(data, columns=cont_cols, method=2)
track("Finished discretizing data")