In [95]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from tabulate import tabulate
from scipy import stats
from scipy.stats import skew, kurtosis, norm, kstest
import statsmodels.api as sm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import PowerTransformer, StandardScaler, QuantileTransformer, OneHotEncoder

In [96]:
train_df = pd.read_csv("./train_data.csv")
test_df = pd.read_csv("./test_data.csv")

In [97]:
def corr_matrix(df):
    correlation_matrix = df.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title('Matriz de correlaciones')
    plt.show()

In [98]:
def norm_test(data, p_thres = 0.05):
    mean, std = norm.fit(data)

    if std == 0:
        return 'No normal', 1e-8

    normal = norm(loc = mean, scale = std)
    _, p_value = stats.kstest(data, normal.cdf)

    if p_value > p_thres:
        normality = "Normal"
    else:
        normality = "No normal"

    return normality, p_value

In [99]:
def dataframe_statistics(dataframe):
    data = dataframe.copy()

    results = {}

    for col in data.columns:
        
        if pd.api.types.is_numeric_dtype(data[col]):
            data_stats = data[col].describe(percentiles=[0.25, 0.75])
            data_stats['skewness'] = skew(data[col])
            data_stats['kurtosis'] = kurtosis(data[col])
            data_stats['normalness'], _ = norm_test(data[col], 0.05)
            data_stats['uniques'] = data[col].nunique()
            data_stats['null count'] = data[col].isnull().sum()
            data_stats['data type'] = type(data[col][0])
            results[col] = data_stats
        else:
            results[col] = {'count': data[col].count(), 'unique': data[col].nunique(), 'top': data[col].mode().iloc[0]}

    
    print(tabulate(pd.DataFrame(results), headers='keys', tablefmt='grid'))

In [100]:
def plot_function(data, type):
    num_plots = min(16, len(data.columns))
    num_cols = 4
    num_rows = (num_plots + num_cols - 1) // num_cols

    fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 10))
    axs = axs.ravel()

    for i, col in enumerate(data.select_dtypes(include=[np.number])): 
        if i >= num_plots:
            break
        if type == 'QQPlot': 
            sm.qqplot(data[col], line='r', ax=axs[i], fit=True)
            axs[i].set_xlabel("Theoretical Quantiles")
            axs[i].set_ylabel("Sample Quantiles")
            axs[i].set_title(f"{col}")
        elif type == 'BoxPlot':
            sns.boxplot(y=data[col], orient="v", ax=axs[i])  
            axs[i].set_xlabel("")
            axs[i].set_ylabel(col)
            axs[i].set_title(f"Boxplot de {col}")

    
    for j in range(num_plots, num_rows * num_cols):
        fig.delaxes(axs[j])

    plt.tight_layout()
    plt.show()

In [101]:
def compute_missing_data(data, method, testing=False, percentage=0.15):
    
    modified_data = data.copy()

    
    if testing:
        
        for column in data:
            
            indices_to_null = np.random.choice(data.index, size=int(len(data) * percentage), replace=False)
            
            modified_data.loc[indices_to_null, column] = np.nan

    
    if method in ['Mean', 'Median', 'Mode', 'Arbitrary']:
        
        for column in data:
            
            for index in modified_data[modified_data[column].isnull()].index:
                
                if method == 'Mean':
                    
                    value = data[column].mean()
                elif method == 'Median':
                    
                    value = data[column].median()
                elif method == 'Mode':
                    
                    value = data[column].mode().iloc[0]
                elif method == 'Arbitrary':
                    
                    value = np.random.uniform(data[column].min(), data[column].max())
                else:
                    
                    value = np.nan

                
                if isinstance(value, np.ndarray):
                    value = value[0]
                
                value = data[column].dtype.type(value)

                
                modified_data.loc[index, column] = value

    
    elif method in ['KNN', 'MICE']: 
        if method == 'KNN':
            
            imputer = KNNImputer(n_neighbors=5) 
            
            imputed = imputer.fit_transform(modified_data)
            
            modified_data = pd.DataFrame(imputed, columns=modified_data.columns)
        elif method == 'MICE':
            
            imputer = IterativeImputer()
            
            imputed = imputer.fit_transform(modified_data) 
            
            modified_data = pd.DataFrame(imputed, columns=modified_data.columns)

    
    return modified_data


In [102]:


def handle_outliers(data, method, imputation_method='KNN', winsorization_rate=0.05):
    
    original_data = data.copy()
    
    modified_data = data.copy()

    if method == 'Imputacion':
        for column in modified_data.columns:
          
            q1 = np.percentile(modified_data[column], 25)
            q3 = np.percentile(modified_data[column], 75)
            iqr = q3 - q1
           
            lower_limit = q1 - 1.5 * iqr
            upper_limit = q3 + 1.5 * iqr
          
            modified_data[column] = np.where(modified_data[column] < lower_limit, np.nan, np.where(modified_data[column] > upper_limit, np.nan, modified_data[column]))

        
        outlierless_data = compute_missing_data(modified_data, imputation_method, False, None)
        return outlierless_data
    
    for column in modified_data.columns:
    
        q1 = np.percentile(original_data[column], 25)
        q3 = np.percentile(original_data[column], 75)
        iqr = q3 - q1
        
        lower_limit = q1 - 1.5 * iqr
        upper_limit = q3 + 1.5 * iqr

        if method == 'Trimming':
            
            mask = (modified_data[column] >= lower_limit) & (modified_data[column] <= upper_limit) 
            modified_data = modified_data[mask].reset_index(drop=True) 
            outlierless_data = modified_data.loc[mask] 
        elif method == 'Capping':
            
            modified_data[column] = np.where(modified_data[column] < lower_limit, lower_limit, np.where(modified_data[column] > upper_limit, upper_limit, modified_data[column]))
            outlierless_data = modified_data 
        elif method == 'Winsorization':
            
            lower_winsor = np.percentile(modified_data[column], 100 * winsorization_rate)
            upper_winsor = np.percentile(modified_data[column], 100 * (1 - winsorization_rate))
            modified_data[column] = np.where(modified_data[column] < lower_winsor, lower_winsor, np.where(modified_data[column] > upper_winsor, upper_winsor, modified_data[column]))
            outlierless_data = modified_data
        else:
            print(f"No hubo match {method}")

    return outlierless_data

In [103]:
def transform_data(data, method, p_thres):
    modified_data = data.copy()
    method_used = []

    if method == 'Auto':  
        for column in modified_data.columns:
            methods = ['Exp', 'BoxCox', 'Yeo-Johnson']  

            

            
            exp_data = np.exp(modified_data[column])
            _, p_val_exp = norm_test(exp_data, p_thres = p_thres)

            print(exp_data)

            
            pt = PowerTransformer(method = 'box-cox')
            if (modified_data[column] <= 0.0).any():
                
                box_data = modified_data[column]
                p_val_box = 1e-20
            else:
                box_data = pt.fit_transform(modified_data[column].values.reshape(-1, 1))
                _, p_val_box = norm_test(box_data, p_thres = p_thres)

            print(box_data)

            
            pt = PowerTransformer(method = 'yeo-johnson')
            yeo_data = pt.fit_transform(modified_data[column].values.reshape(-1, 1))
            _, p_val_yeo = norm_test(yeo_data, p_thres = p_thres)

            print(yeo_data)

            
            
            
            

            data = [exp_data, box_data, yeo_data]

            print(data)
            
            p_vals = [p_val_exp, p_val_box, p_val_yeo] 
            index = np.argmax(p_vals)  
            modified_data[column] = data[index]  
            method_used.append(methods[index]) 
        return modified_data, method_used
    else:
        for column in modified_data.columns:
            
            if norm_test(modified_data[column], p_thres = p_thres) == 'Normal':
                continue

            if method == 'Exp':
                if (modified_data[column] > 20).any(): 
                    print('Los numeros son demasiado grandes para hacer transformacion exponencial')
                    method_used.append('none') 
                    continue
                modified_data[column] = np.exp(modified_data[column]) 
                method_used.append('Exp')  
            elif method == 'box-cox':
                if (modified_data[column] <= 0.0).any():  
                    print('Hay numeros negativos o cero, no se puede hacer box-cox')
                    method_used.append('none')
                    continue
                pt = PowerTransformer(method = method) 
                modified_data[column] = pt.fit_transform(modified_data[column].values.reshape(-1, 1)) 
                method_used.append('box-cox')
            elif method == 'yeo-johnson':
                pt = PowerTransformer(method = method)
                modified_data[column] = pt.fit_transform(modified_data[column].values.reshape(-1, 1)) 
                method_used.append('Yeo-Johnson')
            elif method == 'Cuartiles':
                qt = QuantileTransformer(n_quantiles = 100, output_distribution='normal')
                modified_data[column] = qt.fit_transform(modified_data[[column]]) 
                method_used.append('Cuartiles')
            else:
                print(f"No hubo match {method}")
    return modified_data, method_used

In [104]:
def reduce_dimensionality(data, method, corr_thres = 0.95, var_thres = 0.01, normality_thres = 0.01, explained_var = 0.99, do_ica = False, filter_non_normal = False):
    modified_data = data.copy()

    if method == 'Filter':
        
        for column in modified_data.columns:  
            var = modified_data[column].var() 

            _, p_val = norm_test(modified_data[column], normality_thres) 

            if var < var_thres:  
                print(f'Quitando {column} por varianza {var}')  
                modified_data.drop(column, axis=1, inplace=True)  
            elif p_val < normality_thres and filter_non_normal:  
                print(f'Quitando {column} por no normal {p_val}')  
                modified_data.drop(column, axis=1, inplace=True)  

        
        corr_matrix = np.corrcoef(modified_data, rowvar=False)
        
        np.fill_diagonal(corr_matrix, 0)
        corr_mask = np.abs(corr_matrix) < corr_thres 
        modified_data = modified_data.loc[:, np.all(corr_mask, axis=0)] 

    elif method == 'Projection': 
        scaler = StandardScaler()   
        modified_data = scaler.fit_transform(modified_data) 

        
        pca = PCA(explained_var) 
        modified_data = pca.fit_transform(modified_data)
        print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
        
        if pca.n_components_ > 1 and do_ica:
            ica = FastICA()
            modified_data = ica.fit_transform(modified_data) 

        modified_data = pd.DataFrame(modified_data, columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)]) 

    else:
        print(f"No hubo match {method}")

    return modified_data


Primero revisamos el dataset. Antes de cualquier manipulacion de los datos, debemos explorar, entender los datos y finalmente comprender lo que deseamos hacer con los mismos. En este caso lo que se intenta es determinar si una persona es elegible para una tarjeta de credito. La variable de interes para este caso es "Is High Risk".

In [105]:
train_df.head(5)

Unnamed: 0,ID,Gender,Has a car,Has a property,Children count,Income,Employment status,Education level,Marital status,Dwelling,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Job title,Family member count,Account age,Is high risk
0,5037048,M,Y,Y,0,135000.0,Working,Secondary / secondary special,Married,With parents,-16271,-3111,1,0,0,0,Core staff,2.0,-17.0,0
1,5044630,F,Y,N,1,135000.0,Commercial associate,Higher education,Single / not married,House / apartment,-10130,-1651,1,0,0,0,Accountants,2.0,-1.0,0
2,5079079,F,N,Y,2,180000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-12821,-5657,1,0,0,0,Laborers,4.0,-38.0,0
3,5112872,F,Y,Y,0,360000.0,Commercial associate,Higher education,Single / not married,House / apartment,-20929,-2046,1,0,0,1,Managers,1.0,-11.0,0
4,5105858,F,N,N,0,270000.0,Working,Secondary / secondary special,Separated,House / apartment,-16207,-515,1,0,1,0,,1.0,-41.0,0


El primer analisis exploratorio a realizar es si hay alguna variable que no aporta significancia al resultado. En este dataset podemos ver que el ID de un cliente no ayuda a determinar si una persona es o no elegible para tarjeta de credito. Es simplemente un valor que el banco les asignó para identificarlos.

In [106]:
train_df_step1 = train_df.drop(['ID'], axis=1)

Explorando mas a fondo, vemos que todas las personas del dataset tienen telefono movil, por lo que tampoco aporta informacion.

In [107]:
print(train_df_step1['Has a mobile phone'].unique())
train_df_step2 = train_df_step1.drop(['Has a mobile phone'], axis=1)

[1]


Ahora nos enfocamos en el tipo de variables que hay: 
- Categoricas
    - Nominales
    - Ordinales
- Numericas
    - Discretas
    - Continuas

Las numericas no requieren transformacion, pero las categoricas si se deben convertir en numericas para poder analizarlas con cualquier modelo. Las mas sencillas son Gender, Has car y Has a property, ya que se remplazan directamente por 1 y 0.

In [108]:
with pd.option_context("future.no_silent_downcasting", True): #hacemos esto porque remplazar toda una columna de string por numeros enteros levanta sospechas a pandas
    train_df_step2['Gender'] = train_df_step2['Gender'].replace({'M': 0, 'F': 1})
    train_df_step2['Has a car'] = train_df_step2['Has a car'].replace({'N':0, 'Y':1})
    train_df_step2['Has a property'] = train_df_step2['Has a car'].replace({'N':0, 'Y':1})

Employment status, Education level, Marital status, Dwelling y Job title debemos analizarlas con mas detalle para ver cuantas categorias contiene cada una.

In [109]:
print(f"Employment status: {train_df['Employment status'].unique()}\n")
print(f"Education level: {train_df['Education level'].unique()}\n")
print(f"Marital status: {train_df['Marital status'].unique()}\n")
print(f"Dwelling: {train_df['Dwelling'].unique()}\n")
print(f"Job title: {train_df['Job title'].unique()}")


Employment status: ['Working' 'Commercial associate' 'Pensioner' 'State servant' 'Student']

Education level: ['Secondary / secondary special' 'Higher education' 'Lower secondary'
 'Incomplete higher' 'Academic degree']

Marital status: ['Married' 'Single / not married' 'Separated' 'Civil marriage' 'Widow']

Dwelling: ['With parents' 'House / apartment' 'Municipal apartment'
 'Rented apartment' 'Office apartment' 'Co-op apartment']

Job title: ['Core staff' 'Accountants' 'Laborers' 'Managers' nan 'Sales staff'
 'Medicine staff' 'High skill tech staff' 'HR staff' 'Low-skill Laborers'
 'Drivers' 'Secretaries' 'Cleaning staff' 'Cooking staff' 'Security staff'
 'Private service staff' 'IT staff' 'Waiters/barmen staff' 'Realty agents']


Employment status, Marital status, Dwelling y Job title son variables nominales mientras que Education level puede considerarse ordinal. 

A las variables nominales les podriamos aplicar One Hot Encoding, pero entre todas ellas tenemos 35 valores, por lo que terminariamos con muchas mas dimensiones que variables iniciales. Podriamos utilizar Binary Encoding, pero exploremos un poco mas los datos. Una observacion importante es que Job title tiene valores nulos, pero siempre se debe analizar con cuidado los datos. Uno podria suponer que aquellas personas sin titulo de trabajo, estan desempleadas, pero no debemos suponer.

In [110]:
nan_job_df = train_df[train_df['Job title'].isna()]
print(f"Employment status: {nan_job_df['Employment status'].unique()}\n")

Employment status: ['Working' 'Pensioner' 'Commercial associate' 'State servant']



Podemos ver que efectivamente algunas personas sin Job title asignado, tienen Employment status que indica que si tienen trabajo. En este caso, el titulo del trabajo de una persona no nos brinda tanta informacion sobre su eligibilidad para recibir una tarjeta de credito (principalmente porque tenemos la informacion completa acerca de sus ingresos). Pero si se quisiera utilizar esta informacion, se podria hacer imputacion de datos (algun algoritmo de clusterizacion como KNN, asumir la moda o incluso entrenar un modelo para primero rellenar esos valores). Quitando Job title, podemos utilizar One Hot Encoding sin agregar tantas dimensiones al dataset.

In [111]:
train_df_step3 = train_df_step2.drop(['Job title'], axis=1)

encoder = OneHotEncoder(sparse=False, drop='first')
train_df_step4 = encoder.fit_transform(train_df_step3[['Employment status', 'Education level', 'Marital status', 'Dwelling']])



TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'