In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class FormatColumns(BaseEstimator, TransformerMixin):
    """Parse the raw, text-polluted columns into numeric dtypes.

    The transformer is stateless: ``fit`` does nothing and ``transform``
    works on a copy of its input.

    Steps performed by ``transform``:
      * ``Name``: missing names are filled with the first non-missing name
        found for the same ``Customer_ID``.
      * ``Age`` and ``Num_of_Loan``: the first (optionally signed) integer in
        the text is extracted and cast to ``int``.
      * ``Annual_Income``, ``Outstanding_Debt``, ``Amount_invested_monthly``
        and ``Monthly_Balance``: the first (optionally signed) decimal number
        is extracted and cast to ``float``.
      * ``Num_of_Delayed_Payment``: the first run of digits is extracted (any
        sign is ignored); values without digits become NaN.
      * ``Delay_from_due_date``: replaced by its absolute value.
      * ``Monthly_Balance``: negative values are replaced by NaN.
    """

    # Optional sign followed by digits.
    _INTEGER_PATTERN = r'(-?\d+)'
    # Same as above plus an optional fractional part, so "19114.12_" yields
    # 19114.12 instead of being truncated to 19114.
    _DECIMAL_PATTERN = r'(-?\d+(?:\.\d+)?)'
    # Digits only: the sign is deliberately not captured.
    _UNSIGNED_PATTERN = r'(\d+)'

    _INTEGER_COLUMNS = ('Age', 'Num_of_Loan')
    _DECIMAL_COLUMNS = ('Annual_Income', 'Outstanding_Debt',
                        'Amount_invested_monthly', 'Monthly_Balance')

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _extract(column, pattern):
        """Return the first match of ``pattern`` in each value, as text (NaN if none)."""
        return column.astype(str).str.extract(pattern, expand=False)

    def transform(self, X):
        """Return a copy of ``X`` with the columns listed in the class docstring parsed.

        Raises whatever ``astype(int)`` raises when ``Age`` or ``Num_of_Loan``
        contains a value without any digits.
        """
        newx = X.copy()

        # One known name per customer, used to fill the gaps.
        names = (
            newx.dropna(subset=['Name'])
            .drop_duplicates('Customer_ID')
            .set_index('Customer_ID')['Name']
        )
        newx['Name'] = newx['Name'].fillna(newx['Customer_ID'].map(names))

        for column in self._INTEGER_COLUMNS:
            newx[column] = self._extract(newx[column], self._INTEGER_PATTERN).astype(int)

        for column in self._DECIMAL_COLUMNS:
            newx[column] = self._extract(newx[column], self._DECIMAL_PATTERN).astype(float)

        newx['Num_of_Delayed_Payment'] = pd.to_numeric(
            self._extract(newx['Num_of_Delayed_Payment'], self._UNSIGNED_PATTERN),
            errors='coerce',
        )
        newx['Delay_from_due_date'] = newx['Delay_from_due_date'].abs()

        # A negative balance is treated as missing.
        newx.loc[newx['Monthly_Balance'] < 0, 'Monthly_Balance'] = np.nan

        return newx

In [3]:
class CorrectOutliersAge(BaseEstimator, TransformerMixin):
    """Replace implausible ``Age`` values.

    An age is treated as an outlier when it is below 0 or above 100. Each
    outlier is replaced by the first non-outlier age recorded for the same
    ``Customer_ID``; when the customer has none, the median of all
    non-outlier ages in the frame is used instead.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with outlier ages replaced."""
        newx = X.copy()

        # As a first pass, values below 0 or above 100 are considered outliers.
        outliers = (newx['Age'] < 0) | (newx['Age'] > 100)
        if not outliers.any():
            return newx

        valid_rows = newx.loc[~outliers]
        median_age = valid_rows['Age'].median()

        # First valid age per customer, computed once instead of rescanning
        # the whole frame for every outlier row.
        first_valid_age = (
            valid_rows.drop_duplicates('Customer_ID')
            .set_index('Customer_ID')['Age']
        )
        replacement = (
            newx.loc[outliers, 'Customer_ID']
            .map(first_valid_age)
            .fillna(median_age)
        )
        # Positional assignment: the values are already in row order.
        newx.loc[outliers, 'Age'] = replacement.to_numpy()
        return newx

In [4]:
class CorrectVariationAge(BaseEstimator, TransformerMixin):
    """Smooth out inconsistent ages recorded for the same customer.

    * Computes the difference between the maximum and minimum ``Age``
      recorded for each ``Customer_ID``.
    * For customers whose difference is greater than 10 years:
        - computes the mean, standard deviation and median of their ages;
        - replaces by the median every age outside the interval
          [mean - 2 * std, mean + 2 * std].
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with per-customer age outliers set to the customer median."""
        newx = X.copy()

        # Per-customer statistics broadcast back onto every row, so the whole
        # frame is handled at once instead of being filtered per customer.
        ages_by_customer = newx.groupby('Customer_ID')['Age']
        age_spread = ages_by_customer.transform('max') - ages_by_customer.transform('min')
        mean_age = ages_by_customer.transform('mean')
        two_std = 2 * ages_by_customer.transform('std')
        median_age = ages_by_customer.transform('median')

        age = newx['Age']
        outside_band = (age < mean_age - two_std) | (age > mean_age + two_std)
        flagged = (age_spread > 10) & outside_band

        newx.loc[flagged, 'Age'] = median_age[flagged].to_numpy()
        return newx

In [5]:
class CorrectOccupation_Payment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        newx = X.copy()

        occupation_mode = newx[newx['Occupation'] != "_______"].groupby('Customer_ID')['Occupation'].agg(pd.Series.mode).reset_index()
        occupation_mode['Occupation'] = occupation_mode['Occupation'].apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)
        
        newx = newx.merge(occupation_mode, on='Customer_ID', how='left', suffixes=('', '_mode'))
        newx['Occupation'] = np.where(
            newx['Occupation'] == "_______",
            newx['Occupation_mode'],
            newx['Occupation']
        )
        
        newx.loc[newx['Occupation'] == 'Media_Manager', 'Occupation'] = 'Manager'
        newx = newx.drop(columns=['Occupation_mode'])
        
        mode_values = newx[newx['Payment_Behaviour'] != '!@9#%8'].groupby('Customer_ID')['Payment_Behaviour'].agg(lambda x: mode(x)[0][0])
        newx.loc[newx['Payment_Behaviour'] == '!@9#%8', 'Payment_Behaviour'] = newx.loc[newx['Payment_Behaviour'] == '!@9#%8', 'Customer_ID'].map(mode_values)
        
        return newx

In [6]:
class FillNaMonthlySalary(BaseEstimator, TransformerMixin):
    """Fill missing ``Monthly_Inhand_Salary`` values from the same customer.

    For every row whose ``Monthly_Inhand_Salary`` is null:

    * if the row directly above belongs to the same ``Customer_ID`` and has a
      value, that value is copied down (consecutive nulls of the same
      customer all receive it);
    * otherwise the nearest row below with the same ``Customer_ID`` and a
      non-null value is used;
    * when neither exists, the value stays null.

    Only rows of the same customer are ever written to, and rows are
    addressed by position, so the frame's index may have any labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Monthly_Inhand_Salary`` filled as described on the class."""
        X_copy = X.copy()

        codes, _ = pd.factorize(X_copy['Customer_ID'])
        # factorize marks a missing Customer_ID with -1; give each such row
        # its own code so unknown customers never share values.
        codes = np.where(codes < 0, -1 - np.arange(len(codes)), codes)

        # A "run" is a block of consecutive rows of one customer.
        run_start = np.ones(len(codes), dtype=bool)
        run_start[1:] = codes[1:] != codes[:-1]
        run_id = np.cumsum(run_start)

        salary = X_copy['Monthly_Inhand_Salary']
        # 1) copy down from the row above, within a run of the same customer;
        # 2) what is still null looks down to the customer's next known value.
        filled = salary.groupby(run_id).ffill()
        filled = filled.groupby(codes).bfill()

        X_copy['Monthly_Inhand_Salary'] = filled
        return X_copy

    def fill_na_Monthly_Inhand_Salary(self, row_idx, df):
        """Fill, in place, the salary of the row at position ``row_idx`` if it is null.

        Kept for callers that fill row by row; ``transform`` does not use it.
        Calling it for every position in ascending order gives the same
        result as ``transform``.
        """
        salary_col = df.columns.get_loc('Monthly_Inhand_Salary')
        customers = df['Customer_ID'].to_numpy()
        salaries = df['Monthly_Inhand_Salary'].to_numpy()

        if not pd.isna(salaries[row_idx]):
            return

        customer = customers[row_idx]

        # Prefer the row directly above when it is the same customer.
        if (row_idx > 0
                and customers[row_idx - 1] == customer
                and not pd.isna(salaries[row_idx - 1])):
            df.iloc[row_idx, salary_col] = salaries[row_idx - 1]
            return

        # Otherwise use the nearest row below of the same customer.
        for below_idx in range(row_idx + 1, len(df)):
            if customers[below_idx] == customer and not pd.isna(salaries[below_idx]):
                df.iloc[row_idx, salary_col] = salaries[below_idx]
                return

In [7]:
class TrfNumBankAccounts(BaseEstimator, TransformerMixin):
    """Clean ``Num_Bank_Accounts``.

    The column is replaced by its absolute value; rows that then show zero
    bank accounts while ``Num_Credit_Card`` is at least 1 are set to 1.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Num_Bank_Accounts`` adjusted."""
        newx = X.copy()

        accounts = newx['Num_Bank_Accounts'].abs()
        # Someone holding at least one credit card is given one bank account
        # instead of zero.
        has_card_but_no_account = accounts.eq(0) & newx['Num_Credit_Card'].ge(1)
        newx['Num_Bank_Accounts'] = accounts.mask(has_card_but_no_account, 1)

        return newx

In [8]:
class ReplaceOutliers(BaseEstimator, TransformerMixin):
    """Replace extreme values by the customer's most frequent value.

    A value is an outlier when ``Num_Bank_Accounts`` > 10,
    ``Num_Credit_Card`` > 10 or ``Interest_Rate`` > 25. Each outlier is
    replaced by the mode of the non-outlier values of the same
    ``Customer_ID``; it is left unchanged when the customer has no
    non-outlier value. The input index, row order and columns are preserved.
    """

    # (column, largest value that is NOT treated as an outlier)
    _UPPER_LIMITS = (
        ('Num_Bank_Accounts', 10),
        ('Num_Credit_Card', 10),
        ('Interest_Rate', 25),
    )

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _first_mode(values):
        """Return the first mode of ``values``, or NaN when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the outliers replaced."""
        X_copy = X.copy()

        for column, upper_limit in self._UPPER_LIMITS:
            is_outlier = X_copy[column] > upper_limit
            if not is_outlier.any():
                continue

            # The mode is taken over the plausible values only, so an outlier
            # can never be chosen as its own replacement.
            customer_mode = (
                X_copy.loc[~is_outlier]
                .groupby('Customer_ID')[column]
                .agg(self._first_mode)
            )
            replacement = X_copy.loc[is_outlier, 'Customer_ID'].map(customer_mode)
            # No usable mode for the customer: keep the original value.
            replacement = replacement.fillna(X_copy.loc[is_outlier, column])
            X_copy.loc[is_outlier, column] = replacement.to_numpy()

        return X_copy

    def replace_outliers(self, sub_df):
        """Apply the same replacement to ``sub_df``, treated as one customer's rows.

        ``sub_df`` is modified in place and also returned. ``transform`` does
        not use this method; it is kept for callers that process one group at
        a time.
        """
        for column, upper_limit in self._UPPER_LIMITS:
            is_outlier = sub_df[column] > upper_limit
            modes = sub_df.loc[~is_outlier, column].mode()
            if is_outlier.any() and not modes.empty:
                sub_df.loc[is_outlier, column] = modes.iloc[0]
        return sub_df

In [9]:
class TrfNumOfLoan(BaseEstimator, TransformerMixin):
    """Zero out negative loan counts for rows without a loan type.

    Rows where ``Type_of_Loan`` is null and ``Num_of_Loan`` is below zero get
    ``Num_of_Loan`` set to 0, on the reading that such a person took no loan.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with those ``Num_of_Loan`` values set to 0."""
        newx = X.copy()

        loan_type_missing = newx['Type_of_Loan'].isna()
        loan_count_negative = newx['Num_of_Loan'].lt(0)
        newx['Num_of_Loan'] = newx['Num_of_Loan'].mask(
            loan_type_missing & loan_count_negative, 0
        )

        return newx

In [10]:
class TrfCreditHistoryAge(BaseEstimator, TransformerMixin):
    """Convert ``Credit_History_Age`` text into a total number of months.

    Adds three columns:
      * ``Years_Credit_History_Age`` and ``Months_Credit_History_Age``: the
        two numbers parsed from text shaped like "N Years and M Months"
        (NaN when the text does not match);
      * ``Total_Months_Credit_History_Age``: ``years * 12 + months`` with the
        gaps filled per ``Customer_ID``.

    Gap filling assumes the total grows by one from one row of a customer to
    the next (the original code used the same +/-1 step at the edges):
      * gaps between two known values are linearly interpolated;
      * leading gaps count down by one per row from the first known value;
      * trailing gaps count up by one per row from the last known value.

    The total is cast to ``int`` when no gap is left. If a customer has no
    known value at all, its rows stay NaN and the column stays float.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _fill_customer_months(months):
        """Fill the gaps in one customer's series of total months."""
        values = months.to_numpy(dtype=float, copy=True)
        known = np.flatnonzero(~np.isnan(values))
        if known.size == 0:
            # Nothing to extrapolate from.
            return months

        first, last = known[0], known[-1]
        # Leading gap: ..., first - 2, first - 1, <first known value>
        values[:first] = values[first] - np.arange(first, 0, -1)
        # Trailing gap: <last known value>, last + 1, last + 2, ...
        values[last + 1:] = values[last] + np.arange(1, len(values) - last)

        return pd.Series(values, index=months.index).interpolate(limit_area='inside')

    def transform(self, X):
        """Return a copy of ``X`` with the three derived columns added."""
        newx = X.copy()

        parts = (
            newx['Credit_History_Age']
            .astype(str)
            .str.extract(r'(\d+) Years and (\d+) Months')
            .astype(float)
        )
        newx['Years_Credit_History_Age'] = parts[0]
        newx['Months_Credit_History_Age'] = parts[1]

        total_months = parts[0] * 12 + parts[1]
        total_months = total_months.groupby(newx['Customer_ID']).transform(
            self._fill_customer_months
        )
        if total_months.notna().all():
            total_months = total_months.astype(int)
        newx['Total_Months_Credit_History_Age'] = total_months
        return newx

In [11]:
class FillNaNumOfLoan(BaseEstimator, TransformerMixin):
    """Overwrite positive ``Num_of_Loan`` values on rows without a ``Type_of_Loan``.

    Every row whose ``Type_of_Loan`` is null while ``Num_of_Loan`` is
    positive has its ``Num_of_Loan`` replaced, in row order, by:

    * the value of the row directly above, when that row has the same
      ``Customer_ID`` and a non-null value; otherwise
    * the value of the nearest row below with the same ``Customer_ID`` and a
      non-null value.

    When neither exists the value is left as it is. Rows are addressed by
    position, so the frame's index may have any labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the inconsistent ``Num_of_Loan`` values replaced."""
        X_copy = X.copy()
        inconsistent = (
            X_copy['Type_of_Loan'].isna() & (X_copy['Num_of_Loan'] > 0)
        ).to_numpy()

        # Processed top to bottom: a row may reuse the value just written to
        # the row above it.
        for position in np.flatnonzero(inconsistent):
            self.fill_na_Num_of_Loan(int(position), X_copy)

        return X_copy

    def fill_na_Num_of_Loan(self, row_idx, df):
        """Replace, in place, ``Num_of_Loan`` of the row at position ``row_idx``.

        ``row_idx`` is a 0-based row position; on a default 0..n-1 index this
        is the same as the row label.
        """
        loan_col = df.columns.get_loc('Num_of_Loan')
        customers = df['Customer_ID'].to_numpy()
        loans = df['Num_of_Loan'].to_numpy()
        customer = customers[row_idx]

        fill_value = None
        if (row_idx > 0
                and customers[row_idx - 1] == customer
                and not pd.isna(loans[row_idx - 1])):
            fill_value = loans[row_idx - 1]
        else:
            for below_idx in range(row_idx + 1, len(df)):
                if customers[below_idx] == customer and not pd.isna(loans[below_idx]):
                    fill_value = loans[below_idx]
                    break

        if fill_value is not None:
            df.iloc[row_idx, loan_col] = fill_value

        # Safety net kept from the original: never leave a null behind.
        if pd.isna(df.iloc[row_idx, loan_col]):
            df.iloc[row_idx, loan_col] = 0

In [12]:
class TrfCreditScore(BaseEstimator, TransformerMixin):
    """Encode ``Credit_Score`` as an integer.

    ``'Poor'`` -> 0, ``'Standard'`` -> 1, ``'Good'`` -> 2. Any other value is
    left as it is before the final cast to ``int``.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Credit_Score`` encoded and cast to ``int``."""
        label_to_code = {'Poor': 0, 'Standard': 1, 'Good': 2}

        newx = X.copy()
        encoded = newx['Credit_Score'].map(
            lambda value: label_to_code.get(value, value)
        )
        newx['Credit_Score'] = encoded.astype(int)
        return newx

In [14]:
class CorrectOutliersNumOfLoan(BaseEstimator, TransformerMixin):
    """Replace out-of-range ``Num_of_Loan`` values by the customer's median.

    A value is replaced when it is negative or greater than the 75th
    percentile of ``Num_of_Loan`` for the same ``Customer_ID``. The median
    and the percentile are computed over all of the customer's rows. The
    input index and row order are preserved.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the flagged ``Num_of_Loan`` values replaced."""
        newx = X.copy()

        loans_by_customer = newx.groupby('Customer_ID')['Num_of_Loan']
        # Mapping the per-customer statistics onto the rows keeps the index
        # and every row, which a merge on Customer_ID would not guarantee.
        customer_median = newx['Customer_ID'].map(loans_by_customer.median())
        customer_p75 = newx['Customer_ID'].map(loans_by_customer.quantile(0.75))

        loans = newx['Num_of_Loan']
        is_outlier = (loans > customer_p75) | (loans < 0)
        newx['Num_of_Loan'] = np.where(is_outlier, customer_median, loans)

        return newx

In [15]:
class FillNaNumOfDelayedPayment(BaseEstimator, TransformerMixin):
    """Fill missing ``Num_of_Delayed_Payment`` values.

    For each null value:

    1. if another row of the same ``Customer_ID`` has the same
       ``Delay_from_due_date`` and a known ``Num_of_Delayed_Payment``, the
       first such value is used;
    2. otherwise the median ``Num_of_Delayed_Payment`` of the customer
       (computed after step 1) is used.

    The column is finally cast to ``int``, which truncates fractional
    medians and raises if a customer has no known value at all. The input
    index and row order are preserved.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Num_of_Delayed_Payment`` filled and cast to ``int``."""
        newx = X.copy()
        column = 'Num_of_Delayed_Payment'

        # 'first' skips nulls, so every row receives the first known value of
        # its (customer, delay) pair, or NaN when the pair has none.
        same_delay_value = (
            newx.groupby(['Customer_ID', 'Delay_from_due_date'])[column]
            .transform('first')
        )
        newx[column] = newx[column].fillna(same_delay_value)

        customer_median = newx.groupby('Customer_ID')[column].transform('median')
        newx[column] = newx[column].fillna(customer_median).astype(int)

        return newx

In [16]:
class ReplaceOutliersNumOfDelayedPayment(BaseEstimator, TransformerMixin):
    """Replace extreme ``Num_of_Delayed_Payment`` values.

    Values above the 99.2th percentile of the column are treated as
    outliers. Each one is replaced by the median ``Num_of_Delayed_Payment``
    of the non-outlier rows that share its ``Delay_from_due_date``. It
    becomes NaN when no such row exists.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the outliers replaced."""
        newx = X.copy()

        payments = newx['Num_of_Delayed_Payment']
        outlier = payments > payments.quantile(0.992)
        if not outlier.any():
            return newx

        # One median per delay value, computed once instead of once per
        # outlier row.
        median_by_delay = (
            newx.loc[~outlier]
            .groupby('Delay_from_due_date')['Num_of_Delayed_Payment']
            .median()
        )
        replacement = newx.loc[outlier, 'Delay_from_due_date'].map(median_by_delay)
        newx.loc[outlier, 'Num_of_Delayed_Payment'] = replacement.to_numpy()
        return newx


In [17]:
class TrfChangedCreditLimit(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        newx = X.copy()
        
        newx.loc[newx['Changed_Credit_Limit'] == "_", 'Changed_Credit_Limit'] = np.nan
        mediana_cliente = newx.groupby('Customer_ID')['Changed_Credit_Limit'].median()
        
        newx['Changed_Credit_Limit'] = newx.apply(
            lambda row: mediana_cliente[row['Customer_ID']] if pd.isnull(row['Changed_Credit_Limit']) 
            else row['Changed_Credit_Limit'], axis=1
        )
        
        newx['Changed_Credit_Limit'] = newx['Changed_Credit_Limit'].astype(float).round(2)
        newx['Changed_Credit_Limit'] = newx['Changed_Credit_Limit'].abs()
        
        return newx

In [18]:
class TrfNumCreditInquiries(BaseEstimator, TransformerMixin):
    """Fill and cap ``Num_Credit_Inquiries`` using the customer's median.

    * Missing values are filled with the median of the same ``Customer_ID``.
    * Values greater than twice that median are replaced by the median.

    The median is computed once, on the original non-missing values.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Num_Credit_Inquiries`` filled and capped."""
        newx = X.copy()

        mediana = newx.groupby('Customer_ID')['Num_Credit_Inquiries'].median()
        # Each row's own customer median, aligned with the frame.
        customer_median = newx['Customer_ID'].map(mediana)

        inquiries = newx['Num_Credit_Inquiries'].fillna(customer_median)
        newx['Num_Credit_Inquiries'] = inquiries.mask(
            inquiries > customer_median * 2, customer_median
        )

        return newx

    def replace_outliers(self, row, mediana):
        """Return the capped ``Num_Credit_Inquiries`` for a single row.

        ``row`` and ``mediana`` are indexed with ``row['Customer_ID']`` and
        ``row['Num_Credit_Inquiries']``. ``transform`` does not use this
        method; it is kept for callers that process one row at a time.
        """
        median_value = mediana[row['Customer_ID']]
        if row['Num_Credit_Inquiries'] > median_value * 2:
            return median_value
        return row['Num_Credit_Inquiries']

In [19]:
class TrfCreditMix(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        newx = X.copy()
        
        newx.loc[newx['Credit_Mix'] == 'Bad', 'Credit_Mix'] = 0
        newx.loc[newx['Credit_Mix'] == 'Standard', 'Credit_Mix'] = 1
        newx.loc[newx['Credit_Mix'] == 'Good', 'Credit_Mix'] = 2

        newx.loc[newx['Credit_Mix'] == "_", 'Credit_Mix'] = np.nan

        mode = newx.groupby('Customer_ID')['Credit_Mix'].apply(lambda x: x.mode().iloc[0])
        
        newx['Credit_Mix'] = newx.apply(
            lambda row: mode[row['Customer_ID']] if pd.isnull(row['Credit_Mix']) 
            else row['Credit_Mix'], axis=1
        )
        
        return newx

In [20]:
class TrfPaymentOfMinAmount(BaseEstimator, TransformerMixin):
    """Encode ``Payment_of_Min_Amount`` as an integer flag.

    ``'Yes'`` -> 1; ``'No'`` and ``'NM'`` -> 0. Any other value is left as it
    is before the final cast to ``int``.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Payment_of_Min_Amount`` encoded and cast to ``int``."""
        newx = X.copy()

        flag = newx['Payment_of_Min_Amount']
        # 'NM' is handled as 'No'.
        flag = flag.where(flag != 'NM', 'No')
        flag = flag.where(flag != 'No', 0)
        flag = flag.where(flag != 'Yes', 1)

        newx['Payment_of_Min_Amount'] = flag.astype(int)
        return newx

In [21]:
class ReplaceAnnualIncome(BaseEstimator, TransformerMixin):
    """Replace high ``Annual_Income`` values by the customer's usual value.

    Every value above the 75th percentile of the whole column is replaced by
    the most frequent ``Annual_Income`` of the same ``Customer_ID`` (the
    customer's first value when no mode exists). Values at or below the
    threshold, and missing values, are left unchanged.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _mode_or_first(values):
        """Return the first mode of ``values``, or its first element when there is no mode."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else values.iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with the high ``Annual_Income`` values replaced."""
        newx = X.copy()

        income = newx['Annual_Income']
        threshold = income.quantile(0.75)
        above = income > threshold
        if not above.any():
            return newx

        # One mode per customer, then a lookup for the flagged rows only.
        customer_mode = (
            newx.groupby('Customer_ID')['Annual_Income'].agg(self._mode_or_first)
        )
        replacement = newx.loc[above, 'Customer_ID'].map(customer_mode)
        newx.loc[above, 'Annual_Income'] = replacement.to_numpy()
        return newx

In [22]:
class DropAnnualIncome(BaseEstimator, TransformerMixin):
    """Drop the rows with the highest ``Annual_Income``.

    Only rows whose ``Annual_Income`` is at or below the column's 90th
    percentile are kept. Rows with a missing ``Annual_Income`` are dropped
    too, since the comparison is False for NaN. The output therefore has
    fewer rows than the input and keeps the surviving index labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of a copy of ``X`` at or below the 90th percentile."""
        working = X.copy()
        income = working['Annual_Income']
        cutoff = income.quantile(0.90)
        keep = income.le(cutoff)
        return working[keep]

In [23]:
class ReplaceMonthlyInhandSalary(BaseEstimator, TransformerMixin):
    """Replace high ``Monthly_Inhand_Salary`` values by the customer's usual value.

    Every value above the 75th percentile of the whole column is replaced by
    the most frequent ``Monthly_Inhand_Salary`` of the same ``Customer_ID``
    (the customer's first value when no mode exists). Values at or below the
    threshold, and missing values, are left unchanged.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _mode_or_first(values):
        """Return the first mode of ``values``, or its first element when there is no mode."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else values.iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with the high ``Monthly_Inhand_Salary`` values replaced."""
        newx = X.copy()

        salary = newx['Monthly_Inhand_Salary']
        threshold = salary.quantile(0.75)
        above = salary > threshold
        if not above.any():
            return newx

        # One mode per customer, then a lookup for the flagged rows only.
        customer_mode = (
            newx.groupby('Customer_ID')['Monthly_Inhand_Salary'].agg(self._mode_or_first)
        )
        replacement = newx.loc[above, 'Customer_ID'].map(customer_mode)
        newx.loc[above, 'Monthly_Inhand_Salary'] = replacement.to_numpy()
        return newx

In [24]:
class DropMonthlyInhandSalary(BaseEstimator, TransformerMixin):
    """Drop the rows with the highest ``Monthly_Inhand_Salary``.

    Only rows whose ``Monthly_Inhand_Salary`` is at or below the column's
    90th percentile are kept. Rows with a missing value are dropped too,
    since the comparison is False for NaN. The output therefore has fewer
    rows than the input and keeps the surviving index labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of a copy of ``X`` at or below the 90th percentile."""
        working = X.copy()
        salary = working['Monthly_Inhand_Salary']
        cutoff = salary.quantile(0.90)
        keep = salary.le(cutoff)
        return working[keep]

In [25]:
class ReplaceAmountInvestedMonthly(BaseEstimator, TransformerMixin):
    """Replace high ``Amount_invested_monthly`` values by the customer's usual value.

    Every value above the 75th percentile of the whole column is replaced by
    the most frequent ``Amount_invested_monthly`` of the same ``Customer_ID``
    (the customer's first value when no mode exists). Values at or below the
    threshold, and missing values, are left unchanged.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _mode_or_first(values):
        """Return the first mode of ``values``, or its first element when there is no mode."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else values.iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with the high ``Amount_invested_monthly`` values replaced."""
        newx = X.copy()

        invested = newx['Amount_invested_monthly']
        threshold = invested.quantile(0.75)
        above = invested > threshold
        if not above.any():
            return newx

        # One mode per customer, then a lookup for the flagged rows only.
        customer_mode = (
            newx.groupby('Customer_ID')['Amount_invested_monthly'].agg(self._mode_or_first)
        )
        replacement = newx.loc[above, 'Customer_ID'].map(customer_mode)
        newx.loc[above, 'Amount_invested_monthly'] = replacement.to_numpy()
        return newx

In [26]:
class DropAmountInvestedMonthly(BaseEstimator, TransformerMixin):
    """Drop the rows with the highest ``Amount_invested_monthly``.

    Only rows whose ``Amount_invested_monthly`` is at or below the column's
    98th percentile are kept. Rows with a missing value are dropped too,
    since the comparison is False for NaN. The output therefore has fewer
    rows than the input and keeps the surviving index labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of a copy of ``X`` at or below the 98th percentile."""
        working = X.copy()
        invested = working['Amount_invested_monthly']
        cutoff = invested.quantile(0.98)
        keep = invested.le(cutoff)
        return working[keep]

In [27]:
class ReplaceTotalEMIPerMonth(BaseEstimator, TransformerMixin):
    """Replace high ``Total_EMI_per_month`` values by the customer's usual value.

    Every value above the 75th percentile of the whole column is replaced by
    the most frequent ``Total_EMI_per_month`` of the same ``Customer_ID``
    (the customer's first value when no mode exists). Values at or below the
    threshold, and missing values, are left unchanged.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _mode_or_first(values):
        """Return the first mode of ``values``, or its first element when there is no mode."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else values.iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with the high ``Total_EMI_per_month`` values replaced."""
        newx = X.copy()

        emi = newx['Total_EMI_per_month']
        threshold = emi.quantile(0.75)
        above = emi > threshold
        if not above.any():
            return newx

        # One mode per customer, then a lookup for the flagged rows only.
        customer_mode = (
            newx.groupby('Customer_ID')['Total_EMI_per_month'].agg(self._mode_or_first)
        )
        replacement = newx.loc[above, 'Customer_ID'].map(customer_mode)
        newx.loc[above, 'Total_EMI_per_month'] = replacement.to_numpy()
        return newx

In [28]:
class DropTotalEMIPerMonth(BaseEstimator, TransformerMixin):
    """Drop the rows with the highest ``Total_EMI_per_month``.

    Only rows whose ``Total_EMI_per_month`` is at or below the column's 99th
    percentile are kept. Rows with a missing value are dropped too, since
    the comparison is False for NaN. The output therefore has fewer rows
    than the input and keeps the surviving index labels.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of a copy of ``X`` at or below the 99th percentile."""
        working = X.copy()
        emi = working['Total_EMI_per_month']
        cutoff = emi.quantile(0.99)
        keep = emi.le(cutoff)
        return working[keep]

In [29]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from the frame.

    Dropped columns: ``ID``, ``Years_Credit_History_Age``,
    ``Months_Credit_History_Age``, ``Credit_History_Age``, ``Type_of_Loan``,
    ``SSN``, ``Credit_Mix``, ``Month``, ``Occupation``, ``Customer_ID``,
    ``Name`` and ``Annual_Income``. All of them must be present in the input.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns listed on the class."""
        columns_drop = [
            'ID',
            'Years_Credit_History_Age',
            'Months_Credit_History_Age',
            'Credit_History_Age',
            'Type_of_Loan',
            'SSN',
            'Credit_Mix',
            'Month',
            'Occupation',
            'Customer_ID',
            'Name',
            'Annual_Income',
        ]
        return X.copy().drop(columns=columns_drop)