In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft
import featuretools.variable_types as vtypes

# Utilities
import sys
import psutil
import os

from timeit import default_timer as timer

In [2]:
def convert_types(df):
    """Convert pandas data types for memory reduction."""
    
    # Iterate through each column
    for c in df:
        
        # Convert ids and booleans to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            
        # Convert objects to category
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        
        # Booleans mapped to integers
        elif set(df[c].unique()) == {0, 1}:
            df[c] = df[c].astype(bool)
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    return df

In [3]:
# Read in the datasets and replace the anomalous values
app_train = pd.read_csv('../input/application_train.csv').replace({365243: np.nan})
app_test = pd.read_csv('../input/application_test.csv').replace({365243: np.nan})
bureau = pd.read_csv('../input/bureau.csv').replace({365243: np.nan})
bureau_balance = pd.read_csv('../input/bureau_balance.csv').replace({365243: np.nan})
cash = pd.read_csv('../input/POS_CASH_balance.csv').replace({365243: np.nan})
credit = pd.read_csv('../input/credit_card_balance.csv').replace({365243: np.nan})
previous = pd.read_csv('../input/previous_application.csv').replace({365243: np.nan})
installments = pd.read_csv('../input/installments_payments.csv').replace({365243: np.nan})

app_test['TARGET'] = np.nan

# Join together training and testing
app = app_train.append(app_test, ignore_index = True, sort = True)
number_clients = app.shape[0]

# Need `SK_ID_CURR` in every dataset
bureau_balance = bureau_balance.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], 
                                      on = 'SK_ID_BUREAU', how = 'left')

print(f"""Total memory before converting types: \
{round(np.sum([x.memory_usage().sum() / 1e9 for x in 
[app, bureau, bureau_balance, cash, credit, previous, installments]]), 2)} gb.""")

# Convert types to reduce memory usage
app = convert_types(app)
bureau = convert_types(bureau)
bureau_balance = convert_types(bureau_balance)
cash = convert_types(cash)
credit = convert_types(credit)
previous = convert_types(previous)
installments = convert_types(installments)

print(f"""Total memory after converting types: \
{round(np.sum([x.memory_usage().sum() / 1e9 for x in 
[app, bureau, bureau_balance, cash, credit, previous, installments]]), 2)} gb.""")

# Set the index for locating
for dataset in [app, bureau, bureau_balance, cash, credit, previous, installments]:
    dataset.set_index('SK_ID_CURR', inplace = True)

Total memory before converting types: 4.38 gb.
Total memory after converting types: 2.03 gb.


In [4]:
print('Object memory usage.')
print(bureau['CREDIT_TYPE'].astype('object').memory_usage() / 1e9, 'gb')

print('Category memory usage.')
print(bureau['CREDIT_TYPE'].astype('category').memory_usage() / 1e9, 'gb')

print('Length of data: ', bureau.shape[0])
print('Number of unique categories: ', bureau['CREDIT_TYPE'].nunique())

Object memory usage.
0.027462848 gb
Category memory usage.
0.015448612 gb
Length of data:  1716428
Number of unique categories:  15


In [3]:
def create_partition(user_list, partition):
    """Creates and saves a dataset with only the users in `user_list`."""
    
    # Make the directory
    directory = '../input/partitions/p%d' % (partition + 1)
    if os.path.exists(directory):
        return
    
    else:
        os.makedirs(directory)
        
        # Subset based on user list
        app_subset = app[app.index.isin(user_list)].copy().reset_index()
        bureau_subset = bureau[bureau.index.isin(user_list)].copy().reset_index()

        # Drop SK_ID_CURR from bureau_balance, cash, credit, and installments
        bureau_balance_subset = bureau_balance[bureau_balance.index.isin(user_list)].copy().reset_index(drop = True)
        cash_subset = cash[cash.index.isin(user_list)].copy().reset_index(drop = True)
        credit_subset = credit[credit.index.isin(user_list)].copy().reset_index(drop = True)
        previous_subset = previous[previous.index.isin(user_list)].copy().reset_index()
        installments_subset = installments[installments.index.isin(user_list)].copy().reset_index(drop = True)
        

        # Save data to the directory
        app_subset.to_csv('%s/app.csv' % directory, index = False)
        bureau_subset.to_csv('%s/bureau.csv' % directory, index = False)
        bureau_balance_subset.to_csv('%s/bureau_balance.csv' % directory, index = False)
        cash_subset.to_csv('%s/cash.csv' % directory, index = False)
        credit_subset.to_csv('%s/credit.csv' % directory, index = False)
        previous_subset.to_csv('%s/previous.csv' % directory, index = False)
        installments_subset.to_csv('%s/installments.csv' % directory, index = False)

        if partition % 10 == 0:
            print('Saved all files in partition {} to {}.'.format(partition + 1, directory))



In [6]:
# Break into 104 chunks
chunk_size = app.shape[0] // 103

# Construct an id list
id_list = [list(app.iloc[i:i+chunk_size].index) for i in range(0, app.shape[0], chunk_size)]

In [7]:
from itertools import chain

# Sanity check that we have not missed any ids
print('Number of ids in id_list:         {}.'.format(len(list(chain(*id_list)))))
print('Total length of application data: {}.'.format(len(app)))

Number of ids in id_list:         356255.
Total length of application data: 356255.


In [8]:
start = timer()
for i, ids in enumerate(id_list):
    # Create a partition based on the ids
    create_partition(ids, i)
    
end = timer()
print(f'Partitioning took {round(end - start)} seconds.')

Saved all files in partition 1 to ../input/partitions/p1.
Saved all files in partition 11 to ../input/partitions/p11.
Saved all files in partition 21 to ../input/partitions/p21.
Saved all files in partition 31 to ../input/partitions/p31.
Saved all files in partition 41 to ../input/partitions/p41.
Saved all files in partition 51 to ../input/partitions/p51.
Saved all files in partition 61 to ../input/partitions/p61.
Saved all files in partition 71 to ../input/partitions/p71.
Saved all files in partition 81 to ../input/partitions/p81.
Saved all files in partition 91 to ../input/partitions/p91.
Saved all files in partition 101 to ../input/partitions/p101.
Partitioning took 1358 seconds.


In [2]:
feature_defs = ft.load_features('../input/features.txt')
print(len(feature_defs))

1820


In [7]:
def entityset_from_partition(path):
    """Create an EntitySet from a partition of data specified as a path.
       Returns a dictionary with the entityset and the number used for saving the feature matrix."""
    
    partition_num = int(path[21:])
    
    # Read in data
    app = pd.read_csv('%s/app.csv' % path)
    bureau = pd.read_csv('%s/bureau.csv' % path)
    bureau_balance = pd.read_csv('%s/bureau_balance.csv' % path)
    previous = pd.read_csv('%s/previous.csv' % path)
    credit = pd.read_csv('%s/credit.csv' % path)
    installments = pd.read_csv('%s/installments.csv' % path)
    cash = pd.read_csv('%s/cash.csv' % path)
    
    # Empty entityset
    es = ft.EntitySet(id = 'clients')
    
    # Entities with a unique index
    es = es.entity_from_dataframe(entity_id = 'app', dataframe = app, index = 'SK_ID_CURR')

    es = es.entity_from_dataframe(entity_id = 'bureau', dataframe = bureau, index = 'SK_ID_BUREAU')

    es = es.entity_from_dataframe(entity_id = 'previous', dataframe = previous, index = 'SK_ID_PREV')

    # Entities that do not have a unique index
    es = es.entity_from_dataframe(entity_id = 'bureau_balance', dataframe = bureau_balance, 
                                  make_index = True, index = 'bureaubalance_index')

    es = es.entity_from_dataframe(entity_id = 'cash', dataframe = cash, 
                                  make_index = True, index = 'cash_index')

    es = es.entity_from_dataframe(entity_id = 'installments', dataframe = installments,
                                  make_index = True, index = 'installments_index')

    es = es.entity_from_dataframe(entity_id = 'credit', dataframe = credit,
                                  make_index = True, index = 'credit_index')
    
    # Relationship between app_train and bureau
    r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])

    # Relationship between bureau and bureau balance
    r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU'])

    # Relationship between current app and previous apps
    r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])

    # Relationships between previous apps and cash, installments, and credit
    r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])
    r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'], es['installments']['SK_ID_PREV'])
    r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])
    
    # Add in the defined relationships
    es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                               r_previous_cash, r_previous_installments, r_previous_credit])

    return ({'es': es, 'num': partition_num})


In [8]:
#Just a test of the function above
es1_dict = entityset_from_partition('../input/partitions/p1')
es1_dict['es']

Entityset: clients
  Entities:
    app [Rows: 3458, Columns: 122]
    bureau [Rows: 16097, Columns: 17]
    previous [Rows: 16204, Columns: 37]
    bureau_balance [Rows: 166374, Columns: 4]
    cash [Rows: 96632, Columns: 8]
    installments [Rows: 129130, Columns: 8]
    credit [Rows: 35694, Columns: 23]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV

In [9]:
def feature_matrix_from_entityset(es_dict, feature_defs, return_fm = False):
    """Run deep feature synthesis from an entityset and feature definitions. 
    Saves feature matrix based on partition.""" 
    
    # Extract the entityset
    es = es_dict['es']
    
    # Calculate the feature matrix and save
    feature_matrix = ft.calculate_feature_matrix(feature_defs, 
                                                 entityset=es, 
                                                 n_jobs = 1, 
                                                 verbose = 0,
                                                 chunk_size = es['app'].df.shape[0])
    
    feature_matrix.to_csv('../input/fm/p%d_fm.csv' % es_dict['num'], index = True)
    
    if return_fm:
        return feature_matrix

In [12]:
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

start = timer()
fm1 = feature_matrix_from_entityset(es1_dict, feature_defs, return_fm = True)
end = timer()
fm1.shape

(3458, 1820)

In [13]:
print(f'Computing one feature matrix took {round(end - start, 2)} seconds.')

Computing one feature matrix took 65.0 seconds.


In [None]:
import gc

# Free up all system memory
gc.enable()
del app, bureau, bureau_balance, previous, credit, cash, installments
gc.collect()

In [15]:
import dask.bag as db
from dask.distributed import Client

# Use all 8 cores
client = Client(processes = True)

client.ncores()

{'tcp://127.0.0.1:32865': 1,
 'tcp://127.0.0.1:36381': 1,
 'tcp://127.0.0.1:43447': 1,
 'tcp://127.0.0.1:45947': 1}

In [16]:
paths = ['../input/partitions/p%d' %  i for i in range(1, 105)]
paths[:8]

['../input/partitions/p1',
 '../input/partitions/p2',
 '../input/partitions/p3',
 '../input/partitions/p4',
 '../input/partitions/p5',
 '../input/partitions/p6',
 '../input/partitions/p7',
 '../input/partitions/p8']

In [17]:
# Create a bag object
b = db.from_sequence(paths)

# Map entityset function
b = b.map(entityset_from_partition)

# Map feature matrix function
b = b.map(feature_matrix_from_entityset, feature_defs = feature_defs)
    
b

dask.bag<feature..., npartitions=104>

In [None]:
overall_start = timer()
b.compute()
overall_end = timer()

print(f"Total Time Elapsed: {round(overall_end - overall_start, 2)} seconds.")

In [3]:
# Base directory for feature matrices
base = '../input/fm/'
fm_paths = [base + p for p in os.listdir(base) if 'fm.csv' in p]

#First we read in the dataframes and place them in a list

read_start = timer()
fms = [pd.read_csv(path) for path in fm_paths]
read_end = timer()

print(f'Reading in {len(fms)} feature matrices took {round(read_end - read_start)} seconds.')

#Then we concatenate all the dataframes in the list along the first axis - meaning that we add the rows to each other.
concat_start = timer()
feature_matrix = pd.concat(fms, axis = 0)
concat_end = timer()

print('Final Feature Matrix Shape:', feature_matrix.shape)
print(f"Concatenation time: {round(concat_end - concat_start, 2)} seconds.")

Reading in 104 feature matrices took 143 seconds.
Final Feature Matrix Shape: (356255, 1821)
Concatenation time: 6.18 seconds.


In [24]:
feature_matrix.reset_index(inplace = True)
feature_matrix.to_csv('../input/feature_matrix.csv', index = False)
feature_matrix.head()

Unnamed: 0,index,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,...,PERCENTILE(MIN(installments.DAYS_ENTRY_PAYMENT)),PERCENTILE(MIN(installments.AMT_INSTALMENT)),PERCENTILE(MIN(installments.AMT_PAYMENT)),PERCENTILE(MEAN(installments.NUM_INSTALMENT_VERSION)),PERCENTILE(MEAN(installments.NUM_INSTALMENT_NUMBER)),PERCENTILE(MEAN(installments.DAYS_INSTALMENT)),PERCENTILE(MEAN(installments.DAYS_ENTRY_PAYMENT)),PERCENTILE(MEAN(installments.AMT_INSTALMENT)),PERCENTILE(MEAN(installments.AMT_PAYMENT)),PERCENTILE(COUNT(installments))
0,0,224374,27643.5,832500.0,832500.0,195750.0,0.0,0.0,1.0,2.0,...,0.183164,0.123343,0.287388,0.095899,0.872649,0.332717,0.328708,0.182855,0.199198,0.826634
1,1,224375,46899.0,1227901.5,1129500.0,202500.0,0.0,0.0,0.0,0.0,...,0.021122,0.257478,0.273204,0.656491,0.711687,0.064755,0.065063,0.409497,0.384521,0.64734
2,2,224376,16875.0,337500.0,337500.0,112500.0,0.0,0.0,0.0,1.0,...,0.53099,0.665742,0.441875,0.525439,0.790163,0.372803,0.372495,0.284613,0.243293,0.712406
3,3,224377,29110.5,898326.0,643500.0,112500.0,0.0,0.0,2.0,3.0,...,0.555196,0.415665,0.395621,0.848751,0.593124,0.486278,0.486586,0.828862,0.850139,0.603383
4,4,224378,21888.0,450000.0,450000.0,450000.0,0.0,0.0,0.0,0.0,...,0.576781,0.678693,0.058434,0.340888,0.819303,0.62288,0.624113,0.566142,0.305273,0.826634


In [26]:
#Empty memory
import gc
gc.enable()
del feature_matrix
gc.collect()

183

In [None]:
b.close()
b.terminate()
b.join()

In [4]:
# Sampling 10% of the original data
train = feature_matrix[feature_matrix['TARGET'].notnull()].sample(frac = 0.1, random_state = 50)


In [21]:
for col in ['SUM(bureau.PREVIOUS_OTHER_LOAN_RATE)', 'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Closed)',
            'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Active)', 'SUM(bureau_balance.bureau.PREVIOUS_OTHER_LOAN_RATE)']:
    try:
        train[col] = train[col].astype(np.float32)
    except:
        print(f'{col} not in data')
    
for col in train:
    if train[col].dtype == 'bool':
        train[col] = train[col].astype(np.uint8)

SUM(bureau.PREVIOUS_OTHER_LOAN_RATE) not in data
SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Closed) not in data
SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Active) not in data
SUM(bureau_balance.bureau.PREVIOUS_OTHER_LOAN_RATE) not in data


In [22]:
train = pd.get_dummies(train)
n_features_start = train.shape[1] - 2
train.shape

(30751, 2094)

In [23]:
#Columns with duplicate items
x, idx, inv, counts = np.unique(train, axis = 1, return_index = True, return_inverse=True, return_counts=True)
train = train.iloc[:, idx]
n_non_unique_columns = n_features_start - train.shape[1] - 2
train.shape

(30751, 1806)

In [24]:
#Remove columns with >=90% missing values
missing_threshold = 90

# Find missing and percentage
missing = pd.DataFrame(train.isnull().sum())
missing['percent'] = 100 * (missing[0] / train.shape[0])
missing.sort_values('percent', ascending = False, inplace = True)

# Missing above threshold
missing_cols = list(missing[missing['percent'] > missing_threshold].index)
n_missing_cols = len(missing_cols)

train = train[[x for x in train if x not in missing_cols]]
train.shape

(30751, 1788)

In [29]:
#Remove zero variance columns
unique_counts = pd.DataFrame(train.nunique()).sort_values(0, ascending = True)
zero_variance_cols = list(unique_counts[unique_counts[0] == 1].index)
n_zero_variance_cols = len(zero_variance_cols)

train = train[[x for x in train if x not in zero_variance_cols]]
train.shape

(30751, 1769)

In [30]:
#Remove target columns (accidentally left in earlier code)
for col in train:
    if 'TARGET' in col:
        print(col)
        
train.drop(columns = 'PERCENTILE(TARGET)', inplace = True)

TARGET
PERCENTILE(TARGET)


In [31]:
#Remove columns with high collinearity with other column
correlation_threshold = 0.95

corr_matrix = train.corr()

# Extract the upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))

# Select the features with correlations above the threshold
# Need to use the absolute value
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

In [32]:
train = train[[x for x in train if x not in to_drop]]
n_collinear = len(to_drop)
train.shape

(30751, 1057)

In [33]:
total_removed = n_non_unique_columns + n_missing_cols + n_zero_variance_cols + n_collinear + 1
print('Total columns removed: ', total_removed)

Total columns removed:  1033


In [34]:
train.to_csv('../input/feature_matrix_sample.csv', index = False)

In [4]:
def feature_selection(feature_matrix, missing_threshold=90, correlation_threshold=0.95):
    """Feature selection for a dataframe."""
    
    feature_matrix = pd.get_dummies(feature_matrix)
    n_features_start = feature_matrix.shape[1]
    print('Original shape: ', feature_matrix.shape)

    _, idx = np.unique(feature_matrix, axis = 1, return_index = True)
    feature_matrix = feature_matrix.iloc[:, idx]
    n_non_unique_columns = n_features_start - feature_matrix.shape[1]
    print('{}  non-unique valued columns.'.format(n_non_unique_columns))

    # Find missing and percentage
    missing = pd.DataFrame(feature_matrix.isnull().sum())
    missing['percent'] = 100 * (missing[0] / feature_matrix.shape[0])
    missing.sort_values('percent', ascending = False, inplace = True)

    # Missing above threshold
    missing_cols = list(missing[missing['percent'] > missing_threshold].index)
    n_missing_cols = len(missing_cols)

    # Remove missing columns
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in missing_cols]]
    print('{} missing columns with threshold: {}.'.format(n_missing_cols,
                                                                        missing_threshold))
    
    # Zero variance
    unique_counts = pd.DataFrame(feature_matrix.nunique()).sort_values(0, ascending = True)
    zero_variance_cols = list(unique_counts[unique_counts[0] == 1].index)
    n_zero_variance_cols = len(zero_variance_cols)

    # Remove zero variance columns
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in zero_variance_cols]]
    print('{} zero variance columns.'.format(n_zero_variance_cols))
    
    # Correlations
    corr_matrix = feature_matrix.corr()

    # Extract the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))

    # Select the features with correlations above the threshold
    # Need to use the absolute value
    to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

    n_collinear = len(to_drop)
    
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in to_drop]]
    print('{} collinear columns removed with threshold: {}.'.format(n_collinear,
                                                                          correlation_threshold))
    
    total_removed = n_non_unique_columns + n_missing_cols + n_zero_variance_cols + n_collinear
    
    print('Total columns removed: ', total_removed)
    print('Shape after feature selection: {}.'.format(feature_matrix.shape))
    return feature_matrix

In [36]:
fm = pd.read_csv('../input/application_train.csv')
fm = fm.sample(frac = 0.1, random_state = 50)

fm = pd.get_dummies(fm)
fm.shape

(30751, 244)

In [37]:
fm = feature_selection(fm, 90, 0.95)
fm.head()

Original shape:  (30751, 244)
0  non-unique valued columns.
0 missing columns with threshold: 90.
2 zero variance columns.
37 collinear columns removed with threshold: 0.95.
Total columns removed:  39
Shape after feature selection: (30751, 205).


Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_12,FLAG_DOCUMENT_2,ORGANIZATION_TYPE_Trade: type 5,NAME_INCOME_TYPE_Maternity leave,FLAG_DOCUMENT_7,...,BASEMENTAREA_MEDI,ELEVATORS_MODE,COMMONAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,FLOORSMIN_AVG,YEARS_BUILD_AVG
77158,-14017,-3747,-2384.0,-1046,0.0,0,0,0,0,0,...,,,,,,,,,,
306191,-16520,-4275,-3198.0,-82,-1542.0,0,0,0,0,0,...,,,,,,,,,,
64916,-20741,365243,-1882.0,-4296,0.0,0,0,0,0,0,...,,,,,,,,,,
81133,-9685,-318,-378.0,-1763,-1090.0,0,0,0,0,0,...,,,,,,,,,,
231607,-20891,-413,-3154.0,-3595,-1696.0,0,0,0,0,0,...,,,,,,,,,,


In [38]:
fm.to_csv('../input/features_default_sample.csv', index = False)

In [5]:
# Read in sample and full data
#sample = pd.read_csv('../input/feature_matrix_sample.csv')
#fm = pd.read_csv('../input/feature_matrix.csv')

print(feature_matrix.shape)

# One hot encoding
cat = pd.get_dummies(feature_matrix.select_dtypes('object'))

# Convert the column types
for col in feature_matrix:
    if feature_matrix[col].dtype == 'bool':
        feature_matrix[col] = fm[col].astype(np.uint8)
        
# Add the one-hot encoded columns
feature_matrix = feature_matrix.select_dtypes(['number'])
feature_matrix = feature_matrix.concat([fm, cat], axis = 1)
feature_matrix.shape

(356255, 1821)


NameError: name 'fm' is not defined

In [10]:
#Empty memory
import gc
gc.enable()
del fm
gc.collect()

134