# Bank Marketing Dataset - Clustering
## 1.Data-preprocessing

* Krzysztof Sawicki
* Michał Geneja 
* Natalia Safiejko

[Dane](https://www.kaggle.com/datasets/hariharanpavan/bank-marketing-dataset-analysis-classification?resource=download)

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Seed NumPy's global random generator. The call form is required: assigning
# `np.random.seed = 23` would overwrite the function and seed nothing.
np.random.seed(23)

In [2]:
# Load the raw bank marketing data from the local CSV file.
df = pd.read_csv('data/bank.csv')

## 1. Zapoznanie się ze zbiorem danych

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [4]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


In [5]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [6]:
# Total number of missing values over the whole frame.
df.isna().sum().sum()

0

In [7]:
# List the column names of the raw data.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

## 2. Podział danych

Podziału danych dokonaliśmy przy pomocy poniższej funkcji:

In [3]:
def split_dataset(
    df: pd.DataFrame,
    test_size: float = 0.3,
    valid_size: float = 0.3,
    random_state: int = 10,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Function to split dataset into training, validation and test subsets
    :param df: Data frame to be split
    :param test_size: Passed to ``train_test_split`` as ``test_size`` for the
        first split (df -> remainder + test)
    :param valid_size: Passed to ``train_test_split`` as ``test_size`` for the
        second split (remainder -> training + validation)
    :param random_state: Seed used for both splits
    :return: Tuple of (training, validation, test) datasets, in that order
    """
    # First carve out the test subset, then split what is left into
    # training and validation subsets.
    remainder, x_test = train_test_split(df, test_size=test_size, random_state=random_state)
    x_train, x_valid = train_test_split(remainder, test_size=valid_size, random_state=random_state)
    return x_train, x_valid, x_test

# Split the raw frame into training, validation and test subsets.
# NOTE(review): x_valid is not encoded, cleaned or normalized by any of the
# cells visible in this notebook - confirm whether that is intended.
x_train, x_valid, x_test = split_dataset(df)

## 3. Transformacja data frame

Zamiana słów na liczby

In [4]:
def switch_for_numbers(column):
    """
    Function to switch strings into numbers
    Codes are assigned in order of first appearance (0, 1, 2, ...), so the
    same input always produces the same output. Building the mapping from a
    ``set`` would make the codes depend on set iteration order, which is not
    stable between interpreter runs for strings.
    :param column: column of dataframe with strings (any iterable of hashable values)
    :return: List of integer codes, one per element of column
    """
    values = list(column)
    # dict.fromkeys de-duplicates while preserving first-appearance order.
    codes = {value: index for index, value in enumerate(dict.fromkeys(values))}
    return [codes[value] for value in values]

In [5]:
def perform_one_hot_encoding(df, column_list):
    """
    Function to append one-hot (dummy) indicator columns to a data frame
    The source columns are kept in the result; the caller decides whether to
    drop them afterwards.
    :param df: Data frame to encode (it is not modified)
    :param column_list: Names of the columns to encode
    :return: New data frame made of df followed by one ``<column>_<value>``
        indicator column for every distinct value of every listed column
    """
    # dtype=int keeps the indicators as 0/1 integers regardless of the pandas
    # version (pandas 2.0 changed the default to bool, which does not support
    # the subtraction used later for min-max scaling).
    dummies = [pd.get_dummies(df[column], prefix=column, dtype=int) for column in column_list]
    # Concatenate once instead of copying the growing frame for every column.
    return pd.concat([df, *dummies], axis=1)

In [6]:
columns_to_encode = ['job', 'marital', 'education','contact', 'poutcome']
# Replace each categorical column by its one-hot indicator columns.
x_train = perform_one_hot_encoding(x_train, columns_to_encode)
x_train = x_train.drop(columns_to_encode, axis = 'columns')
x_test = perform_one_hot_encoding(x_test, columns_to_encode)
x_test = x_test.drop(columns_to_encode, axis = 'columns')
# The two sets are encoded independently, so a category missing from one of
# them would produce different columns. Align the test set to the training
# layout: indicators absent from the test set become all-zero columns and
# indicators unseen in training are discarded.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [7]:
# Column names of the training set after one-hot encoding.
x_train.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'month',
       'duration', 'campaign', 'pdays', 'previous', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'poutcome_failure', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown'],
      dtype='object')

In [8]:
# Distinct values of the `default` column in the training set.
x_train.loc[:,"default"].unique()

array(['no', 'yes'], dtype=object)

In [9]:
# Encode the binary flag as integers: 'no' -> 0, anything else -> 1.
# Whole-column assignment (rather than `.loc[:, col] = list`, which writes
# into the existing object-dtype column on recent pandas) stores the result
# with an integer dtype.
x_train["default"] = x_train["default"].ne("no").astype(int)
x_test["default"] = x_test["default"].ne("no").astype(int)

In [10]:
# Distinct values of the `housing` column in the training set.
x_train.loc[:,"housing"].unique()

array(['no', 'yes'], dtype=object)

In [11]:
# Encode the binary flag as integers: 'no' -> 0, anything else -> 1.
# Whole-column assignment (rather than `.loc[:, col] = list`, which writes
# into the existing object-dtype column on recent pandas) stores the result
# with an integer dtype.
x_train["housing"] = x_train["housing"].ne("no").astype(int)
x_test["housing"] = x_test["housing"].ne("no").astype(int)

In [12]:
# Distinct values of the `loan` column in the training set.
x_train.loc[:,"loan"].unique()

array(['no', 'yes'], dtype=object)

In [13]:
# Encode the binary flag as integers: 'no' -> 0, anything else -> 1.
# Whole-column assignment (rather than `.loc[:, col] = list`, which writes
# into the existing object-dtype column on recent pandas) stores the result
# with an integer dtype.
x_train["loan"] = x_train["loan"].ne("no").astype(int)
x_test["loan"] = x_test["loan"].ne("no").astype(int)

In [14]:
# Distinct values of the `month` column in the training set.
x_train.loc[:,"month"].unique()

array(['aug', 'jun', 'feb', 'jul', 'may', 'nov', 'sep', 'apr', 'mar',
       'oct', 'jan', 'dec'], dtype=object)

In [15]:
month = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# Lookup table from month abbreviation to calendar number (1..12); a dict
# lookup replaces a linear `list.index` search for every row.
month_number = {name: number for number, name in enumerate(month, start=1)}
# An abbreviation missing from the table maps to NaN, which makes
# `astype(int)` raise instead of silently producing a bad value.
# Whole-column assignment stores the result with an integer dtype.
x_train["month"] = x_train["month"].str.lower().map(month_number).astype(int)
x_test["month"] = x_test["month"].str.lower().map(month_number).astype(int)

In [16]:
# Preview the training set after encoding.
x_train.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
23000,54,0,965,0,0,26,8,345,4,-1,...,1,0,0,1,0,0,0,0,0,1
23218,39,0,0,0,0,27,8,147,4,-1,...,0,1,0,1,0,0,0,0,0,1
21027,36,0,306,0,0,14,8,111,2,-1,...,0,1,0,1,0,0,0,0,0,1
9895,36,0,13698,0,0,9,6,136,1,-1,...,1,0,0,0,0,1,0,0,0,1
10688,37,0,264,1,1,16,6,220,1,-1,...,0,1,0,0,0,1,0,0,0,1


In [17]:
# Summary statistics of the encoded training set, before outlier handling.
x_train.describe()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
count,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,...,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0
mean,40.898655,0.017831,1385.776363,0.556022,0.160347,15.774828,6.140845,258.690051,2.775235,39.56952,...,0.513678,0.294962,0.03941,0.649196,0.064103,0.286701,0.105273,0.040132,0.033676,0.820919
std,10.676887,0.132341,3197.83555,0.496863,0.366936,8.319592,2.395206,259.505514,3.087866,99.515152,...,0.499824,0.456036,0.194572,0.477232,0.244941,0.452231,0.306911,0.196273,0.180399,0.383429
min,18.0,0.0,-6847.0,0.0,0.0,1.0,1.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,0.0,70.0,0.0,0.0,8.0,5.0,103.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,39.0,0.0,453.0,1.0,0.0,16.0,6.0,179.0,2.0,-1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,48.0,0.0,1436.25,1.0,0.0,21.0,8.0,319.25,3.0,-1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,95.0,1.0,102127.0,1.0,1.0,31.0,12.0,3881.0,63.0,842.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Outliery

Procesowi usunięcia outlierów nie będą podlegały takie kolumny jak day czy month, gdyż posiadają one poprawne dane

In [18]:
def substitute_outliers(
    df: pd.DataFrame,
    median: dict[str, float] = None,
    upper_lim: dict[str, float] = None,
    lower_lim: dict[str, float] = None,
    lower_quantile: float = 0.005,
    upper_quantile: float = 0.995,
) -> tuple[pd.DataFrame, dict[str, float], dict[str, float], dict[str, float]]:
    """
    Function to substitute outliers with median value
    A value is an outlier when it lies strictly above the column's upper limit
    or strictly below its lower limit. Statistics missing from the supplied
    dictionaries are computed from ``df`` and stored in them, so the
    dictionaries returned from a call on the training set can be passed in
    again to treat another set with the same thresholds.
    Missing values are never outliers and are left untouched.
    :param df: Data frame to modify (its columns are replaced in place)
    :param median: Replacement value per column; computed as the column median when absent
    :param upper_lim: Upper limit per column; computed as ``upper_quantile`` when absent
    :param lower_lim: Lower limit per column; computed as ``lower_quantile`` when absent
    :param lower_quantile: Quantile used for lower limits that have to be computed
    :param upper_quantile: Quantile used for upper limits that have to be computed
    :return: Tuple of (modified data frame, median, upper_lim, lower_lim)
    """
    if median is None:
        median = {}
    if upper_lim is None:
        upper_lim = {}
    if lower_lim is None:
        lower_lim = {}
    for col in df:
        series = df[col]
        if median.get(col) is None:
            median[col] = series.median()
        if upper_lim.get(col) is None:
            upper_lim[col] = series.quantile(upper_quantile)
        if lower_lim.get(col) is None:
            lower_lim[col] = series.quantile(lower_quantile)
        outliers = (series > upper_lim[col]) | (series < lower_lim[col])
        # Replace only this column's outliers. Going through NaN and a
        # frame-wide replace would also overwrite missing values in every
        # other column with this column's median, and chained
        # `df[col][mask] = ...` assignment is not guaranteed to write to df.
        df[col] = series.mask(outliers, median[col])

    return df, median, upper_lim, lower_lim

Wybieramy kolumny, które w oryginalnej ramce danych były numeryczne (z wyjątkiem age i day)

In [19]:
# Columns that go through outlier substitution.
numeric_columns = ['balance','duration','campaign','pdays', 'previous']

In [20]:
# Medians and limits are computed on the training set and then passed in for
# the test set, so both sets are treated with the same thresholds.
x_train.loc[:,numeric_columns], median, upper_lim, lower_lim = substitute_outliers(x_train.loc[:,numeric_columns])
x_test.loc[:,numeric_columns], median1, upper_lim2, lower_lim3 = substitute_outliers(x_test.loc[:,numeric_columns],median, upper_lim, lower_lim)
x_train.describe()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
count,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,...,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0
mean,40.898655,0.017831,1245.007629,0.556022,0.160347,15.774828,6.140845,250.420414,2.661836,36.905923,...,0.513678,0.294962,0.03941,0.649196,0.064103,0.286701,0.105273,0.040132,0.033676,0.820919
std,10.676887,0.132341,2187.268097,0.496863,0.366936,8.319592,2.395206,227.150178,2.542509,92.831127,...,0.499824,0.456036,0.194572,0.477232,0.244941,0.452231,0.306911,0.196273,0.180399,0.383429
min,18.0,0.0,-892.0,0.0,0.0,1.0,1.0,8.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,0.0,76.0,0.0,0.0,8.0,5.0,104.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,39.0,0.0,453.0,1.0,0.0,16.0,6.0,179.0,2.0,-1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,48.0,0.0,1402.0,1.0,0.0,21.0,8.0,316.0,3.0,-1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,95.0,1.0,19313.0,1.0,1.0,31.0,12.0,1531.0,21.0,392.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Column names of the training set after outlier substitution.
x_train.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'month',
       'duration', 'campaign', 'pdays', 'previous', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'poutcome_failure', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown'],
      dtype='object')

### Normalizacja

In [24]:
def normalize_dataframe(df, df2):
    """
    Function to min-max normalize two data frames with the ranges of the first one
    Every column of df is rescaled so that its minimum maps to 0 and its
    maximum to 1; the same minimum and maximum are applied to df2, whose
    values may therefore fall outside [0, 1]. A constant column of df (equal
    minimum and maximum) is shifted to 0 and left unscaled, which avoids a
    division by zero.
    :param df: Reference data frame that provides the minima and maxima
    :param df2: Second data frame, scaled with the ranges of df; it must
        contain every column of df
    :return: Tuple of (normalized copy of df, normalized copy of df2,
        dictionary mapping each column of df to its (min, max) pair)
    """
    dictionary = {}
    normalized_df = df.copy()
    normalized_df2 = df2.copy()
    for column in df.columns:
        column_min = df[column].min()
        column_max = df[column].max()
        dictionary[column] = (column_min, column_max)
        span = column_max - column_min
        if span == 0:
            # Constant column: dividing by 1 keeps the shifted values finite
            # instead of producing NaN or infinity.
            span = 1
        normalized_df[column] = (df[column] - column_min) / span
        normalized_df2[column] = (df2[column] - column_min) / span
    return normalized_df, normalized_df2, dictionary

In [25]:
# Scale both sets with the training-set minima and maxima; decode_dict keeps
# the (min, max) pair of every column.
x_train, x_test, decode_dict = normalize_dataframe(x_train, x_test)
x_train.describe()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
count,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,...,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0,22152.0
mean,0.297385,0.017831,0.105766,0.556022,0.160347,0.492494,0.46735,0.159173,0.083092,0.096453,...,0.513678,0.294962,0.03941,0.649196,0.064103,0.286701,0.105273,0.040132,0.033676,0.820919
std,0.138661,0.132341,0.108254,0.496863,0.366936,0.27732,0.217746,0.149147,0.127125,0.236212,...,0.499824,0.456036,0.194572,0.477232,0.244941,0.452231,0.306911,0.196273,0.180399,0.383429
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.194805,0.0,0.047909,0.0,0.0,0.233333,0.363636,0.063033,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.272727,0.0,0.066568,1.0,0.0,0.5,0.454545,0.112278,0.05,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.38961,0.0,0.113536,1.0,0.0,0.666667,0.636364,0.202232,0.1,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Show the (min, max) pair recorded for every column during normalization.
print(decode_dict)

{'age': (18, 95), 'default': (0, 1), 'balance': (-892.0, 19313.0), 'housing': (0, 1), 'loan': (0, 1), 'day': (1, 31), 'month': (1, 12), 'duration': (8.0, 1531.0), 'campaign': (1.0, 21.0), 'pdays': (-1.0, 392.0), 'previous': (0.0, 12.0), 'job_admin.': (0, 1), 'job_blue-collar': (0, 1), 'job_entrepreneur': (0, 1), 'job_housemaid': (0, 1), 'job_management': (0, 1), 'job_retired': (0, 1), 'job_self-employed': (0, 1), 'job_services': (0, 1), 'job_student': (0, 1), 'job_technician': (0, 1), 'job_unemployed': (0, 1), 'job_unknown': (0, 1), 'marital_divorced': (0, 1), 'marital_married': (0, 1), 'marital_single': (0, 1), 'education_primary': (0, 1), 'education_secondary': (0, 1), 'education_tertiary': (0, 1), 'education_unknown': (0, 1), 'contact_cellular': (0, 1), 'contact_telephone': (0, 1), 'contact_unknown': (0, 1), 'poutcome_failure': (0, 1), 'poutcome_other': (0, 1), 'poutcome_success': (0, 1), 'poutcome_unknown': (0, 1)}


## Korelacja

In [27]:
# Spearman correlation matrix of the training set, rendered with a colour gradient.
corr = x_train.corr('spearman')
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
age,1.0,-0.028166,0.100179,-0.160468,-0.006432,-0.010088,0.124149,-0.037275,0.039654,-0.021261,-0.015606,-0.050351,-0.026716,0.038586,0.094742,-0.013693,0.338458,-0.005884,-0.067012,-0.211229,-0.046904,0.004629,0.057154,0.171186,0.309641,-0.457719,0.19058,-0.087226,-0.080398,0.06089,-0.070036,0.141835,-0.002914,-0.006077,-0.033883,0.012779,0.016197
default,-0.028166,1.0,-0.1487,0.001628,0.077778,0.0105,0.012526,-0.010772,0.013061,-0.032033,-0.034523,-0.008701,0.012355,0.027294,-0.004184,-0.010818,-0.00996,0.007184,0.00239,-0.012708,0.000786,-0.000516,-0.003189,0.016467,-0.023955,0.014352,-0.012373,0.024636,-0.01983,0.006019,-0.011746,-0.021337,0.023952,-0.021766,-0.011909,-0.023263,0.034463
balance,0.100179,-0.1487,1.0,-0.059224,-0.114012,0.004477,0.05335,0.043954,-0.023666,0.061821,0.073075,-0.019848,-0.050931,-0.021751,0.002842,0.070015,0.054894,0.017586,-0.049048,0.013935,-0.013289,0.015389,0.019978,-0.042322,0.034272,-0.007236,-0.019298,-0.076197,0.089678,0.021156,0.025753,0.050714,-0.054645,0.034998,0.024332,0.074292,-0.075422
housing,-0.160468,0.001628,-0.059224,1.0,0.046307,-0.024929,-0.265581,0.001732,-0.040009,0.089478,0.065559,0.040924,0.174824,0.013956,-0.077583,-0.059457,-0.158754,-0.026282,0.072607,-0.088849,-0.01631,-0.042519,-0.075774,-0.002614,0.010145,-0.009168,0.013776,0.102348,-0.101222,-0.05109,-0.157861,-0.089973,0.21532,0.11053,0.048467,-0.093576,-0.069256
loan,-0.006432,0.077778,-0.114012,0.046307,1.0,0.006252,0.02837,-0.015385,0.008705,-0.026459,-0.030386,0.036097,0.02324,0.042842,-0.023977,-0.031905,-0.015694,-0.00398,0.028399,-0.059142,0.007303,-0.041548,-0.036802,0.020084,0.034998,-0.052256,-0.016015,0.066563,-0.040118,-0.047413,0.019091,-0.011398,-0.013973,-0.000372,-0.012254,-0.050208,0.030193
day,-0.010088,0.0105,0.004477,-0.024929,0.006252,1.0,0.121377,-0.051271,0.134934,-0.096409,-0.089901,-0.008285,-0.021038,-0.000782,0.002161,0.026722,-0.01595,0.006534,-0.010415,-0.01528,0.028218,-0.008484,-0.018443,0.002699,0.005921,-0.008346,-0.028035,-0.002058,0.023719,0.001419,0.018652,0.016487,-0.028613,-0.072172,-0.04164,-0.028904,0.092683
month,0.124149,0.012526,0.05335,-0.265581,0.02837,0.121377,1.0,-0.046158,0.111566,-0.132711,-0.097956,-0.049209,-0.110109,0.021577,0.045242,0.098295,0.037908,0.023623,-0.047902,-0.021745,0.054333,-0.048774,0.021757,0.007173,0.071385,-0.082635,-0.032007,-0.068489,0.104918,-0.010917,0.182398,0.03257,-0.210122,-0.0987,-0.071899,0.025526,0.103798
duration,-0.037275,-0.010772,0.043954,0.001732,-0.015385,-0.051271,-0.046158,1.0,-0.111742,0.031436,0.041235,-0.007345,0.005214,-0.005809,-0.005235,-0.015136,0.034064,0.000116,-0.000875,0.003014,-0.006099,0.021209,-0.006397,-0.003913,-0.027454,0.032599,-0.011996,0.015367,-0.007682,0.000659,0.040441,-0.049342,-0.015952,-0.005674,-0.001735,0.09061,-0.037202
campaign,0.039654,0.013061,-0.023666,-0.040009,0.008705,0.134934,0.111566,-0.111742,1.0,-0.106379,-0.116745,-0.042584,0.015035,0.003349,0.020129,0.024209,-0.026079,0.000298,-0.015557,-0.030999,0.028749,-0.018847,0.005254,-0.010958,0.05635,-0.053451,0.015974,-0.019843,0.011935,-0.00647,-0.015841,0.062284,-0.017018,-0.096903,-0.011867,-0.071367,0.117217
pdays,-0.021261,-0.032033,0.061821,0.089478,-0.026459,-0.096409,-0.132711,0.031436,-0.106379,1.0,0.956678,0.025736,-0.008837,-0.011451,-0.030798,-0.010965,0.023358,0.006594,-0.003075,0.040211,-0.00326,-0.011555,-0.011517,0.000662,-0.024478,0.026124,-0.028133,0.015279,0.006758,-0.003185,0.263484,0.013703,-0.285473,0.730288,0.435927,0.359782,-0.97697


In [28]:
def drop_correlated_columns(df: pd.DataFrame, cut_off: float = 0.8, to_drop:list[str]=None) -> tuple[pd.DataFrame,list[str]]:
    """
    Function to remove strongly correlated columns from a Data Frame
    :param df: Data frame to be modified (columns are removed in place)
    :param cut_off: Maximum allowed absolute Spearman correlation
    :param to_drop: Columns to drop. When omitted, every column whose absolute
        Spearman correlation with some earlier column exceeds cut_off
    :return: The same data frame without the dropped columns, list of dropped columns
    """
    if to_drop is None:
        abs_corr = df.corr('spearman').abs()
        # Keep only the part strictly above the diagonal so that every pair is
        # examined once and a column is never compared with itself.
        above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
        upper = abs_corr.where(above_diagonal)
        to_drop = [name for name in upper.columns if (upper[name] > cut_off).any()]
    df.drop(columns=to_drop, inplace=True)
    return df, to_drop

In [29]:
# Drop correlated columns from the training set using the default cut-off;
# to_drop records the names of the removed columns.
x_train, to_drop = drop_correlated_columns(x_train)

In [30]:
# Keep in the test set exactly the columns that remain in the training set.
x_test = x_test[list(x_train.columns)]

In [31]:
# Save the preprocessed training and test sets (without the index column).
x_train.to_csv('data/x_train.csv', index=False)
x_test.to_csv('data/x_test.csv', index=False)