In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 222 via ucimlrepo and split it into features and targets.
bank_marketing = fetch_ucirepo(id=222)
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Analiza targetu

In [None]:
# Container type, dimensions and column dtypes of the target, one per line.
print(type(y), y.shape, y.dtypes, sep='\n')

In [None]:
# Take the first column of the targets frame as a Series.
y_series = y.iloc[:, 0]

# Class balance: absolute counts first, then relative frequencies.
class_counts = y_series.value_counts()
class_shares = y_series.value_counts(normalize=True)
print(class_counts)
print(class_shares)

# Analiza X

## Data shape

In [None]:
# Work on a copy so the exploratory steps below never modify X itself.
X_eda = X.copy()

# First rows, column dtypes and (rows, columns) of the feature frame.
for overview in (X_eda.head(), X_eda.dtypes, X_eda.shape):
    print(overview)

## Brakujące wartości

In [None]:
# Number of missing (NaN) entries in each feature column.
na_per_column = X_eda.isna().sum()
print(na_per_column)

In [None]:
# Labels of the columns that contain at least one missing value.
na_counts = X_eda.isna().sum()
missing_cols = na_counts[na_counts > 0].index

# Distinct values (NaN included) found in each of those columns.
for col in missing_cols:
    print(X_eda[col].unique())

In [None]:
# Does any column with missing data already use the literal label 'unknown'?
for col in missing_cols:
    column_values = X_eda[col].values
    print('unknown' in column_values)

Kolumny, w których są brakujące dane, mają dtype = object i nie zawierają domyślnie wartości 'unknown', dlatego zamienimy brakujące wartości na 'unknown'.

In [None]:
# Replace NaN in the affected columns with the explicit 'unknown' label.
filled = X_eda.loc[:, missing_cols].fillna('unknown')
X_eda.loc[:, missing_cols] = filled

# Re-check the missing counts and the dtypes after the fill.
print(X_eda.isna().sum(), X_eda.dtypes, sep='\n')

## Analiza numerycznych

In [None]:
# Split the feature labels by dtype: numeric vs. object (categorical) columns.
cat_cols = X_eda.select_dtypes(include='object').columns
num_cols = X_eda.select_dtypes(include='number').columns

# Working frame: a copy of the cleaned features with the target added as 'y'.
df = X_eda.assign(y=y_series)

### Wartości odstające

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# One horizontal boxplot per numeric column, each on its own figure.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sb.boxplot(x=df[col], ax=ax)
    ax.set_title(col)
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

### IQR

In [None]:
# IQR rule: a value is flagged as an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile of its column.
num_values = df.loc[:, num_cols]
q1 = num_values.quantile(0.25)
q3 = num_values.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Each comparison needs its own parentheses: `|` binds tighter than `<`/`>`,
# so `(a < lo) | a > hi` would be evaluated as `((a < lo) | a) > hi`.
outliers = (num_values < lower_bound) | (num_values > upper_bound)

# Number of flagged rows per numeric column.
outliers_count = outliers.sum()
print(outliers_count)

### Z-Score

In [None]:
from scipy.stats import zscore

# Column-wise z-scores of the numeric features, and their absolute values.
z_scores = df.loc[:, num_cols].apply(zscore)
abs_z = z_scores.abs()

# Per-column counts of values with |z| above 3 and above 2.
outliers_zscore = (abs_z > 3).sum()
mild_outliers_zscore = (abs_z > 2).sum()

print("ABS > 3", outliers_zscore, '-'*50, "ABS > 2", mild_outliers_zscore, sep='\n')

## Analiza kategorycznych

In [None]:
# For every categorical column: number of distinct levels and the share of
# rows whose value is the literal 'unknown'.
for col in cat_cols:
    levels = df[col]
    unknown_rate = (levels == 'unknown').mean()
    print(col)
    print(f"Number of uniques: {levels.nunique()}")
    print(f"Unknown rate: {unknown_rate:.6f}")
    print('-'*75)

In [None]:
# Overall share of rows whose target equals 'yes'.
is_yes = df['y'] == 'yes'
global_rate = is_yes.mean()

# Minimum number of rows a category level needs to be kept in the summaries.
min_threshold = 50

In [None]:
def _yes_share(labels):
    """Return the fraction of entries in `labels` equal to 'yes'."""
    return (labels == 'yes').mean()


# Per categorical column: size, 'yes' rate and lift (rate relative to
# global_rate) of each level, keeping only levels with at least
# min_threshold rows, ordered by rate and then by size, both descending.
summaries_dict = {}
for col in cat_cols:
    per_level = df.groupby(col)["y"].agg(count='size', rate_yes=_yes_share)
    per_level['lift'] = per_level['rate_yes'] / global_rate
    frequent = per_level[per_level['count'] >= min_threshold]
    summary = frequent.sort_values(['rate_yes', 'count'], ascending=[False, False])
    summaries_dict[col] = summary
    print(summary.head())
    print('-'*75)

# Numeryczne a target

In [None]:
# Integer-encoded target: 1 where y == 'yes', 0 otherwise.
yes_mask = df['y'] == 'yes'
df['y_bin'] = yes_mask.astype(int)
print(df.head())