In [32]:
# For data manipulation
import pandas as pd
import numpy as np

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# For statistical modeling/machine learning
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [35]:
#importing dataset
#!pip install ucimlrepo



In [36]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
bank_marketing = fetch_ucirepo(id=222)

# data (as pandas dataframes)
X = bank_marketing.data.features
y = bank_marketing.data.targets

# metadata
print(bank_marketing.metadata)

# variable information
print(bank_marketing.variables)

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

#Data Cleaning

Method 1


In [37]:

# Define strategy for imputing missing numeric values
numeric_imputer = SimpleImputer(strategy='median')

# Identify numeric columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Transform the numeric columns
X[numeric_cols] = numeric_imputer.fit_transform(X[numeric_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = numeric_imputer.fit_transform(X[numeric_cols])


In [38]:
# Impute missing categorical values with most frequent category
cat_cols = X.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)

# Confirm missing values were removed
print(X.isnull().sum())

age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day_of_week    0
month          0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X[col].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X[col].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X[col].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X[col].mode()[0]

Method 2

In [30]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Load the bank_marketing dataset
bank_marketing = fetch_openml(data_id=222, as_frame=True)

# Extract the DataFrame from the Bunch object
df = bank_marketing.frame

# Identify missing values
missing_values = df.isnull().sum()

# Display missing values
print("Missing Values:\n", missing_values)

# Clean missing values
df_cleaned = df.dropna()  # Drop rows with missing values

# Verify that missing values have been handled
cleaned_missing_values = df_cleaned.isnull().sum()
print("\nMissing Values in Cleaned Data:\n", cleaned_missing_values)

  warn(


Missing Values:
 still_alive     0
age             5
pericardial     0
fractional      7
epss           14
lvdd           10
wall_score      3
wall_index      1
alive_at_1     57
class           0
dtype: int64

Missing Values in Cleaned Data:
 still_alive    0
age            0
pericardial    0
fractional     0
epss           0
lvdd           0
wall_score     0
wall_index     0
alive_at_1     0
class          0
dtype: int64


#Identifying Outliers

Method 1

In [29]:
from scipy import stats

# Identify numeric columns
num_cols = X.select_dtypes(include=['int', 'float']).columns

# Function to calculate z-scores
def z_score(col):
    z = np.abs(stats.zscore(col))
    return pd.Series(z, index=col.index)

# Apply z-score function column-wise
X[num_cols] = X[num_cols].apply(z_score)

# Threshold and view outliers
outliers_count = (X[num_cols] > 3).sum()
print(outliers_count)

# Handle/clip outliers
X[num_cols] = X[num_cols].clip(upper=3)

# Remove flagged outliers
X = X[X[num_cols] < 3]

# Applying z-score function column-wise again after removing outliers
X[num_cols] = X[num_cols].apply(z_score)

# Threshold and view outliers again
outliers_count_after = (X[num_cols] > 3).sum()
print(outliers_count_after)

age             381
balance         745
day_of_week       0
duration        963
campaign        840
pdays          1723
previous        582
dtype: int64
age            0
balance        0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = X[num_cols].apply(z_score)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = X[num_cols].clip(upper=3)


Method 2

In [39]:
# Identify numeric columns
num_cols = X.select_dtypes(include=['int', 'float']).columns

# Function to calculate the IQR and identify outliers
def remove_outliers_iqr(col):
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1

    # Define the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify and remove outliers
    outliers_mask = (col < lower_bound) | (col > upper_bound)
    col[outliers_mask] = None  # Set outliers to None or NaN to remove them
    return col

# Apply IQR-based outlier removal column-wise
X[num_cols] = X[num_cols].apply(remove_outliers_iqr)

# Check the summary statistics after handling outliers
summary_after = X[num_cols].describe()

# Display the summary statistics
print("Summary Statistics After Handling Outliers:\n", summary_after)

Summary Statistics After Handling Outliers:
                 age       balance   day_of_week      duration      campaign  \
count  44724.000000  40482.000000  45211.000000  41976.000000  42147.000000   
mean      40.545524    640.636233     15.806419    203.490947      2.129950   
std        9.978232    844.435442      8.322476    140.805074      1.315842   
min       18.000000  -1944.000000      1.000000      0.000000      1.000000   
25%       33.000000     46.000000      8.000000     98.000000      1.000000   
50%       39.000000    349.000000     16.000000    167.000000      2.000000   
75%       48.000000    980.750000     21.000000    277.000000      3.000000   
max       70.000000   3462.000000     31.000000    643.000000      6.000000   

         pdays  previous  
count  36954.0   36954.0  
mean      -1.0       0.0  
std        0.0       0.0  
min       -1.0       0.0  
25%       -1.0       0.0  
50%       -1.0       0.0  
75%       -1.0       0.0  
max       -1.0       0.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = X[num_cols].apply(remove_outliers_iqr)
