<a href="https://colab.research.google.com/github/S-Tissera/CM2604_/blob/main/CM2604.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Understanding the Dataset**





In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Download the UCI "Bank Marketing" dataset (repository id 222).
bank_marketing = fetch_ucirepo(id=222)

# Features and target are exposed as separate pandas DataFrames.
X = bank_marketing.data.features
y = bank_marketing.data.targets

# Join features and target column-wise into a single frame for easier analysis.
data = pd.concat([X, y], axis=1)

# Display Metadata
print("Metadata:")
print(bank_marketing.metadata)

# Display Variable Information
print("\nVariable Information:")
print(bank_marketing.variables)

# Show the first few rows
print("\nFirst Few Rows:")
print(data.head())

# Basic Information
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
data.info()

# Missing Values (count of nulls per column)
print("\nMissing Values:")
print(data.isnull().sum())

# Statistical Summary
print("\nStatistical Summary:")
print(data.describe())


Metadata:
{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to pre

# **2. Cleaning the Dataset**

***Handle Missing Values***


In [None]:
# Report the number of nulls per column before imputation.
print("\nChecking Missing Values:", data.isnull().sum(), sep="\n")

# Imputation rules, applied in order: (dtypes to select, how to derive the filler).
#   - numerical columns -> column mean
#   - categorical (object) columns -> most frequent value (first mode)
imputation_rules = (
    (['int64', 'float64'], lambda series: series.mean()),
    (['object'], lambda series: series.mode()[0]),
)

for dtypes, compute_filler in imputation_rules:
    for col in data.select_dtypes(include=dtypes).columns:
        # Only touch columns that actually contain missing entries.
        if data[col].isnull().any():
            # Reassign the column instead of using inplace=True.
            data[col] = data[col].fillna(compute_filler(data[col]))

# Report the number of nulls per column after imputation.
print("\nMissing Values after Cleaning:", data.isnull().sum(), sep="\n")



Checking Missing Values:
age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

Missing Values after Cleaning:
age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day_of_week    0
month          0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64


***Remove Duplicates***

In [None]:
# Count fully identical rows before removal.
print("\nChecking for Duplicates:")
print(f"Duplicate Rows: {data.duplicated().sum()}")

# Drop duplicates, keeping the first occurrence of each row.
# Reassign instead of inplace=True, matching the convention used in the
# missing-value cell of this notebook.
data = data.drop_duplicates()

# Re-count to confirm that no duplicate rows remain.
print(f"Duplicate Rows after Removal: {data.duplicated().sum()}")


Checking for Duplicates:
Duplicate Rows: 0
Duplicate Rows after Removal: 0


***Handle Outliers using IQR***

In [None]:
# Columns screened for outliers: every int64/float64 column of `data`.
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Accumulate one keep-mask over all columns and filter once at the end.
# Filtering inside the loop would make each column's quartiles depend on the
# rows already removed for earlier columns, i.e. the result would change with
# column order. Here every fence is computed from the same, unfiltered frame.
rows_to_keep = pd.Series(True, index=data.index)
for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # NOTE(review): when IQR is 0 the fences collapse to a single value and
    # only rows equal to it survive -- check whether any column here (e.g. one
    # dominated by a single sentinel value) behaves that way.
    # between() is inclusive on both ends, same as (>= lower) & (<= upper).
    rows_to_keep &= data[col].between(lower_bound, upper_bound)

data = data[rows_to_keep]

print("\nDataset Shape after Outlier Removal:")
print(data.shape)


Dataset Shape after Outlier Removal:
(28069, 17)


***Encode Categorical Variables***

In [None]:
# Treat every object-dtype column as categorical.
# NOTE(review): per the printed column list this includes the target `y`,
# so the target is one-hot encoded together with the features.
categorical_cols = data.select_dtypes(include=['object']).columns
print("\nEncoding Categorical Features:", categorical_cols, sep="\n")

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column so the dummy columns are not perfectly collinear.
data = data.pipe(pd.get_dummies, columns=categorical_cols, drop_first=True)


Encoding Categorical Features:
Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'y'],
      dtype='object')
