In [2]:
import zipfile
import os

# Define the path to the uploaded ZIP file and the extraction directory
zip_file_path = 'bank+marketing.zip'
extraction_dir = 'bank_marketing_data/'

# Extract the contents of the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

# List the extracted files to understand the contents
extracted_files = os.listdir(extraction_dir)
extracted_files


['bank-additional.zip', 'bank.zip']

In [3]:
# Extract the contents of the two inner ZIP files
for file_name in extracted_files:
    file_path = os.path.join(extraction_dir, file_name)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_dir)

# List the files after extraction
extracted_files_after = os.listdir(extraction_dir)
extracted_files_after


['bank-names.txt',
 'bank.csv',
 'bank-additional.zip',
 'bank.zip',
 '__MACOSX',
 'bank-full.csv',
 'bank-additional']

In [4]:
import pandas as pd

# Load the CSV files to inspect their contents
bank_full_df = pd.read_csv(os.path.join(extraction_dir, 'bank-full.csv'), delimiter=';')
bank_df = pd.read_csv(os.path.join(extraction_dir, 'bank.csv'), delimiter=';')

# Display the first few rows of both datasets
bank_full_head = bank_full_df.head()
bank_head = bank_df.head()

bank_full_head, bank_head


(   age           job  marital  education default  balance housing loan  \
 0   58    management  married   tertiary      no     2143     yes   no   
 1   44    technician   single  secondary      no       29     yes   no   
 2   33  entrepreneur  married  secondary      no        2     yes  yes   
 3   47   blue-collar  married    unknown      no     1506     yes   no   
 4   33       unknown   single    unknown      no        1      no   no   
 
    contact  day month  duration  campaign  pdays  previous poutcome   y  
 0  unknown    5   may       261         1     -1         0  unknown  no  
 1  unknown    5   may       151         1     -1         0  unknown  no  
 2  unknown    5   may        76         1     -1         0  unknown  no  
 3  unknown    5   may        92         1     -1         0  unknown  no  
 4  unknown    5   may       198         1     -1         0  unknown  no  ,
    age          job  marital  education default  balance housing loan  \
 0   30   unemployed  m

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Load the dataset (assuming df is already loaded)
# df = pd.read_csv('path_to_bank_full.csv', delimiter=';')

# Prepare the data by encoding categorical features
df = bank_full_df.copy()
label_encoders = {}

# Convert categorical columns to numeric using LabelEncoder
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split the data into features and target
X = df.drop(columns='y')
y = df['y']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoders['y'].classes_)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)


Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

          no       0.93      0.93      0.93     11966
         yes       0.46      0.47      0.47      1598

    accuracy                           0.87     13564
   macro avg       0.70      0.70      0.70     13564
weighted avg       0.87      0.87      0.87     13564

