In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune lives here rather than being buried
# as literals in the pipeline below.
file_path = "bank-additional-full.csv"  # Update the file path if needed
TARGET_COL = "y"
TARGET_MAPPING = {"no": 0, "yes": 1}
TEST_SIZE = 0.2  # 80% train, 20% test
SEED = 42        # shared by the split and the tree so reruns give the same numbers

# --- Load --------------------------------------------------------------------
# The file is semicolon-delimited. `df_raw` keeps the data exactly as read;
# `df` is the working copy that gets encoded, so the original values stay
# available for inspection after this cell has run.
df_raw = pd.read_csv(file_path, delimiter=";")
df = df_raw.copy()

# --- Validate the target before touching it ------------------------------------
# Series.map() turns every value that is not a key of the mapping into NaN
# without complaint. Fail here, with the offending labels in the message,
# instead of letting NaNs surface later as an obscure split/fit error.
unmapped_rows = ~df[TARGET_COL].isin(list(TARGET_MAPPING))
if unmapped_rows.any():
    bad_labels = df.loc[unmapped_rows, TARGET_COL].unique().tolist()
    raise ValueError(
        f"Column {TARGET_COL!r} has {int(unmapped_rows.sum())} row(s) with labels "
        f"outside {sorted(TARGET_MAPPING)}: {bad_labels!r}"
    )

# --- Encode categorical features -------------------------------------------------
# Each object-dtype feature column is replaced by integer codes. The fitted
# encoders are kept per column in `label_encoders` so codes can be mapped back
# with `inverse_transform`. Encoders are fit on the full frame, i.e. before the
# train/test split below.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent. LabelEncoder is kept here so
# `label_encoders` keeps its existing contents for any code that relies on it.
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    if col == TARGET_COL:  # the target is encoded separately below
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target with an explicit mapping so that 1 always means "yes".
df[TARGET_COL] = df[TARGET_COL].map(TARGET_MAPPING)

# --- Split features / target ---------------------------------------------------
# NOTE(review): if this file is the UCI Bank Marketing dataset, its
# documentation advises discarding the `duration` column for a realistic
# predictive model, because the value is only known once the call outcome is
# known. Confirm, and drop it from X if that applies.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Stratify on y so the train and test sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# --- Train ------------------------------------------------------------------------
# Default hyperparameters; only the seed is fixed.
clf = DecisionTreeClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# --- Predict and evaluate ---------------------------------------------------------
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


Accuracy: 0.8956
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      7310
           1       0.53      0.57      0.55       928

    accuracy                           0.90      8238
   macro avg       0.74      0.75      0.75      8238
weighted avg       0.90      0.90      0.90      8238

