In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer

# Read the three semicolon-delimited CSV files into DataFrames
csv_options = {'sep': ';'}
data1 = pd.read_csv('bank-additional.csv', **csv_options)
data2 = pd.read_csv('bank-additional-full.csv', **csv_options)
data3 = pd.read_csv('bank.csv', **csv_options)

# Preview the first rows of every dataset under its own heading
for heading, frame in (
    ("First dataset:", data1),
    ("\nSecond dataset:", data2),
    ("\nThird dataset:", data3),
):
    print(heading)
    print(frame.head())

# Merge the datasets
# The frames are stacked row-wise. pd.concat aligns on column names, so a
# column missing from one file is filled with NaN for that file's rows.
frames = [data1, data2, data3]
shared_columns = set(frames[0].columns).intersection(
    *(frame.columns for frame in frames[1:])
)
all_columns = set().union(*(frame.columns for frame in frames))
unshared_columns = sorted(all_columns - shared_columns)
if unshared_columns:
    # Surface schema mismatches instead of silently assuming similar structure
    print("Warning: columns not present in every dataset:", unshared_columns)

data_combined = pd.concat(frames, axis=0, ignore_index=True)

# Drop exact duplicate rows. If one file is a sample of another (presumably
# bank-additional.csv is drawn from bank-additional-full.csv -- TODO confirm),
# the same record would otherwise be able to land in both the training and
# the test split and inflate the reported scores.
rows_before = len(data_combined)
data_combined = data_combined.drop_duplicates(ignore_index=True)
print("Duplicate rows removed:", rows_before - len(data_combined))

# Display the shape of the combined dataset
print("Combined dataset shape:", data_combined.shape)

# Preprocess the data
# Rows without a label cannot be used for supervised training; drop them
# rather than letting the imputer invent a target value.
data_combined = data_combined.dropna(subset=['y']).reset_index(drop=True)

# Handle missing values by imputing with the most frequent value
# SimpleImputer returns one object-dtype array for mixed-type input, so the
# original column dtypes are restored afterwards; otherwise every column
# (numeric ones included) would be picked up as categorical below.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows influence the fill values.
original_dtypes = data_combined.dtypes.to_dict()
imputer = SimpleImputer(strategy='most_frequent')
data_combined = pd.DataFrame(
    imputer.fit_transform(data_combined),
    columns=data_combined.columns,
).astype(original_dtypes)

# Encode categorical variables using LabelEncoder
# One fitted encoder per column is kept so codes can be mapped back to labels.
label_encoders = {}
categorical_columns = data_combined.select_dtypes(include=['object']).columns
for column in categorical_columns:
    if column == 'y':  # The target variable is encoded separately below
        continue
    le = LabelEncoder()
    data_combined[column] = le.fit_transform(data_combined[column])
    label_encoders[column] = le

# Encode the target variable
le_y = LabelEncoder()
data_combined['y'] = le_y.fit_transform(data_combined['y'])

# Split the data into training and testing sets
X = data_combined.drop(columns=['y'])
y = data_combined['y']
# Stratify on the target so the 70/30 split keeps the same class proportions
# in both parts; a purely random split can skew them for a rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train a Decision Tree Classifier
# random_state is fixed so repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Plot the Decision Tree
# The classifier is built without a depth limit, so drawing every node gives
# an unreadable (and slow) figure. Only the top levels are rendered here;
# this affects the drawing only, not the fitted model.
plt.figure(figsize=(20,10))
plot_tree(
    clf,
    max_depth=3,
    # Plain lists are passed because some scikit-learn releases reject a
    # pandas Index / NumPy array for these two arguments.
    feature_names=list(X.columns),
    class_names=[str(label) for label in le_y.classes_],
    filled=True,
    fontsize=10,
)
plt.show()
