In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the semicolon-delimited bank-full.csv file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path -- point it at the
# location of your own extracted copy of the CSV before running.
dataset_path = "C:\\Users\\taran\\Data Science\\PRODIGY-INFOTECH-TARAN PANFAIR\\bank-full.csv"
data = pd.read_csv(dataset_path, delimiter=';')

# Preview the first five rows to see which columns exist and how values are formatted
print("Dataset Head:\n", data.head(5))

# Impute missing values column by column with that column's most frequent value (its mode).
#
# The filled column is assigned back to ``data`` rather than calling
# ``data[column].fillna(..., inplace=True)``: the in-place form acts on an
# intermediate Series (chained assignment), which pandas warns about and which
# leaves ``data`` unchanged when Copy-on-Write is active.
for column in data.columns:
    if not data[column].isnull().any():
        continue  # no missing entries in this column
    modes = data[column].mode()
    if modes.empty:
        continue  # every entry is missing, so there is no mode to fill with
    most_frequent = modes[0]  # mode() may return several values; take the first
    data[column] = data[column].fillna(most_frequent)

# Replace every text-valued column with integer codes using a separate LabelEncoder per column.
#
# Text columns are detected through pandas' dtype helpers instead of comparing the
# dtype to 'object', so columns stored with pandas' dedicated string dtype are
# encoded as well (they would otherwise be passed on as raw strings).
# The fitted encoders are kept in ``label_encoders`` (column name -> encoder) so the
# integer codes can be mapped back to the original labels later if needed.
# NOTE(review): the loop covers all columns of ``data``, so a text-valued target
# column is encoded here too.
label_encoders = {}
for column in data.columns:
    values = data[column]
    is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
    if is_text:
        le = LabelEncoder()
        data[column] = le.fit_transform(values)
        label_encoders[column] = le

# Separate the target column 'y' from the remaining predictor columns
y = data['y']
X = data.drop('y', axis=1)

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a decision tree with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and training are chained)
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Produce class predictions for the held-out validation rows
y_pred = clf.predict(X_val)

# Score the validation predictions: the text classification report and the overall accuracy
report = classification_report(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

# Emit the accuracy first, then the report under its heading on a new line
print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')


Dataset Head:
    age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
Accuracy: 0.8740462235983634
Classification Report:
              precision    recall  f1-score 