In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Step 1: Local file 'bank.csv' ko load karo (semicolon separator use hota hai is dataset mein)
df = pd.read_csv('bank.csv')  # Default separator is comma, so no need for sep=';'

# Step 2: Dataset ko explore karo
print("Shape:", df.shape)  # Rows aur columns ki sankhya
print("Columns:", df.columns)  # Column ke naam
print(df.head())  # Pehli 5 rows dekho
print(df['job'].value_counts())  # Har job category ka count
print(df['marital'].value_counts())  # Marital status ka count
print(df['y'].value_counts())  # Loan accept kiya ya nahi

# Step 3: Visualizations banao
plt.hist(df['age'], bins=20, color='skyblue', edgecolor='black')  # Age ka distribution
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

pd.crosstab(df['job'], df['y']).plot(kind='bar', stacked=True)  # Job ke hisaab se loan acceptance
plt.title('Loan Acceptance by Job')
plt.xlabel('Job')
plt.ylabel('Count')
plt.show()

pd.crosstab(df['marital'], df['y']).plot(kind='bar', stacked=True)  # Marital status ke hisaab se loan acceptance
plt.title('Loan Acceptance by Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.show()

# Step 4: Categorical columns ko encode karo
df_encoded = df.copy()
for col in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Step 5: Model ke liye data prepare karo
X = df_encoded.drop('y', axis=1)  # Features
y = df_encoded['y']  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Decision Tree model train karo
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 7: Model ko evaluate karo
print(confusion_matrix(y_test, y_pred))  # Confusion matrix
print(classification_report(y_test, y_pred))  # Accuracy, precision, recall, etc.

# Step 8: Feature importance visualize karo
features = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.bar(range(len(features)), importances[indices], align='center')
plt.xticks(range(len(features)), features[indices], rotation=90)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()


Shape: (11162, 17)
Columns: Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')
   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown  

KeyError: 'y'