In [2]:
!pip install -q imbalanced-learn

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Drop customerID
df.drop("customerID", axis=1, inplace=True)

# Convert TotalCharges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

# Drop rows with missing TotalCharges
df.dropna(inplace=True)

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda x: LabelEncoder().fit_transform(x))

# Clustering
features_for_clustering = df.drop("Churn", axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_for_clustering)

# KMeans
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans_labels = kmeans.fit_predict(X_scaled)
sil_kmeans = silhouette_score(X_scaled, kmeans_labels)

# DBSCAN
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
sil_dbscan = silhouette_score(X_scaled, dbscan_labels)

print("Silhouette Score (KMeans):", sil_kmeans)
print("Silhouette Score (DBSCAN):", sil_dbscan)

# Classification - target is 'Churn'
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Feature selection
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# SMOTE
print("Original Class Distribution:", dict(zip(*np.unique(y_train, return_counts=True))))
min_class_count = min(np.unique(y_train, return_counts=True)[1])
k_neighbors = min(5, min_class_count - 1)

smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print("Balanced Class Distribution:", dict(zip(*np.unique(y_train_bal, return_counts=True))))

# Classifiers
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)
    print(f"\n🔹 {name}")
    print("Accuracy:", model.score(X_test, y_test))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Silhouette Score (KMeans): 0.13819885307296784
Silhouette Score (DBSCAN): -0.13682720246665067
Original Class Distribution: {np.int64(0): np.int64(4130), np.int64(1): np.int64(1495)}
Balanced Class Distribution: {np.int64(0): np.int64(4130), np.int64(1): np.int64(4130)}

🔹 Logistic Regression
Accuracy: 0.6943852167732765
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.65      0.76      1033
           1       0.46      0.81      0.58       374

    accuracy                           0.69      1407
   macro avg       0.68      0.73      0.67      1407
weighted avg       0.79      0.69      0.71      1407

Confusion Matrix:
 [[675 358]
 [ 72 302]]

🔹 Decision Tree
Accuracy: 0.7199715707178393
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81      1033
           1       0.47      0.48      0.48       374

    accuracy                           0.72      1

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('heart.csv')

# Show column names to identify target
print("Columns in dataset:", df.columns.tolist())

# Infer target column
# Replace with correct target column name if needed
possible_targets = ['target', 'HeartDisease', 'output']
target_column = None
for col in possible_targets:
    if col in df.columns:
        target_column = col
        break

if not target_column:
    raise ValueError("Target column not found. Please check column names.")

# Encode categorical features if any
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ----- PCA -----
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

clf_pca = RandomForestClassifier()
clf_pca.fit(X_train_pca, y_train)
y_pred_pca = clf_pca.predict(X_test_pca)

print("\n PCA + Random Forest")
print(classification_report(y_test, y_pred_pca))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_pca))

# ----- LDA -----
lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X_scaled, y)

X_train_lda, X_test_lda, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)

clf_lda = RandomForestClassifier()
clf_lda.fit(X_train_lda, y_train)
y_pred_lda = clf_lda.predict(X_test_lda)

print("\n LDA + Random Forest")
print(classification_report(y_test, y_pred_lda))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lda))


Columns in dataset: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']

 PCA + Random Forest
              precision    recall  f1-score   support

           0       0.77      0.88      0.82        77
           1       0.91      0.81      0.86       107

    accuracy                           0.84       184
   macro avg       0.84      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

Confusion Matrix:
 [[68  9]
 [20 87]]

 LDA + Random Forest
              precision    recall  f1-score   support

           0       0.71      0.87      0.78        77
           1       0.89      0.75      0.81       107

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.82      0.80      0.80       184

Confusion Matrix:
 [[67 10]
 [27 80]]


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load the dataset
df = pd.read_csv('bank-additional-full.csv', sep=';')

# Drop rows with missing values (if any)
df = df.dropna()

# Encode categorical features
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

# Show target value distribution
print("Target Distribution:\n", df['y'].value_counts())

# Define features and target
X = df.drop('y', axis=1)
y = df['y']

# Split data before any balancing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model (before balancing)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\n Before Balancing")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ----------------------------
# Apply SMOTE Oversampling
sm = SMOTE(random_state=42)
X_res_sm, y_res_sm = sm.fit_resample(X, y)

X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_res_sm, y_res_sm, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_sm, y_train_sm)
y_pred_sm = clf.predict(X_test_sm)

print("\n After SMOTE Balancing")
print(classification_report(y_test_sm, y_pred_sm))
print("Confusion Matrix:\n", confusion_matrix(y_test_sm, y_pred_sm))

# ----------------------------
# Apply Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_res_rus, y_res_rus = rus.fit_resample(X, y)

X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_res_rus, y_res_rus, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_rus, y_train_rus)
y_pred_rus = clf.predict(X_test_rus)

print("\n After RandomUnderSampler")
print(classification_report(y_test_rus, y_pred_rus))
print("Confusion Matrix:\n", confusion_matrix(y_test_rus, y_pred_rus))


Target Distribution:
 y
0    36548
1     4640
Name: count, dtype: int64

 Before Balancing
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7303
           1       0.65      0.51      0.57       935

    accuracy                           0.91      8238
   macro avg       0.79      0.74      0.76      8238
weighted avg       0.91      0.91      0.91      8238

Confusion Matrix:
 [[7045  258]
 [ 456  479]]

 After SMOTE Balancing
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      7332
           1       0.94      0.97      0.95      7288

    accuracy                           0.95     14620
   macro avg       0.95      0.95      0.95     14620
weighted avg       0.95      0.95      0.95     14620

Confusion Matrix:
 [[6891  441]
 [ 245 7043]]

 After RandomUnderSampler
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       914
   