In [1]:
# Task 12 - Classification Algorithms II
# Multiple classification tasks in one notebook
# Author: Uneeba
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Helper function to encode categorical columns
def encode_cats(df):
    for col in df.select_dtypes(include='object').columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df


 1. **Employee Attrition Prediction - Logistic Regression**

In [9]:
# 1. Employee AttritioRegressionn - Logistic
print("Employee Attrition Prediction")
df = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition (1).csv')
df = encode_cats(df) # Move encode_cats before fillna
df.fillna(df.median(), inplace=True)
X = df.drop('Attrition', axis=1)
y = df['Attrition']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Precision: {precision_score(y_test, pred):.2f}, Recall: {recall_score(y_test, pred):.2f}, F1: {f1_score(y_test, pred):.2f}\n")

Employee Attrition Prediction
Precision: 0.68, Recall: 0.33, F1: 0.45



**2. Heart Disease - KNN**

In [11]:

# 2. Heart Disease - KNN
print("Heart Disease Prediction")
df = pd.read_csv('/heart.csv')
df.fillna(df.mean(), inplace=True)
df = encode_cats(df)
X = df.drop('target', axis=1)
y = df['target']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:,1]
print(f"Accuracy: {accuracy_score(y_test, pred):.2f}, ROC-AUC: {roc_auc_score(y_test, proba):.2f}\n")

Heart Disease Prediction
Accuracy: 0.92, ROC-AUC: 0.92



**3. Hospital Readmission - Logistic** **Regression**






In [13]:
# 3. Hospital Readmission - Logistic Regression
print(" Hospital Readmission Prediction")
df = pd.read_csv('/diabetic_data.csv')
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)
for col in df.select_dtypes(include=['int64','float64']).columns:
    df[col].fillna(df[col].median(), inplace=True)
df = encode_cats(df)
X = df.drop('readmitted', axis=1)
y = df['readmitted']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Precision: {precision_score(y_test, pred, average='weighted'):.2f}, Recall: {recall_score(y_test, pred, average='weighted'):.2f}, F1: {f1_score(y_test, pred, average='weighted'):.2f}\n")

 Hospital Readmission Prediction
Precision: 0.48, Recall: 0.55, F1: 0.49



** 4. Credit Card Fraud - Decision Tree**

In [6]:
# 4. Credit Card Fraud - Decision Tree
print(" ")
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
df = pd.read_csv('/content/creditcard.csv')
df.fillna(df.median(), inplace=True)
X = df.drop('Class', axis=1)
y = df['Class']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:,1]
print(f"ROC-AUC: {roc_auc_score(y_test, proba):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, pred), "\n")

 
ROC-AUC: nan
Confusion Matrix:
[[795]] 





** 5. Wine Quality - Decision Tree**



In [10]:
# 5. Wine Quality - Decision Tree
print(" Wine Quality Classification")
df = pd.read_csv('/content/winequality-white.csv', sep=';', quotechar='"', header=0) # Added sep, quotechar, and header
df = encode_cats(df) # Moved encode_cats before fillna
df.fillna(df.median(), inplace=True)
X = df.drop('quality', axis=1)
y = df['quality']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
try:
    roc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
except:
    roc = np.nan
print(f"Accuracy: {acc:.2f}, ROC-AUC: {roc:.2f}\n")

 Wine Quality Classification
Accuracy: 0.61, ROC-AUC: nan



**6. SMS Spam - Naive Bayes**

In [12]:
# 6. SMS Spam - Naive Bayes
print("SMS Spam Classification")
df = pd.read_csv('/content/SMSSpamCollection', sep='\t', names=['label','message'])
df['message'] = df['message'].astype(str).str.lower().str.replace(r'\W', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
df['label'] = df['label'].map({'ham':0, 'spam':1})
df.dropna(subset=['label'], inplace=True) # Drop rows where label is NaN after mapping
df = df[df['message'] != ''].copy() # Drop rows where message is empty after cleaning

tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['message'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:,1]

print(f"Precision: {precision_score(y_test, pred):.2f}, Recall: {recall_score(y_test, pred):.2f}, F1: {f1_score(y_test, pred):.2f}, ROC-AUC: {roc_auc_score(y_test, proba):.2f}\n")

SMS Spam Classification
Precision: 1.00, Recall: 0.87, F1: 0.93, ROC-AUC: 0.99



**7. Diabetes Prediction - RandomForest**



In [14]:
# 7. Diabetes Prediction - RandomForest
print(" Diabetes Prediction")
df = pd.read_csv('/content/diabetes (1).csv')
df.fillna(df.median(), inplace=True)
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:,1]
print(f"Accuracy: {accuracy_score(y_test, pred):.2f}, ROC-AUC: {roc_auc_score(y_test, proba):.2f}")
print("Feature importances:")
for f, imp in zip(df.columns[:-1], model.feature_importances_):
    print(f"  {f}: {imp:.3f}")
print()

 Diabetes Prediction
Accuracy: 0.73, ROC-AUC: 0.81
Feature importances:
  Pregnancies: 0.077
  Glucose: 0.259
  BloodPressure: 0.088
  SkinThickness: 0.066
  Insulin: 0.076
  BMI: 0.170
  DiabetesPedigreeFunction: 0.124
  Age: 0.141



**8. Iris Flower Classification**


In [17]:
# 8. Iris - SVM
print("Iris Flower Classification")
iris = load_iris()
X, y = iris.data, iris.target
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred):.2f}, Precision: {precision_score(y_test, pred, average='macro'):.2f}, Recall: {recall_score(y_test, pred, average='macro'):.2f}\n")

Iris Flower Classification
Accuracy: 0.97, Precision: 0.97, Recall: 0.96



**9. Breast Cancer- KNN**

In [22]:
# 9. Breast Cancer - KNN
print(" Breast Cancer Classification")
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, pred))

9) Breast Cancer Classification
Accuracy: 0.95
Confusion Matrix:
[[40  3]
 [ 3 68]]
