In [None]:
import pandas as pd
import numpy as np
import time
import warnings
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [6]:
# Function to perform feature selection using Chi-Square test
def select_k_best_features(X, y, k):
    selector = SelectKBest(score_func=chi2, k=k)
    selector.fit(X, y)
    selected_columns = X.columns[selector.get_support()]
    X_selected = selector.transform(X)
    return selected_columns, X_selected

# Function to split data and scale it
def split_and_scale(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, report, cm

# Function to train and evaluate classifiers
def train_classifier(classifier, X_train, y_train, X_test, y_test):
    classifier.fit(X_train, y_train)
    return evaluate_model(classifier, X_test, y_test)

# Load dataset
dataset = pd.read_csv("prep.csv")
dataset_encoded = pd.get_dummies(dataset, drop_first=True)
X = dataset_encoded.drop('classification_yes', axis=1)
y = dataset_encoded['classification_yes']

# Feature selection
selected_columns, X_selected = select_k_best_features(X, y, 6)

# Split and scale data
X_train, X_test, y_train, y_test = split_and_scale(X_selected, y)

# Train models and store results
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "SVM Linear": SVC(kernel='linear', random_state=0),
    "SVM Non-Linear": SVC(kernel='rbf', random_state=0),
    "KNN": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
}

results = {}
for name, model in models.items():
    accuracy, report, cm = train_classifier(model, X_train, y_train, X_test, y_test)
    results[name] = accuracy

# Display results
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
print(df_results)


                     Accuracy
Logistic Regression      0.95
SVM Linear               0.96
SVM Non-Linear           0.96
KNN                      0.93
Naive Bayes              0.89
Decision Tree            0.97
Random Forest            0.97


In [7]:
selected_columns

Index(['al', 'bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')