In [1]:
from sklearn.model_selection import train_test_split
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# **HIGHLIGHT START**
from sklearn.decomposition import KernelPCA # Changed import from PCA/LDA to KernelPCA
# **HIGHLIGHT END**
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA # Kept for general library import, but won't be used

In [2]:
# --- Data Loading and Preprocessing (Unchanged) ---
dataset1 = pd.read_csv("Wine.csv", index_col=None)
df2 = dataset1

In [3]:
for column in df2.columns:
    df2[column] = pd.to_numeric(df2[column], errors='coerce')

for column in df2.columns:
    if df2[column].isnull().any():
        df2[column] = df2[column].fillna(df2[column].mean())

df2.replace([np.inf, -np.inf], np.nan, inplace=True)
for column in df2.columns:
    if df2[column].isnull().any():
        df2[column] = df2[column].fillna(df2[column].mean())

indep_X = df2.drop('Customer_Segment', axis=1).values
dep_Y = df2['Customer_Segment'].values

In [4]:
def split_scalar(indep_X, dep_Y):
    """Splits data into training and testing sets and scales them."""
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

In [5]:
def cm_prediction(classifier, X_test, y_test):
    """Makes predictions, calculates metrics, and returns them."""
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred) 
    Accuracy = accuracy_score(y_test, y_pred)
    return classifier, Accuracy, report, cm

In [6]:
def logistic(X_train, y_train, X_test, y_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [7]:
def svm_linear(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [8]:
def svm_NL(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [9]:
def Navie(X_train, y_train, X_test, y_test):
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [10]:
def knn(X_train, y_train, X_test, y_test):
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [11]:
def Decision(X_train, y_train, X_test, y_test):
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [12]:
def random(X_train, y_train, X_test, y_test):
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [13]:
def KPCA_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, kpca_components_list):
    """Generates a DataFrame of results for different KPCA component numbers."""
    dataframe = pd.DataFrame(index=kpca_components_list, columns=['Logistic','SVMl','SVMnl','KNN','Navie','Decision','Random'])
    # **HIGHLIGHT END**
    for number, idex in enumerate(dataframe.index):
        dataframe.loc[kpca_components_list,'Logistic'] = acclog[number]
        dataframe.loc[kpca_components_list,'SVMl'] = accsvml[number]
        dataframe.loc[kpca_components_list,'SVMnl'] = accsvmnl[number]
        dataframe.loc[kpca_components_list,'KNN'] = accknn[number]
        dataframe.loc[kpca_components_list,'Navie'] = accnav[number]
        dataframe.loc[kpca_components_list,'Decision'] = accdes[number]
        dataframe.loc[kpca_components_list,'Random'] = accrf[number]
    return dataframe

In [14]:
# 1. Split and Scale the cleaned data (Done only once)
X_train_base, X_test_base, y_train_base, y_test_base = split_scalar(indep_X, dep_Y)

# **HIGHLIGHT START: Define the range of KernelPCA components to test**
# We'll test 2, 4, and 6 components. KPCA is not limited by C-1.
kpca_components_to_test = [2, 4, 6]
# **HIGHLIGHT END**

# Initialize lists to store accuracies for each model across different KPCA runs
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

In [15]:
# **HIGHLIGHT START: The main loop to iterate over KernelPCA components**
kpca_component_names = [] # To store names like 'KPCA_2', 'KPCA_4' for the final DataFrame index

for n_components in kpca_components_to_test:
    # 2. Apply KernelPCA with the current number of components and RBF kernel
    kpca = KernelPCA(
        n_components=n_components,
        kernel='rbf', 
        gamma=0.04 # A typical starting value for gamma in Wine-like datasets
    )

In [16]:
# We must use the base (scaled, un-transformed) data for each new fit
X_train_kpca = kpca.fit_transform(X_train_base, y_train_base)
X_test_kpca = kpca.transform(X_test_base)

# Set the data for classification
X_train_final = X_train_kpca
X_test_final = X_test_kpca
    
# Run all models and append results (using y_train_base/y_test_base)
_, Accuracy, _, _ = logistic(X_train_final, y_train_base, X_test_final, y_test_base)
acclog.append(Accuracy)

_, Accuracy, _, _ = svm_linear(X_train_final, y_train_base, X_test_final, y_test_base)
accsvml.append(Accuracy)

_, Accuracy, _, _ = svm_NL(X_train_final, y_train_base, X_test_final, y_test_base)
accsvmnl.append(Accuracy)

_, Accuracy, _, _ = knn(X_train_final, y_train_base, X_test_final, y_test_base)
accknn.append(Accuracy)

_, Accuracy, _, _ = Navie(X_train_final, y_train_base, X_test_final, y_test_base)
accnav.append(Accuracy)

_, Accuracy, _, _ = Decision(X_train_final, y_train_base, X_test_final, y_test_base)
accdes.append(Accuracy)

_, Accuracy, _, _ = random(X_train_final, y_train_base, X_test_final, y_test_base)
accrf.append(Accuracy)
    
kpca_component_names.append(f'KPCA_{n_components}') # Store the component name
# **HIGHLIGHT END**

# 3. Calling the classification summary function with the list of component names
# **HIGHLIGHT START: Updated function call to KPCA_Classification**
result = KPCA_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, kpca_component_names)
# **HIGHLIGHT END**

result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
KPCA_6,1.0,1.0,1.0,1.0,1.0,0.955556,0.977778
