In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Applying Kernel PCA
def apply_kernel_pca(X, n_components, kernel):
    kpca = KernelPCA(n_components=n_components, kernel=kernel, random_state=42)
    X_kpca = kpca.fit_transform(X)
    return X_kpca


In [3]:
# Function to split and scale data
def split_scalar(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [4]:
# Function to train and evaluate Logistic Regression
def logistic(X_train, y_train, X_test, y_test):
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [5]:
# Function to train and evaluate Linear SVM
def svm_linear(X_train, y_train, X_test, y_test):
    model = SVC(kernel='linear', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [6]:
# Function to train and evaluate Non-Linear SVM
def svm_nl(X_train, y_train, X_test, y_test):
    model = SVC(kernel='rbf', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [7]:
# Function to train and evaluate KNN
def knn(X_train, y_train, X_test, y_test):
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [8]:
# Function to train and evaluate Naive Bayes
def naive_bayes(X_train, y_train, X_test, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [9]:
# Function to train and evaluate Decision Tree
def decision_tree(X_train, y_train, X_test, y_test):
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [10]:
# Function to train and evaluate Random Forest
def random_forest(X_train, y_train, X_test, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred), y_pred, y_test

In [11]:
# Function to compile results into a DataFrame
def selectk_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    dataframe = pd.DataFrame(
        data=[[acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf]],
        columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Naive', 'Decision', 'Random'],
        index=['KernelPCA']
    )
    return dataframe

In [12]:
# Load the dataset
dataset1 = pd.read_csv("Wine.csv", index_col=None)

# Correctly access the data
X = dataset1.iloc[:, 0:13].values  # Independent variables
y = dataset1.iloc[:, 13].values    # Dependent variable


In [13]:
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)

In [14]:
df2

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [15]:
# Check shape before applying KPCA
print(f"Shape of independent variables before KernelPCA: {X.shape}")

Shape of independent variables before KernelPCA: (178, 13)


In [16]:
# Apply Kernel PCA
indep_x_kpca = apply_kernel_pca(X, n_components=5, kernel='rbf')

In [17]:
indep_x_kpca

array([[-5.57694222e-03, -9.80961717e-03, -8.39398611e-03,
        -6.72502828e-03, -9.10652258e-03],
       [-7.01078996e-03, -1.26665649e-02, -1.33610085e-02,
         6.44795256e-02,  1.89432088e-03],
       [-5.57472147e-03, -9.80559110e-03, -8.39008091e-03,
        -6.72139925e-03, -9.10125964e-03],
       [-5.57462276e-03, -9.80541215e-03, -8.38990734e-03,
        -6.72123798e-03, -9.10102577e-03],
       [-5.57462276e-03, -9.80541215e-03, -8.38990735e-03,
        -6.72123798e-03, -9.10102577e-03],
       [-5.57462276e-03, -9.80541215e-03, -8.38990734e-03,
        -6.72123798e-03, -9.10102577e-03],
       [-6.21142427e-03, -1.09710715e-02, -9.57135266e-03,
        -7.88888783e-03, -1.08507014e-02],
       [-5.57462290e-03, -9.80541240e-03, -8.38990759e-03,
        -6.72123820e-03, -9.10102609e-03],
       [-2.25103767e-02, -4.35074750e-02, -6.66618972e-02,
         8.22700837e-01,  1.18588847e-01],
       [-2.23740307e-02, -4.32568616e-02, -6.63787289e-02,
         8.21228813e-01

In [18]:
# Check shape after applying LDA
print(f"Shape of independent variables after Kernel PCA: {indep_x_kpca.shape}")

Shape of independent variables after Kernel PCA: (178, 5)


In [19]:
# Convert Kernel PCA results to a DataFrame for display
kpca_df = pd.DataFrame(indep_x_kpca, columns=["KP1", "KP2", "KP3", "KP4", "KP5"])


In [20]:
kpca_df

Unnamed: 0,KP1,KP2,KP3,KP4,KP5
0,-0.005577,-0.009810,-0.008394,-0.006725,-0.009107
1,-0.007011,-0.012667,-0.013361,0.064480,0.001894
2,-0.005575,-0.009806,-0.008390,-0.006721,-0.009101
3,-0.005575,-0.009805,-0.008390,-0.006721,-0.009101
4,-0.005575,-0.009805,-0.008390,-0.006721,-0.009101
...,...,...,...,...,...
173,-0.005575,-0.009806,-0.008390,-0.006722,-0.009102
174,-0.011826,-0.021543,-0.021940,-0.023695,-0.039089
175,-0.006845,-0.012125,-0.010713,-0.008980,-0.012457
176,-0.006851,-0.012136,-0.010723,-0.008989,-0.012470


In [21]:
# Split and scale data
X_train, X_test, y_train, y_test = split_scalar(indep_x_kpca, y)

In [22]:
# Initialize accuracy lists
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = [], [], [], [], [], [], []

# Evaluate models and store accuracies
classifier, accuracy, report, cm = logistic(X_train, y_train, X_test, y_test)
acclog.append(accuracy)

classifier, accuracy, report, cm = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(accuracy)

classifier, accuracy, report, cm = svm_nl(X_train, y_train, X_test, y_test)
accsvmnl.append(accuracy)

classifier, accuracy, report, cm = knn(X_train, y_train, X_test, y_test)
accknn.append(accuracy)

classifier, accuracy, report, cm = naive_bayes(X_train, y_train, X_test, y_test)
accnav.append(accuracy)

classifier, accuracy, report, cm = decision_tree(X_train, y_train, X_test, y_test)
accdes.append(accuracy)

classifier, accuracy, report, cm = random_forest(X_train, y_train, X_test, y_test)
accrf.append(accuracy)

In [23]:
acclog

[0.4222222222222222]

In [24]:
accsvml

[0.4222222222222222]

In [25]:
accsvmnl

[0.4222222222222222]

In [26]:
accknn


[0.4444444444444444]

In [27]:
accnav

[0.3333333333333333]

In [28]:
accdes

[0.5555555555555556]

In [29]:
 accrf

[0.6]

In [30]:
# Compile results
#Using -1 ensures that the most recent accuracy value (the last one added to the list) is used in the selectk_classification function.
results = selectk_classification(acclog[-1], accsvml[-1], accsvmnl[-1], accknn[-1], accnav[-1], accdes[-1], accrf[-1])

In [31]:
results

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
KernelPCA,0.422222,0.422222,0.422222,0.444444,0.333333,0.555556,0.6
