In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [2]:
# Applying LDA
def apply_lda(X, y, n_components):
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    X_lda = lda.fit_transform(X, y)
    return X_lda


In [3]:
# Function to split and scale data
def split_scalar(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [4]:
# Function to train and evaluate Logistic Regression
def logistic(X_train, y_train, X_test, y_test):
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [5]:
# Function to train and evaluate Linear SVM
def svm_linear(X_train, y_train, X_test, y_test):
    model = SVC(kernel='linear', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [6]:
# Function to train and evaluate Non-Linear SVM
def svm_nl(X_train, y_train, X_test, y_test):
    model = SVC(kernel='rbf', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [7]:
# Function to train and evaluate KNN
def knn(X_train, y_train, X_test, y_test):
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [8]:
# Function to train and evaluate Naive Bayes
def naive_bayes(X_train, y_train, X_test, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [9]:
# Function to train and evaluate Decision Tree
def decision_tree(X_train, y_train, X_test, y_test):
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [10]:
# Function to train and evaluate Random Forest
def random_forest(X_train, y_train, X_test, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [11]:
# Function to compile results into a DataFrame
def selectk_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    dataframe = pd.DataFrame(
        data=[[acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf]],
        columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Naive', 'Decision', 'Random'],
        index=['LDA']
    )
    return dataframe

In [12]:
# Load the dataset
dataset1 = pd.read_csv("Wine.csv", index_col=None)

# Correctly access the data
X = dataset1.iloc[:, 0:13].values  # Independent variables
y = dataset1.iloc[:, 13].values    # Dependent variable


In [13]:
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)

In [14]:
df2

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [15]:
# Check shape before applying PCA
print(f"Shape of independent variables before LDA: {X.shape}")

Shape of independent variables before LDA: (178, 13)


In [17]:
# Apply LDA
indep_x_lda = apply_lda(X, y, n_components=2)

In [18]:
indep_x_lda

array([[-4.70024401,  1.97913835],
       [-4.30195811,  1.17041286],
       [-3.42071952,  1.42910139],
       [-4.20575366,  4.00287148],
       [-1.50998168,  0.4512239 ],
       [-4.51868934,  3.21313756],
       [-4.52737794,  3.26912179],
       [-4.14834781,  3.10411765],
       [-3.86082876,  1.95338263],
       [-3.36662444,  1.67864327],
       [-4.80587907,  2.23536271],
       [-3.42807646,  2.17510939],
       [-3.66610246,  2.26248961],
       [-5.58824635,  2.05478773],
       [-5.50131449,  3.61304865],
       [-3.18475189,  2.88952528],
       [-3.28936988,  2.76584266],
       [-2.99809262,  1.42511132],
       [-5.24640372,  3.70982655],
       [-3.13653106,  1.97689922],
       [-3.57747791,  0.5624599 ],
       [-1.69077135,  0.91342136],
       [-4.83515033,  0.9147628 ],
       [-3.09588961,  0.61735888],
       [-3.32164716,  0.29847734],
       [-2.14482223,  0.16369247],
       [-3.9824285 ,  2.17515679],
       [-2.68591432,  1.21850924],
       [-3.56309464,

In [19]:
# Check shape after applying LDA
print(f"Shape of independent variables after LDA: {indep_x_lda.shape}")

Shape of independent variables after LDA: (178, 2)


In [21]:
# Convert LDA results to a DataFrame for display
lda_df = pd.DataFrame(indep_x_lda, columns=["LD1", "LD2"])

In [22]:
lda_df

Unnamed: 0,LD1,LD2
0,-4.700244,1.979138
1,-4.301958,1.170413
2,-3.420720,1.429101
3,-4.205754,4.002871
4,-1.509982,0.451224
...,...,...
173,4.291508,3.390332
174,4.503296,2.083546
175,5.047470,3.196231
176,4.276155,2.431388


In [23]:
# Split and scale data
X_train, X_test, y_train, y_test = split_scalar(indep_x_lda, y)

In [24]:
# Initialize accuracy lists
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = [], [], [], [], [], [], []

# Evaluate models and store accuracies
_, accuracy = logistic(X_train, y_train, X_test, y_test)
acclog.append(accuracy)

_, accuracy = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(accuracy)

_, accuracy = svm_nl(X_train, y_train, X_test, y_test)
accsvmnl.append(accuracy)

_, accuracy = knn(X_train, y_train, X_test, y_test)
accknn.append(accuracy)

_, accuracy = naive_bayes(X_train, y_train, X_test, y_test)
accnav.append(accuracy)

_, accuracy = decision_tree(X_train, y_train, X_test, y_test)
accdes.append(accuracy)

_, accuracy = random_forest(X_train, y_train, X_test, y_test)
accrf.append(accuracy)

In [25]:
acclog

[1.0]

In [26]:
accsvml

[0.9777777777777777]

In [27]:
accsvmnl

[1.0]

In [28]:
accknn


[1.0]

In [29]:
accnav

[1.0]

In [30]:
accdes

[0.9777777777777777]

In [31]:
 accrf

[0.9777777777777777]

In [32]:
# Compile results
#Using -1 ensures that the most recent accuracy value (the last one added to the list) is used in the selectk_classification function.
results = selectk_classification(acclog[-1], accsvml[-1], accsvmnl[-1], accknn[-1], accnav[-1], accdes[-1], accrf[-1])

In [33]:
results

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
LDA,1.0,0.977778,1.0,1.0,1.0,0.977778,0.977778
