In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [3]:
def split_scalar(indep_X, dep_Y):
    """Splits data into training and testing sets and scales them."""
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

In [4]:
def cm_prediction(classifier, X_test, y_test):
    """Predicts and evaluates a classifier, returning accuracy and other metrics."""
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

In [5]:
def logistic(X_train, y_train, X_test, y_test):
    """Fits and evaluates a Logistic Regression classifier."""
    classifier = LogisticRegression(solver='liblinear', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [6]:
def svm_linear(X_train, y_train, X_test, y_test):
    """Fits and evaluates a linear SVM classifier."""
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [7]:
def svm_NL(X_train, y_train, X_test, y_test):
    """Fits and evaluates a non-linear SVM classifier (RBF kernel)."""
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [8]:
def Navie(X_train, y_train, X_test, y_test):
    """Fits and evaluates a Gaussian Naive Bayes classifier."""
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [9]:
def knn(X_train, y_train, X_test, y_test):
    """Fits and evaluates a K-Nearest Neighbors classifier."""
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [10]:
def Decision(X_train, y_train, X_test, y_test):
    """Fits and evaluates a Decision Tree classifier."""
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [11]:
def random(X_train, y_train, X_test, y_test):
    """Fits and evaluates a Random Forest classifier."""
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

In [12]:
def PCA_analysis(indep_X, n_components):
    """Performs PCA on the data and returns the principal components."""
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(indep_X)
    return pca_features

In [13]:
def PCA_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Creates a DataFrame to display the results of PCA-based classification."""
    dataframe = pd.DataFrame(index=['PCA'], columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'])
    dataframe['Logistic']['PCA'] = acclog[0]
    dataframe['SVMl']['PCA'] = accsvml[0]
    dataframe['SVMnl']['PCA'] = accsvmnl[0]
    dataframe['KNN']['PCA'] = accknn[0]
    dataframe['Navie']['PCA'] = accnav[0]
    dataframe['Decision']['PCA'] = accdes[0]
    dataframe['Random']['PCA'] = accrf[0]
    return dataframe

In [20]:
# Load and preprocess the dataset
dataset1 = pd.read_csv("wine.csv", index_col=None)
# Handle NaN and infinity values
for column in df2.columns:
    df2[column] = pd.to_numeric(df2[column], errors='coerce')
    if df2[column].isnull().any():
        df2[column] = df2[column].fillna(df2[column].mean())
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
for column in df2.columns:
    if df2[column].isnull().any():
        df2[column] = df2[column].fillna(df2[column].mean())

In [21]:
# Split the data into independent and dependent variables
indep_X = df2.drop('Customer_Segment', axis=1)
dep_Y = df2['Customer_Segment']

In [36]:
# --- Perform PCA and Classify ---
n_pca_components = 3 
print(f"Applying PCA with {n_pca_components} components...")
pca_features = PCA_analysis(indep_X, n_pca_components)

Applying PCA with 3 components...


In [39]:
# Split and scale the PCA-transformed data
X_train, X_test, y_train, y_test = split_scalar(pca_features, dep_Y)

# Initialize lists to store accuracies
acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

# Train and evaluate all models using the PCA-transformed data
print("Training and evaluating models on PCA-transformed data...")
classifier, Accuracy, report, _, _, _ = logistic(X_train, y_train, X_test, y_test)
acclog.append(Accuracy)
classifier, Accuracy, report, _, _, _ = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(Accuracy)
classifier, Accuracy, report, _, _, _ = svm_NL(X_train, y_train, X_test, y_test)
accsvmnl.append(Accuracy)
classifier, Accuracy, report, _, _, _ = knn(X_train, y_train, X_test, y_test)
accknn.append(Accuracy)
classifier, Accuracy, report, _, _, _ = Navie(X_train, y_train, X_test, y_test)
accnav.append(Accuracy)
classifier, Accuracy, report, _, _, _ = Decision(X_train, y_train, X_test, y_test)
accdes.append(Accuracy)
classifier, Accuracy, report, _, _, _ = random(X_train, y_train, X_test, y_test)
accrf.append(Accuracy)

# Create and print the final results DataFrame
result = PCA_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
print("\nPCA Classification Results:")
result

Training and evaluating models on PCA-transformed data...

PCA Classification Results:


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic']['PCA'] = acclog[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame o

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.888889,0.888889,0.866667,0.822222,0.866667,0.844444,0.844444


In [29]:
#5
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.933333,0.955556,1.0,0.933333,0.977778,0.933333,0.977778


In [32]:
#6
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.933333,0.933333,0.977778,0.955556,0.977778,0.911111,0.955556


In [35]:
#4
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.911111,0.911111,0.933333,0.911111,0.955556,0.911111,0.911111


In [38]:
#3
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.888889,0.888889,0.866667,0.822222,0.866667,0.844444,0.844444
