# PCA - Principle Component Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Data Collection

raw_data = pd.read_csv("CKD.csv")
raw_data

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [3]:
df = raw_data

In [4]:
df = pd.get_dummies(df, dtype=int, drop_first=True)

In [5]:
# Input Output Split

indep_x = df.drop('classification_yes',axis = 1)
dep_y = df['classification_yes']

In [6]:
def cm_prediction(classifier,x_test,y_test):
    y_pred = classifier.predict(x_test)
    
    cm = confusion_matrix(y_test,y_pred)
    Accuracy = accuracy_score(y_test,y_pred)
    Report = classification_report(y_test,y_pred)
    return cm,Accuracy,Report

def logistic(x_train,y_train,x_test,y_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def svm_linear(x_train,y_train,x_test,y_test):
    classifier = SVC(kernel='linear',random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def svm_nonlinear(x_train,y_train,x_test,y_test):
    classifier = SVC(kernel = 'rbf', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def knn(x_train,y_train,x_test,y_test):
    classifier = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p = 2)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def naive(x_train,y_train,x_test,y_test):   
    classifier = GaussianNB()
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def decision(x_train,y_train,x_test,y_test):
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def random(x_train,y_train,x_test,y_test):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test)

def PCA_results(acclog,accsvml,accsvmnl,accknn,accnav,accdc,accrf):
    PCAdataframe = pd.DataFrame(index = ['PCA'], columns = ['Logistic', 'SVM_l', 'SVM_nl', 'KNN',
                                                                                              'Naive','Decision','Random'])
    for index in (PCAdataframe.index):
        PCAdataframe['Logistic'][index] = acclog
        PCAdataframe['SVM_l'][index] = accsvml
        PCAdataframe['SVM_nl'][index] = accsvmnl
        PCAdataframe['KNN'][index] = accknn
        PCAdataframe['Naive'][index] = accnav
        PCAdataframe['Decision'][index] = accdc
        PCAdataframe['Random'][index] = accrf
    return PCAdataframe


In [15]:
# Train Test Split
x_train,x_test,y_train,y_test  = train_test_split(indep_x,dep_y, test_size=0.2, random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# PCA Model Creation
pca = PCA(n_components = 1)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [16]:
# Store Accuracies
acclog =[] 
accsvml = []
accsvmnl = []
accknn = []
accnav = [] 
accdc = []
accrf = []

In [17]:
# Calculate accuracies for each classifier

cm,Accuracy,Report = logistic(x_train_pca, y_train, x_test_pca, y_test)
acclog.append(Accuracy)

cm,Accuracy,Report = svm_linear(x_train_pca, y_train, x_test_pca, y_test)
accsvml.append(Accuracy)

cm,Accuracy,Report = svm_nonlinear(x_train_pca, y_train, x_test_pca, y_test)
accsvmnl.append(Accuracy)

cm,Accuracy,Report = knn(x_train_pca, y_train, x_test_pca, y_test)
accknn.append(Accuracy)

cm,Accuracy,Report = naive(x_train_pca, y_train, x_test_pca, y_test)
accnav.append(Accuracy)

cm,Accuracy,Report = decision(x_train_pca, y_train, x_test_pca, y_test)
accdc.append(Accuracy)

cm,Accuracy,Report = random(x_train_pca, y_train, x_test_pca, y_test)
accrf.append(Accuracy)

# Create the results DataFrame
result = PCA_results(acclog[0], accsvml[0], accsvmnl[0], accknn[0], accnav[0], accdc[0], accrf[0])


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  PCAdataframe['Logistic'][index] = acclog
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame o

In [10]:
result
# 2

Unnamed: 0,Logistic,SVM_l,SVM_nl,KNN,Naive,Decision,Random
PCA,0.9875,0.9875,1.0,1.0,1.0,1.0,1.0


In [18]:
result
# 1

Unnamed: 0,Logistic,SVM_l,SVM_nl,KNN,Naive,Decision,Random
PCA,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Choose the Best Model

In [14]:
best_model_accuracy = result.max().max()  # Overall max accuracy
best_model_location = result.stack().idxmax()  # Find exact location (row, column) of max accuracy
best_model = best_model_location[0]  # This gives the exact column (model) name

In [15]:
if best_model == 'Logistic':
    best_model = LogisticRegression(random_state=0)
elif best_model == 'SVM_l':
    best_model = SVC(kernel = 'linear', random_state=0)
elif best_model == 'SVM_nl':
    best_model = SVC(kernel = 'rbf', random_state=0)
elif best_model == 'KNN':
    best_model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p =2)
elif best_model == 'Naive':
    best_model = GaussianNB()
elif best_model == 'Decision':
    best_model = DecisionTreeClassifier(criterian = 'entropy', random_state=0)
elif best_model == 'Random':
    best_model = RandomFoprestClassifier(n_estimators = 10, criterion = 'entropy', random_state=0)


In [16]:
best_model.fit(x_train_pca, y_train)

AttributeError: 'str' object has no attribute 'fit'

### Save the Best Model

In [None]:
print(f"Best model ({best_model}) saved as {filename} with accuracy: {best_model_accuracy}")

### Getting User input

In [None]:
age_input=int(input("Age: "))

### Preprocess the User Input

In [20]:
User_input_prepro = sc.fit_transform([[age_input,bp_input,al_input,su_input,bgr_input,bu_input,sc_input,sod_input,pot_input,hrmo_input,pcv_input,wc_input,rc_input,sg_b_input,sg_c_input,sg_d_input,sg_e_input,rbc_normal_input,pc_normal_input,pcc_present_input,ba_present_input,htn_yesy_input,dm_yes_input,cad_yes_input,appet_yes_input,pe_yes_input,ane_yes_input]])
User_input_prepro

NameError: name 'age_input' is not defined

### Predictions

In [None]:
Predictions = best_model.predict([[age_input,bp_input,al_input,su_input,bgr_input,bu_input,sc_input,sod_input,pot_input,hrmo_input,pcv_input,wc_input,rc_input,sg_b_input,sg_c_input,sg_d_input,sg_e_input,rbc_normal_input,pc_normal_input,pcc_present_input,ba_present_input,htn_yesy_input,dm_yes_input,cad_yes_input,appet_yes_input,pe_yes_input,ane_yes_input]])
Predictions