<a href="https://www.kaggle.com/code/pratikpal1/diabetics-detection-using-various-ml-models?scriptVersionId=108591617" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling as pp

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
import xgboost
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats
%matplotlib inline

import os
#from IPython.display import Image

#import shutil
#shutil.rmtree("../input/diabetes-dataset")


In [None]:
# Load the diabetes CSV from the Kaggle input directory into the working DataFrame `df`.
df = pd.read_csv('../input/diabetes-dataset/diabetes2.csv')

## EDA

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Count the missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

#### There aren't any blanks and there are 768 records and 9 features
#### Below is a heatmap to check if there is any blank data

In [None]:
# Visual check for missing data: plot the boolean "is missing" mask of the frame.
missing_mask = df.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False)

### Removing low-variance features (features with variance less than 0.1)

In [None]:
# Collect quasi-constant features: columns whose variance is below 0.1.
# The variance Series is filtered with a boolean mask so that every variance
# stays paired with its own column label; the previous manual positional
# counter silently mislabels columns whenever `df.var()` and `df.columns`
# do not line up one-to-one.
var = df.var()
quasi_const = var[var < 0.1].index.tolist()
print(quasi_const)

#### There are no columns with variance less than 0.1

#### seems the data set is already cleaned and ready to go

In [None]:
# Pairwise scatter plots of all columns, coloured by the target column 'Outcome'.
sns.pairplot(df, hue = 'Outcome')

#### Checking the distribution of the target column

In [None]:
# Class balance of the target: one bar per 'Outcome' value, coloured by that value.
outcome = df['Outcome']
sns.histplot(x=outcome, hue=outcome)
plt.show()

#### There are more healthy patients than Diabetic patients, we need to check if that would affect our predictions models

#### Getting a profile report to check the correlations and the awesome visualizations

#### Note: The line '%matplotlib inline' needs to be written after importing pandas_profiling; otherwise, we may not be able to see the graphs that we plot after using the profile report function

In [None]:
# Build the pandas-profiling report for `df`; being the last expression of the
# cell, the notebook renders the returned report object.
pp.ProfileReport(df)

#### BloodPressure is correlated with BMI
#### Pregnancies are correlated with Age
#### BloodPressure, SkinThickness, Insulin cannot be zero, they have to be filled
#### We will be filling the mean value of the columns for every zero that we get

## Data Cleaning

#### From the profile report we could see that the insulin has a large number of zeros, as such lets drop the column.
#### Also for the features like BloodPressure and SkinThickness 0s are replaced by mean

In [None]:
# Insulin consists mostly of zeros (see the profile report), so the column is dropped.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='Insulin', errors='ignore')

# A value of 0 is not a valid measurement for these features: treat it as
# missing and impute it with the mean of the remaining (non-zero) values.
# The result is assigned back to the column instead of calling
# `df[i].replace(..., inplace=True)`: an inplace call on the temporary `df[i]`
# is chained assignment, which does not update `df` under pandas Copy-on-Write.
for i in ['BloodPressure', 'SkinThickness']:
    observed = df[i].replace(0, np.nan)
    df[i] = observed.fillna(observed.mean())

#### some of the features are skewed, we will have to handle this before training the model

In [None]:
# Skewness of every column after cleaning.
df.skew()

#### Visualizing the features for skewness

In [None]:
# For every feature draw a class-coloured histogram (left) next to a normal
# probability plot (right). The target column is filtered out up front so that
# `plt.show()` is only ever called for a figure that was actually created.
for i in df.columns.drop('Outcome'):
    plt.figure(figsize=(14, 4))

    # The figure has 1 row and 2 columns; this is the first plot.
    plt.subplot(1, 2, 1)
    sns.histplot(x=df[i], hue=df['Outcome'])
    plt.title(i)

    # Second plot of the same figure: sample quantiles against a normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(x=df[i], dist='norm', plot=plt)

    plt.show()

## Power transform
#### We need to normalize and transform the features to remove the skewness so that they resemble the normal distribution
[![1-LXu-BIQBwor-Bt-Fub0lxp-STg.png](https://i.postimg.cc/63ZSk5fh/1-LXu-BIQBwor-Bt-Fub0lxp-STg.png)](https://postimg.cc/gXYNRp5w)
#### Here we use 'yeo-johnson' because it can handle negative and zero values

In [None]:
# Yeo-Johnson power transform of every feature column.
# The feature columns are selected by name rather than with `df.columns[:-1]`,
# which silently assumes that 'Outcome' is the last column, and the original
# index is carried over so that re-attaching 'Outcome' aligns row by row even
# if `df` does not have a default RangeIndex.
# NOTE(review): the transformer is fitted on the full data set before the
# train/test split, so test rows influence the fitted parameters.
feature_cols = df.columns.drop('Outcome')
pt = PowerTransformer(method='yeo-johnson')
df_pt = pd.DataFrame(
    data=pt.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,
)
df_pt = df_pt.assign(Outcome=df['Outcome'])

In [None]:
# Same diagnostic as before the transform: class-coloured histogram (left) and
# normal probability plot (right) for every transformed feature. The target is
# filtered out up front so `plt.show()` only runs for figures that exist.
for i in df_pt.columns.drop('Outcome'):
    plt.figure(figsize=(14, 4))

    plt.subplot(1, 2, 1)
    sns.histplot(x=df_pt[i], hue=df_pt['Outcome'])
    plt.title(i)

    plt.subplot(1, 2, 2)
    stats.probplot(x=df_pt[i], dist='norm', plot=plt)

    plt.show()

#### The feature distributions are normalized. This should help with faster convergence and better results for the logistic regression
#### Checking the skew for the transformed data

In [None]:
# Skewness of every column of the power-transformed frame.
df_pt.skew()

In [None]:
# From here on `df` refers to the power-transformed frame (same object as `df_pt`, not a copy).
df = df_pt

In [None]:
# Annotated correlation matrix of the transformed data.
corr_matrix = df.corr()
plt.figure(figsize=(16, 8))
sns.heatmap(corr_matrix, annot=True, cbar=False)

## Machine learning

#### Splitting the data. We will try to train and test all the models on the same sets

In [None]:
# Target vector and feature matrix, then a fixed-seed 70/30 hold-out split that
# every model below is trained and evaluated on.
y = df['Outcome']
X = df.drop(columns='Outcome')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### The block below is not necessary as the features have already been transformed using the power transformer

In [None]:
# Disabled on purpose (kept for reference): standard scaling of the split data.
# If re-enabled, the scaler is fitted on the training set and only applied
# (`transform`) to the test set, so test-set statistics never leak into it.
#st_sclr = StandardScaler()
#X_train = pd.DataFrame(st_sclr.fit_transform(X_train),columns = X_train.columns)
#X_test = pd.DataFrame(st_sclr.transform(X_test),columns = X_test.columns)

#### Plotting the training and testing data

In [None]:
# One figure per feature: training distribution (green, left) next to the test
# distribution (right), to eyeball whether the split is comparable.
for i in X_train.columns:
    plt.figure(figsize=(14, 4))

    plt.subplot(1, 2, 1)
    train_values = X_train[i]
    sns.histplot(train_values, color='Green')

    plt.subplot(1, 2, 2)
    test_values = X_test[i]
    sns.histplot(test_values)

plt.show()

In [None]:
# Per-feature mean and standard deviation of the training and the test split.
# `agg` labels the two rows 'mean' and 'std'; building the frame from a list of
# unnamed Series left them as anonymous rows 0 and 1.
df_mean_std_train = X_train.agg(['mean', 'std'])
df_mean_std_test = X_test.agg(['mean', 'std'])
df_mean_std_train.head()


In [None]:
# Display the mean / standard-deviation table computed for the test split.
df_mean_std_test.head()

#### The mean is now almost 0 and the standard deviation is close to 1 after transforming the input data
#### Generally a scaler should be fitted on the training data only and then applied to the test data, so that no information from the test set leaks into training

## Logistic Regression

#### Logistic regression is based on the sigmoid function f(x) = 1/(1+exp(-x)).
#### Based on what we enter as 'x' we get a probability value between 0 and 1.
#### We then fix a threshold on this output (e.g. 0.5):
#### below the threshold the sample is classified as 0 and above it the sample is classified as 1

[![images.png](https://i.postimg.cc/v83LQw27/images.png)](https://postimg.cc/BtP1gk0t)

In [None]:
# Fit a default logistic regression on the training split (`fit` returns the
# estimator itself) and predict the held-out test split.
lrmodel = LogisticRegression().fit(X_train, y_train)
lr_predict = lrmodel.predict(X_test)

#### getting the performance of the LR model

In [None]:
# Text report of the per-class metrics for the logistic regression.
lr_report = classification_report(y_test, lr_predict)
print(lr_report)

In [None]:
# Left: confusion matrix as a percentage of all test samples.
# Right: the classification report as a heat map, without its last column.
con_mat = confusion_matrix(y_test, lr_predict)
con_mat_pct = con_mat / con_mat.sum() * 100
report_table = pd.DataFrame(classification_report(y_test, lr_predict, output_dict=True)).T

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(con_mat_pct, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(report_table.iloc[:, :-1], annot=True)

plt.show()

## Stratified K-fold cross validation
#### Which is pretty good

#### Let's check the cross validation scores (Stratified K-fold cross validation).
#### Here we divide the whole data set into 6 splits and train the model 6 times, taking a different test set each time
#### We will then take the average of all the accuracy for all the runs
[![Illustration-of-the-K-Fold-Cross-Validation-Algorithm.jpg](https://i.postimg.cc/F1XhMBSY/Illustration-of-the-K-Fold-Cross-Validation-Algorithm.jpg)](https://postimg.cc/WqSRrSXV)
#### Here we are taking the entire set of the features and scaling them before applying the cross validation

In [None]:
# Stratified 6-fold splitter shared by every cross-validation run below.
skfold = StratifiedKFold(n_splits = 6)

In [None]:
# Stratified cross-validation of a fresh logistic regression on the full data;
# report the best, worst and average fold score.
model = LogisticRegression()
accuracy = cross_val_score(model, X, y, cv=skfold)
best_fold, worst_fold, mean_fold = accuracy.max(), accuracy.min(), accuracy.mean()
print()
print('max', best_fold, ' | ', 'min', worst_fold, ' | ', 'avg', mean_fold)

## K-nearest neighbours

#### As the name suggests, this algorithm basically classifies a point based on the class of the neighbouring points. It works with the following steps
#### 1. Calculate the distance from the point X (the point for which we will predict) to all points in the data
#### 2. Sort the points in your data by increasing distance from X
#### 3. Predict the majority label of the k closest points (k is determined by the minimum error rate graph)

#### Note: For smaller k (k=1) we get a lot of variability and noise.
#### Larger K leads to more bias and low variability, which helps form a boundary at the cost of mislabeling some points
#### This is not suited for categorical features

[![0-It-VKiyx2-F3-ZU8z-V5.png](https://i.postimg.cc/NjYHTNnG/0-It-VKiyx2-F3-ZU8z-V5.png)](https://postimg.cc/tYSJGzy8)

#### Here if we take K (the number of neighbours) = 3 the '?' is classified as B but if we take K = 7 the same point would be classified as A.
#### We carry out this process for multiple values of K and select the value of K for which we get minimum number of deviations

In [None]:
# Baseline KNN with a single neighbour: fit on the training split (`fit`
# returns the estimator itself) and predict the test split.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred_knn = knn.predict(X_test)

#### Predicting results based on KNN with only one neighbor

In [None]:
# Text report of the per-class metrics for the 1-neighbour model.
knn_report = classification_report(y_test, pred_knn)
print(knn_report)

In [None]:
# Left: confusion matrix of the 1-neighbour model as a percentage of all test
# samples. Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, pred_knn)
share_of_total = con_mat / con_mat.sum() * 100
knn_metrics = pd.DataFrame(classification_report(y_test, pred_knn, output_dict=True)).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(share_of_total, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(knn_metrics, annot=True)

plt.show()

#### number of neighbors = 1 yields high errors. Thus we have to find the ideal number of neighbours to minimize errors

#### Here I take an empty list called error_rate, fit the KNN model for 1 to 39 neighbours and append the fraction of mismatches between the predicted value and the actual labelled value on the test set. KNN is a supervised algorithm

In [None]:
# Test-set error rate (fraction of misclassified samples) for k = 1 .. 39.
error_rate = []

for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred = knn.predict(X_test)
    mismatches = pred != y_test
    error_rate.append(np.mean(mismatches))

In [None]:
# Error rate against the number of neighbours that produced it.
k_values = range(1, 40)
sns.lineplot(x=k_values, y=error_rate, dashes=True)

#### k = 30 seems to yield reasonable errors

In [None]:
# Refit KNN with the chosen k = 30, predict the test split and print the raw
# confusion matrix.
knn = KNeighborsClassifier(n_neighbors=30).fit(X_train, y_train)
pred = knn.predict(X_test)

knn_con_mat = confusion_matrix(y_test, pred)
print(knn_con_mat)

In [None]:
# Left: confusion matrix of the k = 30 model as a percentage of all test
# samples. Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, pred)
con_mat_pct = con_mat / con_mat.sum() * 100
report_frame = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
metric_columns = report_frame.T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(con_mat_pct, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(metric_columns, annot=True)

plt.show()

#### We obtain 74% accuracy from this model, which isn't enough. Lets go for the stratified K fold cross validation before check out decision trees & random forest

In [None]:
# Cross-validate the same KNN configuration that was selected from the
# error-rate plot and evaluated on the hold-out split (k = 30). The cell
# previously used k = 20, so the cross-validation score described a different
# model from the one discussed above.
model = KNeighborsClassifier(n_neighbors=30)
accuracy = cross_val_score(model, X, y, cv=skfold)
print('max', accuracy.max(), ' | ', 'min', accuracy.min(), ' | ', 'avg', accuracy.mean())

#### There is a large variation in the accuracies obtained while the average accuracy is roughly 75%

## Decision trees and random forests
## Decision Trees
#### Decision Trees consists of root, decision and leaves. We consider the features and split the records on the basis of a feature.
[![1-z-Mu0-UClot-NXljrjqmy-RIHA.png](https://i.postimg.cc/zv09j2jm/1-z-Mu0-UClot-NXljrjqmy-RIHA.png)](https://postimg.cc/644j60gz)
#### We consider the purity (based on how clearly the classes are separated) of the daughter nodes after splitting. (Measured through Gini impurity and entropy)
[![1-x5-W-NTWo-Ne-STex-V2-Ps-FICQ.png](https://i.postimg.cc/4N00x921/1-x5-W-NTWo-Ne-STex-V2-Ps-FICQ.png)](https://postimg.cc/1fpBWfQg)
#### The above is maximizing the information gain

In [None]:
# Fit a decision tree on the training split and predict the test split.
# The tree is seeded so that the reported metrics are the same on every run;
# an unseeded tree can change from one run to the next.
dt = DecisionTreeClassifier(random_state=42)

dt.fit(X_train, y_train)

pred_dt = dt.predict(X_test)

In [None]:
# Text report of the per-class metrics for the decision tree.
dt_report = classification_report(y_test, pred_dt)
print(dt_report)

In [None]:
# Left: decision-tree confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, pred_dt)
percent_matrix = con_mat / con_mat.sum() * 100
dt_metrics = pd.DataFrame(classification_report(y_test, pred_dt, output_dict=True)).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(percent_matrix, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(dt_metrics, annot=True)

plt.show()

#### This isn't that much better.. maybe random forests will be better, as we have 768 records.
#### Note: Also the above results are not consistent.. I got much worse performance on my previous run.
#### let's do a cross validation to check the variation in accuracy

In [None]:
# Stratified cross-validation of a decision tree on the full data.
# The tree is seeded so the fold scores are reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
accuracy = cross_val_score(model, X, y, cv=skfold)
print()
print('max', accuracy.max(), ' | ', 'min', accuracy.min(), ' | ', 'avg', accuracy.mean())

#### As expected the minimum is only 69.79% accurate. This is because Decision Trees are famously prone to overfitting the training set

## Random Forest

#### This involves bagging many decision trees (an ensemble). Decision trees are susceptible to overfitting
#### In case of Random Forest we take the features and randomly select some of them to train our trees
#### The number of randomly selected features used for training the trees (m) is the square root of the number of features in the full set (p)

[![images-1.png](https://i.postimg.cc/JhdmcvqS/images-1.png)](https://postimg.cc/McRhzsgY)

In [None]:
# Fit a default random forest on the training split and predict the test split.
# The forest is seeded so that the reported metrics are reproducible.
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)

pred_rfc = rfc.predict(X_test)

In [None]:
# Text report of the per-class metrics for the random forest.
rfc_report = classification_report(y_test, pred_rfc)
print(rfc_report)

In [None]:
# Left: random-forest confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, pred_rfc)
con_mat_pct = con_mat / con_mat.sum() * 100
rfc_report_dict = classification_report(y_test, pred_rfc, output_dict=True)
rfc_metrics = pd.DataFrame(rfc_report_dict).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(con_mat_pct, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(rfc_metrics, annot=True)

plt.show()

#### This is much better. 
#### Lets try cross validation on this before moving on to SVM

In [None]:
# Stratified cross-validation of a default random forest on the full data.
# The forest is seeded so the fold scores are reproducible between runs.
model = RandomForestClassifier(random_state=42)
accuracy = cross_val_score(model, X, y, cv=skfold)
print()
print('max', accuracy.max(), ' | ', 'min', accuracy.min(), ' | ', 'avg', accuracy.mean())

#### Though this is the highest average accuracy but there is a significant variation

## Random Search CV for the Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

### Parameter Description
#### bootstrap makes each tree of the forest train on a random sample of the training records (drawn with replacement) instead of on all the records at once; this lowers variance
#### max_depth is the maximum number of levels for the trees
#### max_features is the number of features to consider when looking for the best split: If int, then consider max_features features at each split.
#### min_samples_leaf is the minimum number of samples required to be at a leaf node. It is similar to min_samples_split; however, it describes the minimum number of samples at the leaves
#### min_samples_split the minimum number of samples required to split an internal node
#### n_estimators is the number of individual trees in the forest

In [None]:
# Search space for the random-forest randomized search.
# 'auto' is no longer offered for max_features: it has been removed from recent
# scikit-learn releases and, for classifiers, was only an alias of 'sqrt'
# (so the old list contained a single distinct choice). 'log2' takes its place
# as a genuinely different option.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [None]:
# Randomized hyper-parameter search for the random forest on the training split.
# The search is seeded so that the same candidate parameter sets are sampled on
# every run; unseeded, each run could explore (and pick) different settings.
rfc = RandomForestClassifier()

rand_search_RF = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_grid,
    random_state=42,
    verbose=1,
)

rand_search_RF.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best during the randomized search.
rand_search_RF.best_params_

#### The results are as expected.. We will have to refit the data to a new model (rand_search_RF.best_estimator_) with the above parameters

In [None]:
# Estimator configured with the best parameters found by the search.
rand_search_RF.best_estimator_

In [None]:
# Build the final forest from whatever the randomized search actually found,
# instead of hyper-parameters copied by hand from one earlier run: hand-copied
# values go stale as soon as the search returns something else, and they
# included max_features='auto', which recent scikit-learn releases reject.
final_rfc = RandomForestClassifier(**rand_search_RF.best_params_)
final_rfc.fit(X_train, y_train)
final_pred = final_rfc.predict(X_test)

In [None]:
# Text report of the per-class metrics for the tuned random forest.
final_report = classification_report(y_test, final_pred)
print(final_report)

In [None]:
# Left: tuned-forest confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, final_pred)
as_percent = con_mat / con_mat.sum() * 100
final_metrics = pd.DataFrame(classification_report(y_test, final_pred, output_dict=True)).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(as_percent, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(final_metrics, annot=True)

plt.show()

In [None]:
# Stratified cross-validation of the tuned forest's configuration on the full data.
accuracy = cross_val_score(final_rfc, X, y, cv=skfold)
highest, lowest, average = accuracy.max(), accuracy.min(), accuracy.mean()
print()
print('max', highest, ' | ', 'min', lowest, ' | ', 'avg', average)

#### Which is only slightly worse than the untuned SVM (covered in the next section)
#### Ok, let's move on to SVM; later we will also do the 'PCA', which is an attempt to reduce the number of dimensions.

## SVM (Support Vector Machines)

#### Here we draw boundaries in n dimensions (n = number of features) which separate the classes.
#### In case we cannot draw a clear boundary due to overlapping or classes being arranged in concentric circles, we take higher dimensions.
#### A higher dimension is a feature raised to a power plotted over a new axis (using kernels)

[![Support-Vector-Machine-visualization.png](https://i.postimg.cc/g0wc5GnG/Support-Vector-Machine-visualization.png)](https://postimg.cc/kVmdR3QH)


In [None]:
# Fit a default support vector classifier on the training split (`fit` returns
# the estimator itself) and predict the test split.
model = SVC().fit(X_train, y_train)
svm_prediction = model.predict(X_test)

In [None]:
# Text report of the per-class metrics for the untuned SVM.
svm_report = classification_report(y_test, svm_prediction)
print(svm_report)

In [None]:
# Left: SVM confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, svm_prediction)
con_mat_pct = con_mat / con_mat.sum() * 100
svm_table = pd.DataFrame(classification_report(y_test, svm_prediction, output_dict=True)).T
svm_metrics = svm_table.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(con_mat_pct, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(svm_metrics, annot=True)

plt.show()

#### I was not expecting such a high accuracy. Let's check the cross validation and then check whether tuning the parameters using grid search improves the accuracy

In [None]:
# Stratified cross-validation of a default SVM on the full data.
model = SVC()
accuracy = cross_val_score(model, X, y, cv=skfold)
top, bottom, mean_score = accuracy.max(), accuracy.min(), accuracy.mean()
print()
print('max', top, ' | ', 'min', bottom, ' | ', 'avg', mean_score)

#### This has surprisingly given me the second highest accuracy and the least variation

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

#### C controls the penalty for misclassifying training points: a larger C fits the training data more closely, so it increases variability and lowers bias
#### gamma: it is a parameter of the kernel (which is rbf by default). A smaller gamma means a wider kernel and a smoother decision boundary (more bias, less variability); a high gamma decreases bias and increases variability
#### Low variability (together with high bias) means the SVM is not doing a good job of fitting the training data (I think). If that is the case, C and/or gamma should be higher than default; the grid search below checks this

In [None]:
# Search space for the SVM grid search: C and the kernel coefficient gamma.
# 0.1 is included so the gamma values form an unbroken sequence of powers of
# ten; the previous grid jumped from 1 straight to 0.01 and left that decade
# unexplored.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

In [None]:
# Exhaustive search over `param_grid` for an SVM, fitted on the training split.
svm_estimator = SVC()
grid = GridSearchCV(estimator=svm_estimator, param_grid=param_grid, verbose=1)

grid.fit(X_train, y_train)

In [None]:
# Predict the test split with the fitted grid-search object.
grid_pred = grid.predict(X_test)

In [None]:
# Text report of the per-class metrics for the grid-searched SVM.
grid_report = classification_report(y_test, grid_pred)
print(grid_report)

In [None]:
# Left: tuned-SVM confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, grid_pred)
pct_of_samples = con_mat / con_mat.sum() * 100
grid_metrics = pd.DataFrame(classification_report(y_test, grid_pred, output_dict=True)).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(pct_of_samples, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(grid_metrics, annot=True)

plt.show()

In [None]:
# Nested cross-validation: a fresh grid search is run inside every outer
# stratified fold, and the outer fold scores are summarised.
model = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=1)
accuracy = cross_val_score(model, X, y, cv=skfold)
best_score, worst_score, avg_score = accuracy.max(), accuracy.min(), accuracy.mean()
print()
print('max', best_score, ' | ', 'min', worst_score, ' | ', 'avg', avg_score)

#### Which is only slightly worse than Random Search CV for Random Forest

## K means Clustering



#### K means is an unsupervised learning algorithm, used to divide data into distinct groups such that observations within each group are similar
#### Here the data set is labelled, but we are going to ignore the labels
#### First we have to choose K, usually with the elbow method by plotting the number of clusters against the SSE (here we will have 2, i.e. diabetic or non-diabetic)
#### 1. Assign each point to a cluster randomly
#### Until the clusters stop changing keep doing below:
#### 2. Calculate the centroid for each cluster by taking the mean vector of the points in the cluster
#### 3. Assign each data point to the cluster whose centroid is closest

[![images-2.png](https://i.postimg.cc/qMXGhTBB/images-2.png)](https://postimg.cc/KRjTwCgC)

In [None]:
# Cluster the test features into two groups without looking at the labels.
# random_state makes the clustering (and which cluster gets which id)
# repeatable between runs; n_init is spelled out so the number of restarts does
# not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_test)

In [None]:
# Coordinates of the two fitted cluster centres, one row per cluster.
kmeans.cluster_centers_

In [None]:
# Use the cluster assignments as predictions for the test split.
km_pred = kmeans.labels_

# Cluster ids are arbitrary: cluster 0 is not necessarily "Outcome 0". If the
# raw ids agree with the true labels on fewer than half of the samples, the two
# ids are swapped so that the report below is not scored against an inverted
# labelling.
if np.mean(km_pred == y_test.to_numpy()) < 0.5:
    km_pred = 1 - km_pred

print(classification_report(y_test, km_pred))

In [None]:
# Left: K-means confusion matrix as a percentage of all test samples.
# Right: its classification report without the last column.
con_mat = confusion_matrix(y_test, km_pred)
con_mat_pct = con_mat / con_mat.sum() * 100
km_report_dict = classification_report(y_test, km_pred, output_dict=True)
km_metrics = pd.DataFrame(km_report_dict).T.iloc[:, :-1]

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.heatmap(con_mat_pct, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')

plt.subplot(1, 2, 2)
sns.heatmap(km_metrics, annot=True)

plt.show()

#### As far as detecting diseases goes, we are interested in minimizing the false negatives.
#### K Means provides us with the lowest false negative rate, though this comes at the cost of a higher number of false positives

## PCA

In [None]:
# Number of feature dimensions: every column except the target.
n_dimensions = df.shape[1] - 1
print(n_dimensions)

# Names of those dimensions.
dimension_names = df.columns.drop('Outcome').values
print(dimension_names)

#### Lets center and scale the data, so that the mean is 0 and standard deviation is 1

In [None]:
# Centre and scale every feature (all columns except the target) over the
# whole data set.
std_scaler = StandardScaler()
features_only = df.drop(columns='Outcome')
scaled_data = std_scaler.fit_transform(features_only)

In [None]:
# Wrap the scaled array in a DataFrame again, labelled with all column names of
# `df` except the last one.
scaled_columns = df.columns[:-1]
scaled_feat = pd.DataFrame(data=scaled_data, columns=scaled_columns)
scaled_feat.head()

#### I have a feeling that I have done this before, but we do this again so as to compute the mean and standard deviation for the whole data set at once

#### Let's try finding the 2 components that contribute most to the end result

In [None]:
# Fit a 2-component PCA on the scaled features (`fit` returns the estimator
# itself), project the same data onto those components, then show the shape of
# the input for comparison with the projection.
pca = PCA(n_components=2).fit(scaled_feat)
x_pca = pca.transform(scaled_feat)

scaled_feat.shape

In [None]:
# Shape of the projected data: same rows as the input, n_components columns.
x_pca.shape

#### the number of features have been reduced to 2

In [None]:
# Scatter of the first against the second principal component, coloured by the target.
first_component = x_pca[:, 0]
second_component = x_pca[:, 1]
sns.scatterplot(x=first_component, y=second_component, hue=df['Outcome'])

#### There isn't a very clear distinction. Also, the first feature (x) and the second feature (y) do not relate 1 to 1 with the features that we had; they are more like a combination of them
#### we could show this in a heat map

In [None]:
# One row per principal component, one column per original feature
# (all columns of `df` except the last one).
component_weights = pca.components_
df_pca = pd.DataFrame(data=component_weights, columns=df.columns[:-1])

In [None]:
# Annotated heat map of the component weights held in `df_pca`.
sns.heatmap(df_pca,annot=True)

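#### The lighter the color, the larger (more positive) the weight of the feature in the principal component; note that strongly negative weights (the darkest cells) also mean the feature is strongly related to the component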

#### You will have to download the note book and run it.. I am not able to save this with the graphs due to the 1 MB limit.. let me know if there is a way to work around this