<a href="https://colab.research.google.com/github/SarkarPriyanshu/DataScienceAssign/blob/main/Task_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### `Task` How dimensionality reduction using Principal Component Analysis (PCA) on the Wine Quality dataset contributes to improving the classification accuracy and efficiency of wine type.

Note : Use KNN for Classification.

Data Link :  [Wine Data](https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv)

In [3]:
# Data Loading
import pandas as pd
wine_data_path = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv"
wine = pd.read_csv(wine_data_path)

In [2]:
wine.shape

(6497, 13)

In [8]:
wine.head(5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [52]:
wine.isna().sum()

fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
type_white               0
dtype: int64

In [53]:
wine.dropna(inplace=True)

In [6]:
wine.type.unique()

array(['white', 'red'], dtype=object)

In [9]:
wine = pd.get_dummies(wine, prefix_sep='_', columns=['type'], drop_first=True, dtype=int)

In [22]:
2138/wine.shape[0] * 100

32.907495767277204

In [17]:
dict(wine.quality.value_counts())

{6: 2836, 5: 2138, 7: 1079, 4: 216, 8: 193, 3: 30, 9: 5}

In [16]:
from sklearn.utils import resample

In [54]:
# This function is to handle the imbalance class
def rebalancedata(df=None,target=None,sample_count=2000):
    keys_to_handle = list()
    keys_not_to_handle = list()

    value_counts = dict(df[target].value_counts())

    # This will seperate the values that are need to be resampled
    for key,value in value_counts.items():
      if value/df.shape[0] * 100 < 30:
        keys_to_handle.append(key)  # this keys are less data points (minority points)
      else:
        keys_not_to_handle.append(key) # this keys are high data points (majority points)

    for indx in range(len(keys_to_handle)):
      # we just seperating out the data points in terms of target minoity class to resample it to match majority class
      keys_to_handle[indx] = df.loc[df[target]==keys_to_handle[indx]]

    for indx in range(len(keys_not_to_handle)):
      # we just seperating out the majority class
      keys_not_to_handle[indx] = df.loc[df[target]==keys_not_to_handle[indx]]

    # Here we are resampling the minority class to match majority class
    for indx in range(len(keys_to_handle)):
        keys_to_handle[indx] = resample(keys_to_handle[indx], replace=True, n_samples=sample_count, random_state=42)

    new_bank_df = pd.concat([*keys_to_handle, *keys_not_to_handle])
    return new_bank_df

new_wine = rebalancedata(wine,'quality')

In [55]:
print(new_wine.quality.value_counts(),wine.quality.value_counts())

quality
6    2820
5    2128
7    2000
4    2000
8    2000
3    2000
9    2000
Name: count, dtype: int64 quality
6    2820
5    2128
7    1074
4     214
8     192
3      30
9       5
Name: count, dtype: int64


In [117]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score,classification_report

In [57]:
X = new_wine.drop('quality',axis=1)
y = new_wine['quality']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [59]:
X_train.shape, X_test.shape

((10463, 12), (4485, 12))

In [60]:
sc = StandardScaler()
X_train_tsf = sc.fit_transform(X_train,y_train)
X_test_tsf = sc.transform(X_test)

In [63]:
f1_score_train = dict()
f1_score_test = dict()

for n_neighbor in range(1,15):
  Knc = KNeighborsClassifier(n_neighbors=n_neighbor)
  Knc.fit(X_train_tsf,y_train)
  X_train_pred = Knc.predict(X_train_tsf)
  f1_score_train[n_neighbor] = f1_score(y_train,X_train_pred,average='weighted')
  X_test_pred = Knc.predict(X_test_tsf)
  f1_score_test[n_neighbor] = f1_score(y_test,X_test_pred,average='weighted')



In [72]:
import plotly.graph_objects as go
import plotly.express as px

neighbors = list(range(1, 16))

# Create figure
fig = go.Figure()

# Add training F1 score line
fig.add_trace(go.Scatter(x=neighbors, y=list(f1_score_train.values()), mode='lines', name='Training F1 Score', line=dict(color='blue')))

# Add test F1 score line
fig.add_trace(go.Scatter(x=neighbors, y=list(f1_score_test.values()), mode='lines', name='Test F1 Score', line=dict(color='red')))

# Update layout
fig.update_layout(title='Training and Test F1 Scores Across Different Number of Neighbors',
                  xaxis_title='Number of Neighbors',
                  yaxis_title='F1 Score',
                  legend=dict(x=0, y=1, traceorder='normal'))

# Show figure
fig.show()

In [73]:
from sklearn.decomposition import PCA

In [75]:
pca = PCA(n_components=None)

X_train_tsf_pca = pca.fit_transform(X_train_tsf)

In [84]:
import numpy as np
import plotly.graph_objects as go

In [102]:
import plotly.express as px

cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

fig = px.line(x=list(range(1, len(cumulative_variance_ratio) + 1)),
              y=cumulative_variance_ratio,
              title='Cumulative Sum of Variance of Eigenvalues')

# Update axes labels
fig.update_xaxes(title_text='Number of Features')
fig.update_yaxes(title_text='Cumulative Sum of Variance of Eigenvalues')

# Show plot
fig.show()

In [110]:
pd.concat([pd.Series(X.columns)
        ,pd.Series(pca.explained_variance_)
        ,pd.Series(np.cumsum(pca.explained_variance_ratio_))
           ],axis=1)\
           .rename(columns={0:'Feature Names',1:'Explained Variance',2:'Cumilative Variance'})

Unnamed: 0,Feature Names,Explained Variance,Cumilative Variance
0,fixed acidity,3.514802,0.292872
1,volatile acidity,2.662458,0.514722
2,citric acid,1.579352,0.646323
3,residual sugar,1.132551,0.740693
4,chlorides,0.731289,0.801628
5,free sulfur dioxide,0.662171,0.856803
6,total sulfur dioxide,0.549613,0.9026
7,density,0.490801,0.943496
8,pH,0.30613,0.969005
9,sulphates,0.221215,0.987437


In [111]:
pca = PCA(n_components=7)

X_train_tsf_pca = pca.fit_transform(X_train_tsf)
X_test_tsf_pca = pca.transform(X_test_tsf)

In [114]:
f1_score_train = dict()
f1_score_test = dict()

for n_neighbor in range(1,15):
  Knc = KNeighborsClassifier(n_neighbors=n_neighbor)
  Knc.fit(X_train_tsf_pca,y_train)
  X_train_pred = Knc.predict(X_train_tsf_pca)
  f1_score_train[n_neighbor] = f1_score(y_train,X_train_pred,average='weighted')
  X_test_pred = Knc.predict(X_test_tsf_pca)
  f1_score_test[n_neighbor] = f1_score(y_test,X_test_pred,average='weighted')



In [115]:
neighbors = list(range(1, 16))

# Create figure
fig = go.Figure()

# Add training F1 score line
fig.add_trace(go.Scatter(x=neighbors, y=list(f1_score_train.values()), mode='lines', name='Training F1 Score', line=dict(color='blue')))

# Add test F1 score line
fig.add_trace(go.Scatter(x=neighbors, y=list(f1_score_test.values()), mode='lines', name='Test F1 Score', line=dict(color='red')))

# Update layout
fig.update_layout(title='Training and Test F1 Scores Across Different Number of Neighbors',
                  xaxis_title='Number of Neighbors',
                  yaxis_title='F1 Score',
                  legend=dict(x=0, y=1, traceorder='normal'))

# Show figure
fig.show()

In [116]:
Knc = KNeighborsClassifier(n_neighbors=3)
Knc.fit(X_train_tsf_pca,y_train)
X_train_pred = Knc.predict(X_train_tsf_pca)
print('Traning F1 score: ',f1_score(y_train,X_train_pred,average='weighted'))
X_test_pred = Knc.predict(X_test_tsf_pca)
print('Testing F1 score: ',f1_score(y_test,X_test_pred,average='weighted'))

Traning F1 score:  0.9049865556592659
Testing F1 score:  0.8010542233595831


In [120]:
print(classification_report(y_train,X_train_pred))

              precision    recall  f1-score   support

           3       0.99      1.00      1.00      1426
           4       0.91      1.00      0.95      1388
           5       0.81      0.77      0.79      1504
           6       0.84      0.74      0.79      1968
           7       0.85      0.91      0.88      1403
           8       0.96      1.00      0.98      1393
           9       1.00      1.00      1.00      1381

    accuracy                           0.91     10463
   macro avg       0.91      0.92      0.91     10463
weighted avg       0.91      0.91      0.90     10463



In [119]:
print(classification_report(y_test,X_test_pred))

              precision    recall  f1-score   support

           3       0.97      1.00      0.99       574
           4       0.87      1.00      0.93       612
           5       0.60      0.57      0.59       624
           6       0.64      0.49      0.56       852
           7       0.71      0.74      0.73       597
           8       0.87      1.00      0.93       607
           9       1.00      1.00      1.00       619

    accuracy                           0.81      4485
   macro avg       0.81      0.83      0.82      4485
weighted avg       0.80      0.81      0.80      4485

