# PCA Analysis Notebook
This notebook performs PCA on the `StudentPerformanceFactors.csv` dataset and identifies the top contributing features to each principal component.

In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# 1. Load the data
# NOTE(review): the notebook intro names `StudentPerformanceFactors.csv`, but
# this cell reads "original_dataset.csv" — confirm both refer to the same file.
df = pd.read_csv("original_dataset.csv")
# 2. Separate features and target
# `Exam_Score` is the target; every remaining column is kept as a feature.
y = df["Exam_Score"]
X = df.drop(columns=["Exam_Score"])

In [4]:
# 3. One-hot encode categorical variables
# Nothing is dropped wholesale here: each categorical column is expanded into
# indicator columns named `<column>_<level>`. `drop_first=True` omits the first
# level of every categorical column so the remaining indicators are not
# perfectly redundant with each other.
X = pd.get_dummies(X, drop_first=True)

In [5]:
# 4. Standardize
# Learn per-column mean and standard deviation from the encoded features,
# then rescale so all columns are on a comparable scale before PCA.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [6]:
# 5. Fit PCA to explain 95% variance
# A fractional `n_components` asks PCA to keep as many leading components as
# are needed to reach that share of the total variance (0.95 here), so the
# number of components is decided by the data rather than fixed up front.
# `random_state` is pinned so repeated runs give the same fit.
pca = PCA(n_components=0.95, random_state=0)
# `X_pca` holds the component scores: one row per sample, one column per retained PC.
X_pca = pca.fit_transform(X_scaled)

In [7]:
# 6. Print PCA summary
# Report how many components were retained, the share of variance carried by
# each one, and the total share captured by all retained components.
ratios = pca.explained_variance_ratio_
print(f"Number of components selected: {pca.n_components_}\n")
print("Explained variance ratio per component:")
for idx, ratio in enumerate(ratios, start=1):
    print(f"  PC{idx}: {ratio:.3f}")
# The last entry of the running total is the overall variance explained.
cum_var = ratios.cumsum()[-1]
print(f"\nCumulative variance explained: {cum_var:.3f}\n")

Number of components selected: 23

Explained variance ratio per component:
  PC1: 0.067
  PC2: 0.063
  PC3: 0.062
  PC4: 0.061
  PC5: 0.057
  PC6: 0.055
  PC7: 0.055
  PC8: 0.051
  PC9: 0.039
  PC10: 0.039
  PC11: 0.038
  PC12: 0.038
  PC13: 0.037
  PC14: 0.037
  PC15: 0.037
  PC16: 0.036
  PC17: 0.036
  PC18: 0.035
  PC19: 0.035
  PC20: 0.022
  PC21: 0.019
  PC22: 0.018
  PC23: 0.018

Cumulative variance explained: 0.955



In [8]:
# 7. Feature loadings
# Build a feature-by-component table: rows are the encoded feature names,
# columns are the retained components (PC1..PCk), and each cell is that
# feature's weight in the component.
pc_labels = [f"PC{k}" for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(data=pca.components_.T, index=X.columns, columns=pc_labels)
print("Feature loadings (first 10 features):")
print(loadings.head(10), "\n")

Feature loadings (first 10 features):
                                  PC1       PC2       PC3       PC4       PC5  \
Hours_Studied                0.015002 -0.008046 -0.021822  0.021916 -0.023534   
Attendance                  -0.024803  0.057647  0.000062 -0.022215  0.024413   
Sleep_Hours                  0.004139  0.021773 -0.010846 -0.031394 -0.014157   
Previous_Scores             -0.018759  0.018525  0.003571 -0.031488  0.047737   
Tutoring_Sessions           -0.020172 -0.006416  0.020828 -0.005635  0.027404   
Physical_Activity           -0.004973 -0.037964 -0.001548  0.008778  0.022944   
Parental_Involvement_Low     0.045945  0.149656 -0.007129  0.138292  0.442994   
Parental_Involvement_Medium -0.054078 -0.134979 -0.001368 -0.137417 -0.418217   
Access_to_Resources_Low      0.017803  0.011887  0.051499 -0.128346 -0.470562   
Access_to_Resources_Medium  -0.027359 -0.042768 -0.052588  0.110561  0.472222   

                                  PC6       PC7       PC8       PC9   

In [9]:
# 8. Correlation with target
# Stack the PC scores (one row per component) with the target and compute the
# full correlation matrix. Its last row is the target against every variable;
# dropping the final entry removes the target-vs-itself term.
corr_matrix = np.corrcoef(X_pca.T, y)
corrs = corr_matrix[-1, :-1]
corr_df = pd.Series(corrs, index=loadings.columns)
print("Correlation of each PC with Exam_Score:")
# Rank by magnitude so strong negative correlations are not pushed to the bottom.
ranked = corr_df.sort_values(key=pd.Series.abs, ascending=False)
print(ranked, "\n")

Correlation of each PC with Exam_Score:
PC16    0.484806
PC10   -0.311319
PC15    0.281964
PC18    0.261738
PC23   -0.220699
PC17   -0.160289
PC19    0.150726
PC12   -0.148232
PC11   -0.141386
PC9     0.137818
PC22   -0.112298
PC14    0.109254
PC7     0.098727
PC20   -0.084925
PC4     0.084360
PC6     0.078936
PC1    -0.076544
PC3    -0.034083
PC5     0.033313
PC2    -0.023827
PC21   -0.016470
PC13    0.007112
PC8     0.002482
dtype: float64 



In [10]:
# 9. Top 3 features per PC
# For every component, rank the features by absolute loading (sign is ignored)
# and keep the names of the three largest, strongest first.
abs_loadings = loadings.abs()
top3 = {
    pc: abs_loadings[pc].sort_values(ascending=False).index[:3].tolist()
    for pc in loadings.columns
}
# One row per component, one column per rank.
rank_labels = ["Top1", "Top2", "Top3"]
top3_df = pd.DataFrame.from_dict(top3, orient="index", columns=rank_labels)
print("Top 3 features contributing to each principal component:")
print(top3_df)

Top 3 features contributing to each principal component:
                                       Top1  \
PC1             Distance_from_Home_Moderate   
PC2                       Family_Income_Low   
PC3                    Motivation_Level_Low   
PC4                 Peer_Influence_Positive   
PC5              Access_to_Resources_Medium   
PC6   Parental_Education_Level_Postgraduate   
PC7             Parental_Involvement_Medium   
PC8                  Teacher_Quality_Medium   
PC9                              Attendance   
PC10              Learning_Disabilities_Yes   
PC11                     School_Type_Public   
PC12                    Internet_Access_Yes   
PC13                      Physical_Activity   
PC14                            Sleep_Hours   
PC15                      Tutoring_Sessions   
PC16                      Tutoring_Sessions   
PC17                    Internet_Access_Yes   
PC18                             Attendance   
PC19                        Previous_Scores   
PC2