In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the red-wine quality CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/winequality-red.csv')

# Preview the top five records as a quick sanity check of columns and values.
print(data.head(n=5))


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [3]:
# Split the frame into the label column and the predictor columns.
y = data['quality']                   # label: the 'quality' column
X = data.drop(columns=['quality'])    # predictors: every remaining column


In [4]:
# Learn the scaling statistics from X, then apply them to the same data.
# The fitted scaler is kept so the identical transform can be reused later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [5]:
# Project the standardized features onto two principal components
# (two is chosen for simplicity).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Label each component's scores as a column and preview the first rows.
pca_df = pd.DataFrame({'PC1': X_pca[:, 0], 'PC2': X_pca[:, 1]})
print(pca_df.head(n=5))


        PC1       PC2
0 -1.619530  0.450950
1 -0.799170  1.856553
2 -0.748479  0.882039
3  2.357673 -0.269976
4 -1.619530  0.450950


In [6]:
# Report the share of variance captured by each retained component,
# followed by the combined share across all of them.
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {pca.explained_variance_ratio_.sum()}")


Explained variance ratio: [0.28173931 0.1750827 ]
Total explained variance: 0.4568220118429409


In [7]:
# Fit a random forest on the standardized features so that its
# feature_importances_ can be used to rank the input columns.
# The fixed random_state keeps the fitted forest, and therefore the ranking,
# reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# One importance score per input feature, in the column order of X_scaled.
feature_importance = model.feature_importances_

# Attach the original column names and order from most to least important.
feature_ranking = pd.Series(feature_importance, index=X.columns).sort_values(ascending=False)

# Interpolate the Series into the string instead of passing it as a second
# print() argument: print()'s default separator would insert a space right
# after "\n" and push the first row out of alignment with the rest.
print(f"Feature Ranking:\n{feature_ranking}")


Feature Ranking:
 alcohol                 0.151698
sulphates               0.112402
total sulfur dioxide    0.103722
volatile acidity        0.100606
density                 0.088905
chlorides               0.078535
fixed acidity           0.075535
pH                      0.075392
citric acid             0.074150
residual sugar          0.072882
free sulfur dioxide     0.066172
dtype: float64
