In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro

# Load the Wine Quality dataset
# The path is relative to the current working directory; replace it with the
# location of your own copy of the file.
wine_data = pd.read_csv('winequality-red.csv')

# Q1: Key features of the Wine Quality dataset
# Column labels exactly as they appear in the loaded file.
key_features = wine_data.columns

# Hand-written description of every column, one entry per line. The empty
# strings at both ends make the joined text open and close with a newline.
feature_importance = "\n".join((
    "",
    "The key features of the wine quality dataset include:",
    "1. Fixed Acidity: Affects the taste and overall quality. High acidity can make the wine taste sour.",
    "2. Volatile Acidity: High levels can indicate spoilage. It impacts the wine's flavor and quality.",
    "3. Citric Acid: Adds freshness to the wine and helps balance the acidity.",
    "4. Residual Sugar: Affects sweetness. Higher levels can lead to a sweeter taste.",
    "5. Chlorides: Contributes to saltiness. High levels can impact the flavor negatively.",
    "6. Free Sulfur Dioxide: Acts as a preservative. Helps prevent oxidation and spoilage.",
    "7. Total Sulfur Dioxide: Total amount of sulfur compounds. High levels may affect taste and quality.",
    "8. Density: Indicates the amount of dissolved substances. Higher density can affect the mouthfeel.",
    "9. pH: Measures acidity. The balance of pH impacts the wine's taste and preservation.",
    "10. Sulphates: Contributes to the wine's stability and taste.",
    "11. Alcohol: Influences the body and taste. Higher alcohol content can enhance flavors and balance acidity.",
    "12. Quality: The target variable representing the quality rating of the wine.",
    "",
))

# Heading and description go out in a single call, separated by a newline.
print("Q1: Key Features of the Wine Quality Dataset", feature_importance, sep="\n")

# Q2: Handling missing data and imputation techniques
# Report how many entries are missing in each column.
missing_data = wine_data.isnull().sum()
print("Missing Data:", missing_data, sep="\n")

# Mean imputation (used here as an example strategy): fit on the full frame,
# fill the gaps, then wrap the resulting matrix back into a DataFrame that
# carries the original column labels.
imputer = SimpleImputer(strategy='mean')
_imputed_matrix = imputer.fit_transform(wine_data)
wine_data_imputed = pd.DataFrame(_imputed_matrix, columns=wine_data.columns)

# Short comparison of common imputation strategies, one entry per line. The
# empty strings at both ends make the joined text open and close with a newline.
imputation_summary = "\n".join((
    "",
    "Different imputation techniques:",
    "1. Mean Imputation: Simple and works well if data is missing at random but can introduce bias.",
    "2. Median Imputation: Useful if data is skewed. Less sensitive to outliers.",
    "3. Mode Imputation: Suitable for categorical data. Replaces missing values with the most frequent value.",
    "4. K-Nearest Neighbors Imputation: Uses similar data points to impute missing values but can be computationally expensive.",
    "5. Predictive Imputation: Uses a model to predict missing values. More complex but can be more accurate.",
    "",
))

print("\nQ2: Handling Missing Data and Imputation Techniques", imputation_summary, sep="\n")

# Q3: Factors affecting students' performance
# Descriptive answer only; no data is loaded or computed for this question.
# The empty strings at both ends make the joined text open and close with a
# newline.
factors = "\n".join((
    "",
    "Key factors affecting students' performance include:",
    "1. Study Hours: More study time generally improves performance.",
    "2. Attendance: Regular attendance is often associated with better performance.",
    "3. Family Background: Parental support and education level can impact academic achievement.",
    "4. Health: Good health can lead to better focus and performance.",
    "5. Extracurricular Activities: Participation can either positively or negatively impact academic performance.",
    "6. Stress Levels: High stress can negatively affect performance.",
    "",
))

# Heading and body go out in a single call, separated by a newline.
print("\nQ3: Factors Affecting Students' Performance", factors, sep="\n")

# Q4: Feature engineering for student performance dataset
# Example: Transforming study hours into categorical bins
student_data = pd.read_csv('student_performance.csv')  # Replace with the path to your dataset

# Replace the 'Study Hours' column with five labelled 2-hour bands.
# NOTE(review): assumes 'Study Hours' holds numeric hours -- confirm against the CSV.
# pd.cut builds right-closed, left-open intervals by default, so without
# include_lowest=True a value of exactly 0 would fall outside the first bin
# and become NaN despite the '0-2' label. Values above 10 (or below 0) still
# map to NaN because they lie outside every bin.
student_data['Study Hours'] = pd.cut(
    student_data['Study Hours'],
    bins=[0, 2, 4, 6, 8, 10],
    labels=['0-2', '2-4', '4-6', '6-8', '8-10'],
    include_lowest=True,
)

print("\nQ4: Feature Engineering for Student Performance Dataset")
print(student_data.head())

# Q5: Exploratory Data Analysis (EDA) for the Wine Quality dataset
# Distribution of each feature: one histogram with a KDE overlay per column.
# The grid is derived from the number of columns rather than fixed at 4x3, so
# a frame with more than 12 columns no longer makes plt.subplot raise. For the
# 12-column case the layout (4 rows x 3 columns) and the 15x10 figure size are
# unchanged.
plot_cols = 3
plot_rows = max(1, (len(wine_data.columns) + plot_cols - 1) // plot_cols)  # ceiling division
plt.figure(figsize=(15, 2.5 * plot_rows))
for i, col in enumerate(wine_data.columns):
    plt.subplot(plot_rows, plot_cols, i + 1)  # subplot positions are 1-based
    sns.histplot(wine_data[col], kde=True)
    plt.title(col)
plt.tight_layout()
plt.show()

# Check normality using Shapiro-Wilk test
# Element [1] of the shapiro result is the p-value; missing entries are dropped
# before testing.
normality_tests = {col: shapiro(wine_data[col].dropna())[1] for col in wine_data.columns}
# A p-value below 0.05 is treated as evidence against normality.
non_normal_features = [col for col, p in normality_tests.items() if p < 0.05]

# The '{}' placeholder is filled with the list of flagged column names.
normality_summary = """
Features exhibiting non-normality:
{}
Possible transformations to improve normality:
1. Log Transformation: Useful for right-skewed distributions.
2. Square Root Transformation: Helps reduce right skewness.
3. Box-Cox Transformation: Suitable for various types of non-normality.
4. Yeo-Johnson Transformation: Handles both positive and negative skewness.
""".format(non_normal_features)

print("\nQ5: Exploratory Data Analysis (EDA) and Normality Testing")
print(normality_summary)

# Q6: Apply PCA for dimensionality reduction
# Standardise every column except the 'quality' target before running PCA.
_feature_frame = wine_data_imputed.drop('quality', axis=1)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(_feature_frame)

# Fit a PCA with no component limit and project the scaled data onto it.
pca = PCA()
pca_result = pca.fit_transform(scaled_data)
explained_variance_ratio = pca.explained_variance_ratio_

# Number of principal components to retain:
# take the running total of explained variance, find the first position where
# it reaches 0.95, and add 1 to turn that 0-based position into a count.
cumulative_variance = np.cumsum(explained_variance_ratio)
_meets_threshold = cumulative_variance >= 0.95
num_components = np.argmax(_meets_threshold) + 1

# The '{}' placeholder receives the component count computed above.
_pca_template = """
PCA Results:
- Minimum number of principal components needed to explain 95% of the variance: {}
"""
pca_summary = _pca_template.format(num_components)

print("\nQ6: PCA for Dimensionality Reduction", pca_summary, sep="\n")
