# In [None]:
# 📊 EDA and Correlation
import seaborn as sns
import matplotlib.pyplot as plt

# pandas is used below but no import of it is visible in this cell; import it
# here so the cell runs on its own (repeating an import is harmless).
import pandas as pd

# Load the encoded dataset and keep everything from the third column onward.
full_data = pd.read_csv('3_direct_encoded.csv')
data_v1 = full_data.iloc[:, 2:]  # Remove ID and Parameters columns

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data_v1.info()
print(data_v1.describe())

# Heatmap
# Draw the pairwise correlation matrix of data_v1 on a square 12x12 figure,
# annotating each cell with its value formatted to three decimals.
plt.figure(figsize=(12, 12))
correlation_matrix = data_v1.corr()
heatmap_axes = sns.heatmap(correlation_matrix, annot=True, fmt=".3f")
heatmap_axes.set_title("Correlation Heatmap")
plt.show()

# Pairplot
# Grid of pairwise relationships between the columns of data_v1.
pair_grid = sns.pairplot(data=data_v1)
plt.show()

# Boxplot
# One box per column of data_v1 on a wide figure; the x tick labels are
# turned vertical so the column names stay readable.
plt.figure(figsize=(14, 6))
boxplot_axes = sns.boxplot(data=data_v1)
plt.setp(boxplot_axes.get_xticklabels(), rotation=90)
boxplot_axes.set_title("Boxplot for Features")
plt.show()

# 📌 Mutual Information Regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Columns treated as targets; every other column of data_v1 is a feature.
target_vars = [
    "Seed Yield per Unit Area (SYUA)",
    "Sugars (Su)",
    "Protein Content (PCO)",
    "Number of Seeds per Pod (NSP)",
]

# Standardise every column (targets included), then rebuild a DataFrame so
# columns can still be selected by name. The original index is carried over
# so rows stay aligned with data_v1.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data_v1),
    columns=data_v1.columns,
    index=data_v1.index,
)
target_frame = data_scaled[target_vars]
features = data_scaled.drop(columns=target_vars)

# mutual_info_regression is randomised (it perturbs continuous variables with
# small noise), so without a fixed seed the scores -- and the feature ranking
# derived from them -- change from run to run.
MI_RANDOM_STATE = 42

# Maps each target name to an array of MI scores, one per column of
# `features`, in the same order as features.columns.
mi_scores = {}
for target in target_vars:
    mi = mutual_info_regression(
        features, data_scaled[target], random_state=MI_RANDOM_STATE
    )
    mi_scores[target] = mi

# Plot MI
# One horizontal bar chart per target, two panels per row. The number of rows
# is derived from the number of targets instead of assuming exactly four, and
# the figure height grows by 5 per row (4 targets -> 2x2 grid, 15x10 figure).
n_cols = 2
n_rows = (len(target_vars) + n_cols - 1) // n_cols
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()
for i, target in enumerate(target_vars):
    scores = np.asarray(mi_scores[target])
    # Indices of the features ordered from highest to lowest score.
    sorted_idx = np.argsort(scores)[::-1]
    axes[i].barh(features.columns[sorted_idx], scores[sorted_idx])
    axes[i].set_title(f"Mutual Info with {target}")
    # barh draws the first entry at the bottom; flip the axis so the
    # highest-scoring feature ends up at the top.
    axes[i].invert_yaxis()
# Hide any panel left over when the number of targets is odd.
for unused_ax in axes[len(target_vars):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
