In [None]:
# 01_eda.ipynb — Exploratory Data Analysis for Smart Fertilizer Recommender


# ========================
# 1. Import Libraries
# ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Style settings: apply one seaborn theme (grid style, colour palette, font scale)
# to all figures drawn after this cell.
# `sns.set` is only an alias of `sns.set_theme`; seaborn documents `set_theme` as the
# preferred interface and notes that the alias may be removed in the future.
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.1)

In [None]:
# ========================
# 2. Load Dataset
# ========================

# Step 1: Locate the workbook. The path is relative to the working directory:
# it moves up one level from 'notebooks' to the project root, then into 'data/raw'.
RAW = Path('../data/raw/Smart_Fertilizer_Recommender_Dataset.xlsx')

# Step 2: Fail early with an actionable message. A relative path silently depends on
# where the kernel was started, so report the resolved location and the current
# working directory instead of the bare errno text read_excel would give.
if not RAW.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{RAW.resolve()}' "
        f"(current working directory: '{Path.cwd()}'). "
        "Run this notebook from the 'notebooks' folder or update RAW."
    )

# Step 3: Load the Excel file
df = pd.read_excel(RAW)

# Step 4: Confirm successful load (rows, columns)
print('✅ Data Loaded Successfully:', df.shape)

# Step 5: Display first few rows
df.head()


In [None]:
# ========================
# 3. Summary Statistics
# ========================
# include='all' asks describe() to report on every column, not just the numeric
# ones. The result is flipped so that each column of df becomes one row of the
# displayed table.
print('Summary Statistics:')
df.describe(include='all').T

In [None]:
# ========================
# 4. Check for Missing Values
# ========================
# Build the boolean null mask once; it feeds both the per-column counts and the heatmap.
null_mask = df.isnull()

# Number of missing entries in each column (all columns, including those with zero).
missing = null_mask.sum()
columns_with_missing = missing[missing > 0]

print('Missing Values per Column:')
if columns_with_missing.empty:
    # Printing an empty Series would only show its bare repr, and a heatmap of an
    # all-False mask is a single flat colour, so state the result explicitly instead.
    print('None - no column contains missing values.')
else:
    print(columns_with_missing)

    # One heatmap row per record and one column per feature, coloured by the null mask.
    plt.figure(figsize=(10,5))
    sns.heatmap(null_mask, cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.show()

In [None]:
# ========================
# 5. Univariate Analysis
# ========================

# Labels of every column with a numeric dtype; both plotting passes iterate over them.
numeric_cols = df.select_dtypes(include=np.number).columns

# Pass 1: one figure per numeric column, a 30-bin histogram with a KDE overlay.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

# Pass 2: one horizontal boxplot per numeric column, for spotting outliers.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()


In [None]:
# ========================
# 6. Bivariate Analysis
# ========================

# --- Correlation heatmap over the numeric columns only ---
fig, ax = plt.subplots(figsize=(10, 8))
numeric_df = df.select_dtypes(include=np.number)
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (Numerical Features)')
plt.show()


# --- Soil_pH distribution per Fertilizer_Type ---
# The plot is drawn only when both columns are present; otherwise a notice is printed.
if not {'Fertilizer_Type', 'Soil_pH'}.issubset(df.columns):
    print("Columns 'Fertilizer_Type' or 'Soil_pH' not found in the dataset.")
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df, x='Fertilizer_Type', y='Soil_pH', ax=ax)
    ax.set_title('Soil pH vs Fertilizer Type')
    plt.show()


# --- Nitrogen_Level against recommended quantity, coloured by Fertilizer_Type ---
# All three columns must be present for the scatter plot to be drawn.
if not {'Nitrogen_Level', 'Recommended_Quantity_kg_per_acre', 'Fertilizer_Type'}.issubset(df.columns):
    print("Some columns for this plot are missing in the dataset.")
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(
        data=df,
        x='Nitrogen_Level',
        y='Recommended_Quantity_kg_per_acre',
        hue='Fertilizer_Type',
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Nitrogen vs Fertilizer Quantity by Type')
    plt.show()


In [None]:
# ========================
# 7. Insights Summary
# ========================

# Checklist of questions to answer from the plots above, emitted as a heading
# followed by one bullet per line.
print(
    'Key Observations:',
    '- Check if soil pH clusters around neutral for certain fertilizer types.',
    '- Observe which nutrient (N, P, K) correlates most with fertilizer choice.',
    '- Note any extreme outliers in rainfall or nutrient levels that may need capping.',
    '- Examine feature correlations to decide which variables might be redundant.',
    sep='\n',
)