# EDA — Soil & Fertilizer

Run the cells in order. Each cell is one step.

## Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Relative location of the raw CSV; it is resolved against the working
# directory of the running kernel, not against this file.
# NOTE(review): the path repeats `data/` ("../data/data/raw/...") — confirm
# this matches the repository layout.
DATA_PATH = "../data/data/raw/data_core.csv"

def load_data(path):
    """Read the CSV at *path* into a pandas DataFrame.

    Args:
        path: Anything ``pd.read_csv`` accepts as its first argument
            (a file path or an open file-like object).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)

# Load the dataset once; every later cell reads this module-level `df`.
df = load_data(DATA_PATH)
# Echo the (rows, columns) shape as a quick check that the read succeeded.
print("Data loaded.", df.shape)

## 1. Dataset Overview

In [None]:
def dataset_overview(df):
    """Print a quick structural summary of *df* to stdout.

    Sections, in order: shape, column names, per-column dtypes, and the
    first rows as returned by ``df.head()``.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        None. The summary is written with ``print``.
    """
    # The section markers were stored as mis-decoded UTF-8 ("ðŸ”¹"); they are
    # restored to the intended small-blue-diamond bullet.
    print(f"\n🔹 Dataset Shape: {df.shape}")

    sections = (
        ("\n🔹 Column Names:", df.columns.tolist()),
        ("\n🔹 Data Types:", df.dtypes),
        ("\n🔹 Sample Data:", df.head()),
    )
    for heading, content in sections:
        print(heading)
        print(content)

# Print the overview for the DataFrame loaded in the "Load Data" cell.
dataset_overview(df)

## 2. Data Quality Checks

In [None]:
def data_quality_checks(df):
    """Print basic data-quality diagnostics for *df* to stdout.

    Sections, in order: null count per column, number of fully duplicated
    rows, and the ``df.describe()`` summary table.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The diagnostics are written with ``print``.
    """
    # The section markers were stored as mis-decoded UTF-8 ("ðŸ”¹"); they are
    # restored to the intended small-blue-diamond bullet.
    missing_per_column = df.isnull().sum()
    print("\n🔹 Missing Values:")
    print(missing_per_column)

    # duplicated() flags every repeat after the first occurrence, so the sum
    # is the number of rows that would be dropped by drop_duplicates().
    duplicate_count = df.duplicated().sum()
    print("\n🔹 Duplicate Rows:", duplicate_count)

    print("\n🔹 Statistical Summary:")
    print(df.describe())

# Run the quality checks on the DataFrame loaded in the "Load Data" cell.
data_quality_checks(df)

## 3. Categorical Analysis

In [None]:
def categorical_analysis(df):
    """Print the value counts of each categorical column of *df*.

    The columns reported are "Soil Type", "Crop Type" and "Fertilizer Name",
    in that order.

    Args:
        df: A pandas DataFrame containing the three columns above.

    Returns:
        None. The counts are written with ``print``.

    Raises:
        KeyError: If any of the three columns is missing from *df*.
    """
    # The section marker was stored as mis-decoded UTF-8 ("ðŸ”¹"); it is
    # restored to the intended small-blue-diamond bullet.
    for column in ("Soil Type", "Crop Type", "Fertilizer Name"):
        print(f"\n🔹 Value counts for {column}:")
        print(df[column].value_counts())

# Print category frequencies for the DataFrame loaded in the "Load Data" cell.
categorical_analysis(df)

## 4. Numerical Distributions

In [None]:
def numerical_distribution(df):
    """Draw one histogram per numeric feature of *df* and show the figure.

    Args:
        df: A pandas DataFrame containing the six feature columns listed
            in the body.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # NOTE(review): assumes the CSV headers match these names exactly
    # (including the "Phosphorous" spelling) — a mismatch raises KeyError.
    features = [
        "Temperature",
        "Humidity",
        "Moisture",
        "Nitrogen",
        "Potassium",
        "Phosphorous",
    ]
    selected = df[features]
    selected.hist(bins=20, figsize=(12, 8))

    plt.suptitle("Numerical Feature Distributions", fontsize=14)
    plt.tight_layout()
    plt.show()

# Plot the feature histograms for the DataFrame loaded in the "Load Data" cell.
numerical_distribution(df)

## 5. Correlation Analysis

In [None]:
def correlation_analysis(df):
    """Show an annotated heatmap of the correlations between numeric columns.

    Only columns whose dtype is ``float64`` or ``int64`` are included; every
    other column is left out of the correlation matrix.

    Args:
        df: The pandas DataFrame to analyse.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    numeric_only = df.select_dtypes(include=["float64", "int64"])

    plt.figure(figsize=(10, 6))
    corr_matrix = numeric_only.corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.tight_layout()
    plt.show()

# Draw the correlation heatmap for the DataFrame loaded in the "Load Data" cell.
correlation_analysis(df)