# 01 – Preprocess Sustainable Agriculture Data

> Kaggle → Jupyter → Clean Artifacts

In [None]:
# If running for the first time, ensure required libs are installed in your env:
# !pip install -r ../requirements.txt

## 1) Imports

In [5]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

RAW_DIR = Path("../data/raw")
PROC_DIR = Path("../data/processed")
PROC_DIR.mkdir(parents=True, exist_ok=True)

pd.set_option("display.max_columns", 100)

## 2) Download from Kaggle

In [2]:

KAGGLE_DATASET = "SupravaTalukdar/crop-production-in-india"

if KAGGLE_DATASET != "SupravaTalukdar/crop-production-in-india":
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi(); api.authenticate()
    api.dataset_download_files(KAGGLE_DATASET, path=str(RAW_DIR), unzip=True)
    print("Downloaded to", RAW_DIR.resolve())
else:
    print("⚠️ Update KAGGLE_DATASET before running this cell.")

⚠️ Update KAGGLE_DATASET before running this cell.


## 3) Load raw CSV(s)

In [7]:
import os

# 1. Where is Jupyter running from?
print("Current working directory:", os.getcwd())

# 2. What files/folders exist here?
print("Files in current directory:", os.listdir())


Current working directory: C:\Users\Suprava\Downloads\sustainable-agri-starter\sustainable-agri-preprocess\notebooks
Files in current directory: ['.gitkeep', '.ipynb_checkpoints', '01_preprocess.ipynb']


In [16]:
import pandas as pd

# Use raw string (r"...") so Windows backslashes don't break
raw_csv_path = r"C:\Users\Suprava\Downloads\sustainable-agri-starter\sustainable-agri-preprocess\data\raw\crop_production.csv"

df = pd.read_csv(raw_csv_path)

print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (246091, 7)


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


## 4) Basic audits

In [17]:
if 'df' in globals():
    # 🔹 Quick numeric summary
    display(df.describe(include='number').T)

    # 🔹 Missing values check
    na_rate = df.isna().mean().sort_values(ascending=False)
    display(na_rate[na_rate > 0])
    
    # 🔹 Unique values in categorical columns
    for col in ['State_Name', 'District_Name', 'Season', 'Crop']:
        print(f"\nUnique values in {col}: {df[col].nunique()}")


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Crop_Year,246091.0,2005.643018,4.952164,1997.0,2002.0,2006.0,2010.0,2015.0
Area,246091.0,12002.820864,50523.4,0.04,80.0,582.0,4392.0,8580100.0
Production,242361.0,582503.442251,17065810.0,0.0,88.0,729.0,7023.0,1250800000.0


Production    0.015157
dtype: float64


Unique values in State_Name: 33

Unique values in District_Name: 646

Unique values in Season: 6

Unique values in Crop: 124


In [19]:
import os

# Ensure processed folder exists
processed_path = r"C:\Users\Suprava\Downloads\sustainable-agri-starter\sustainable-agri-preprocess\data\processed"
os.makedirs(processed_path, exist_ok=True)

# Save cleaned dataset
clean_file = os.path.join(processed_path, "crop_production_clean.csv")
df.to_csv(clean_file, index=False)

print(f"✅ Clean dataset saved at: {clean_file}")


✅ Clean dataset saved at: C:\Users\Suprava\Downloads\sustainable-agri-starter\sustainable-agri-preprocess\data\processed\crop_production_clean.csv


## 5) Feature selection + preprocessing pipeline

In [None]:
if 'df' in globals():
    # Example: select target and features (customize!)
    # target = "yield"  # e.g., crop yield
    # features = [c for c in df.columns if c != target]
    target = None  # <-- set this
    features = []  # <-- set this

    if target is None or not features:
        print("⚠️ Set `target` and `features` according to your dataset.")
    else:
        X = df[features].copy()
        y = df[target].copy()

        num_cols = X.select_dtypes(include=['number']).columns.tolist()
        cat_cols = X.select_dtypes(exclude=['number']).columns.tolist()

        numeric_pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])

        categorical_pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore"))
        ])

        pre = ColumnTransformer(
            transformers=[
                ("num", numeric_pipe, num_cols),
                ("cat", categorical_pipe, cat_cols),
            ]
        )

        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
        X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        print(f"Train: {X_train.shape}, Valid: {X_valid.shape}, Test: {X_test.shape}")

## 6) Fit transform and persist processed artifacts

In [None]:
from sklearn.pipeline import Pipeline
import joblib

if 'df' in globals() and 'pre' in globals() and 'X_train' in globals():
    pre.fit(X_train)
    Xt_train = pre.transform(X_train)
    Xt_valid = pre.transform(X_valid)
    Xt_test  = pre.transform(X_test)

    # Save processed splits as CSV (dense)
    # If OHE expands many columns, consider saving as parquet or sparse matrices.
    proc_train = pd.DataFrame(Xt_train.todense() if hasattr(Xt_train, "todense") else Xt_train)
    proc_valid = pd.DataFrame(Xt_valid.todense() if hasattr(Xt_valid, "todense") else Xt_valid)
    proc_test  = pd.DataFrame(Xt_test.todense()  if hasattr(Xt_test, "todense")  else Xt_test)

    proc_train['target'] = y_train.to_numpy()
    proc_valid['target'] = y_valid.to_numpy()
    proc_test['target']  = y_test.to_numpy()

    proc_train.to_csv(PROC_DIR / "train.csv", index=False)
    proc_valid.to_csv(PROC_DIR / "valid.csv", index=False)
    proc_test.to_csv(PROC_DIR / "test.csv", index=False)

    # Persist the preprocessor for reuse
    joblib.dump(pre, PROC_DIR / "preprocessor.joblib")
    print("Saved processed files to:", PROC_DIR.resolve())
else:
    print("⚠️ Ensure previous cells are configured and run.")