In [None]:
# ========================
# 1. Import Libraries
# ========================
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns


# Global seaborn styling applied to every figure drawn in this notebook
sns.set(style='whitegrid', palette='muted', font_scale=1.1)


# Paths
# The notebook is run from inside notebooks/ (the raw-data and model paths used
# further down both start with '..'), so both data folders are anchored one
# level up. A bare 'data/processed' would be created under notebooks/ instead
# of next to the raw data.
RAW = Path('../data/raw')
PROC = Path('../data/processed')
PROC.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists before any split is written

In [None]:
# ========================
# 2. Load Raw Dataset
# ========================

# Raw data lives one directory up because this notebook sits inside /notebooks
from pathlib import Path

RAW = Path('../data/raw')
file_path = RAW.joinpath('Smart_Fertilizer_Recommender_Dataset.xlsx')

# Read the Excel workbook into a DataFrame and report its dimensions
df = pd.read_excel(file_path)
print('✅ Initial Shape:', df.shape)

# Show the first rows as the cell output
df.head()


In [None]:
# ========================
# 3. Handle Missing Values & Data Cleaning
# ========================


# Report how many values are missing in each column before imputation
print('Missing Values per Column:')
print(df.isnull().sum())


# Numeric columns: fill gaps with the column median
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


def _fill_with_mode(col):
    """Fill missing entries of ``col`` with its most frequent value.

    A column with no non-missing values has no mode; it is returned unchanged
    instead of raising on the empty ``mode()`` result.
    """
    modes = col.mode()
    if modes.empty:
        return col
    # mode() is sorted, so ties resolve to the first (smallest) value
    return col.fillna(modes.iloc[0])


# Non-numeric columns: fill gaps with the most frequent value
cat_cols = df.select_dtypes(exclude=np.number).columns
df[cat_cols] = df[cat_cols].apply(_fill_with_mode)


# Report how many missing values are left in the whole frame
print('After Cleaning:')
print(df.isnull().sum().sum(), 'missing values remain')

In [None]:
# ========================
# 4. Feature Engineering
# ========================

def feature_engineering(df):
    """Return a copy of ``df`` with derived soil/climate features added.

    The input frame is not modified. Columns read: ``Nitrogen_Level``,
    ``Phosphorus_Level``, ``Potassium_Level``, ``Organic_Carbon``,
    ``Moisture_Content``, ``Temperature_C``, ``Rainfall_mm``, ``Soil_pH`` and
    ``Fertilizer_Type``.

    Columns added:
        NPK_Ratio: nitrogen divided by (phosphorus + potassium).
        Fertility_Index: mean of organic carbon and moisture content / 100.
        Temperature_Rainfall_Index: temperature divided by (rainfall + 1).
        pH_Category: ordered categorical with the bins [0, 5.5], (5.5, 6.5],
            (6.5, 7.5], (7.5, 14]; values outside 0-14 become NaN.

    ``Fertilizer_Type`` is rewritten so that 'Complex' becomes 'Mixed' and
    'MOP' becomes 'Potash'; all other values are kept.

    Raises:
        KeyError: if any of the columns listed above is missing.
    """
    df = df.copy()

    # Derived ratios and indices
    # The small epsilon / +1 offsets keep the denominators away from zero.
    df['NPK_Ratio'] = df['Nitrogen_Level'] / (df['Phosphorus_Level'] + df['Potassium_Level'] + 1e-6)
    df['Fertility_Index'] = (df['Organic_Carbon'] + (df['Moisture_Content'] / 100)) / 2
    df['Temperature_Rainfall_Index'] = df['Temperature_C'] / (df['Rainfall_mm'] + 1)

    # pH categorization
    # include_lowest=True closes the first bin on the left so a pH of exactly 0
    # is labelled 'Acidic' instead of silently turning into NaN.
    df['pH_Category'] = pd.cut(
        df['Soil_pH'],
        bins=[0, 5.5, 6.5, 7.5, 14],
        labels=['Acidic', 'Slightly_Acidic', 'Neutral', 'Alkaline'],
        include_lowest=True
    )

    # Simplify fertilizer categories by merging aliases into a single label
    df['Fertilizer_Type'] = df['Fertilizer_Type'].replace({
        'Complex': 'Mixed',
        'MOP': 'Potash'
    })

    return df


# Run the feature-engineering step and rebind df to the enriched copy
df = feature_engineering(df)
n_columns = len(df.columns)
print('✅ Feature Engineering Complete. Columns:', n_columns)
df.head()


In [None]:
# ========================
# 5. Encoding Categorical Variables
# ========================

from sklearn.preprocessing import LabelEncoder

# Columns whose labels are replaced by integer codes
categorical_cols = ['Crop_Type', 'Region', 'Soil_Type', 'pH_Category', 'Application_Timing']

# One fitted encoder per column, keyed by column name (saved to disk later in the notebook)
label_encoders = {}

# NOTE: df is overwritten in place, so re-running this cell would fit the
# encoders on the stringified integer codes rather than the original labels.
for col in categorical_cols:
    encoder = LabelEncoder()
    as_text = df[col].astype(str)  # cast first so every value is a plain string
    df[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder

print('✅ Categorical Encoding Complete.')
df[categorical_cols].head()


In [None]:
# ========================
# 6. Feature Scaling
# ========================


scaler = StandardScaler()

# Standardize the continuous features only.
# - The target 'Recommended_Quantity_kg_per_acre' stays in its original units.
# - The label-encoded columns in categorical_cols are numeric by now, but their
#   integer codes are arbitrary class ids; z-scoring them would turn them into
#   floats that label_encoders can no longer decode.
num_cols = df.select_dtypes(include=np.number).columns.drop(['Recommended_Quantity_kg_per_acre'])
num_cols = num_cols[~num_cols.isin(categorical_cols)]  # boolean mask keeps the original column order

# NOTE(review): the scaler is fitted on the full frame before the train/val/test
# split below, so validation/test statistics leak into the scaling parameters.
# Fitting on the training rows only would require moving the split ahead of this cell.
df[num_cols] = scaler.fit_transform(df[num_cols])
print('Feature Scaling Complete.')

In [None]:
# ========================
# 7. Split Train/Validation/Test Sets
# ========================


# Hold out 15% as the test set, then 15% of the remainder as validation.
# The same seed is used for both cuts so the partitions are reproducible.
split_kwargs = dict(test_size=0.15, random_state=42)
train, test = train_test_split(df, **split_kwargs)
train, val = train_test_split(train, **split_kwargs)


# Write each partition to the processed-data folder without the index column
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    split_frame.to_csv(PROC / f'{split_name}.csv', index=False)


print('Data Splitting Complete:')
print('Train:', train.shape, '| Val:', val.shape, '| Test:', test.shape)

In [None]:
# ========================
# 8. Feature Importance Exploration (Optional)
# ========================


from sklearn.ensemble import RandomForestRegressor


# Predictors are every training column except the target and Fertilizer_Type
y = train['Recommended_Quantity_kg_per_acre']
X = train.drop(columns=['Recommended_Quantity_kg_per_acre', 'Fertilizer_Type'])


# Fit a seeded random forest purely to read off its feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)


# Rank features from most to least important and keep the top 15 for plotting
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_features = importances.head(15)


fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index, ax=ax)
ax.set_title('Top 15 Feature Importances')
plt.show()

In [None]:
# ========================
# 9. Save Preprocessing Artifacts
# ========================
import joblib
from pathlib import Path

# Output folder for fitted preprocessing objects; '../' because the notebook
# is expected to sit inside 'notebooks'. Created on demand.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted scaler and the per-column label encoders, one file each
artifacts = {
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, models_dir / filename)

print(f'✅ Preprocessing artifacts saved in: {models_dir.resolve()}')
