# Training Models


## Model 1: Crop prediction model

### Import modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings('ignore')
print("‚úÖ Imports ready!")


‚úÖ Imports ready!


In [None]:
import os

# Report where the kernel is running and what that directory contains.
working_dir = os.getcwd()
print(working_dir)
print(os.listdir(working_dir))


/content
['.config', 'crop_prediction_data.csv', 'sample_data']


In [None]:
import os

# Location the loading cell reads the crop dataset from.
CROP_DATA_PATH = "/content/crop_prediction_data.csv"

if os.path.exists(CROP_DATA_PATH):
    # The dataset is already on disk: skip the interactive widget so that
    # "Restart & Run All" does not stall waiting for a manual upload.
    uploaded = {}  # empty stand-in for the upload result
    print(f"Found {CROP_DATA_PATH}; skipping upload.")
else:
    # google.colab is only importable inside Colab, so import it lazily.
    # NOTE(review): assumes the upload lands in /content (the working
    # directory printed earlier in the notebook) -- confirm if cwd changes.
    from google.colab import files
    uploaded = files.upload()

Saving crop_prediction_data.csv to crop_prediction_data.csv


In [None]:
# Read the crop dataset and trim stray whitespace from the header names
CROP_CSV = "/content/crop_prediction_data.csv"
df = pd.read_csv(CROP_CSV)
df.columns = [name.strip() for name in df.columns]

n_rows, n_cols = df.shape
print("Columns in dataset:", list(df.columns))
print(f"There are {n_rows} data points(rows) and {n_cols} features(columns)")

# Preview the first rows as the cell output
df.head()

Columns in dataset: ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name', 'Crop', 'Area_ha', 'Yield_kg_per_ha', 'N_req_kg_per_ha', 'P_req_kg_per_ha', 'K_req_kg_per_ha', 'Total_N_kg', 'Total_P_kg', 'Total_K_kg', 'Temperature_C', 'Humidity_%', 'pH', 'Rainfall_mm', 'Wind_Speed_m_s', 'Solar_Radiation_MJ_m2_day']
There are 50765 data points(rows) and 20 features(columns)


Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,Crop,Area_ha,Yield_kg_per_ha,N_req_kg_per_ha,P_req_kg_per_ha,K_req_kg_per_ha,Total_N_kg,Total_P_kg,Total_K_kg,Temperature_C,Humidity_%,pH,Rainfall_mm,Wind_Speed_m_s,Solar_Radiation_MJ_m2_day
0,1,1966,14,Chhattisgarh,Durg,rice,548000.0,337.59,8.43975,4.05108,7.42698,4624983.0,2219991.84,4069985.04,25,80,6.5,1200,2.0,18
1,1,1966,14,Chhattisgarh,Durg,maize,3000.0,666.67,18.00009,8.00004,11.33339,54000.27,24000.12,34000.17,22,70,6.0,800,2.5,20
2,1,1966,14,Chhattisgarh,Durg,chickpea,54000.0,500.0,9.0,5.0,9.0,486000.0,270000.0,486000.0,20,60,6.5,600,1.5,16
3,1,1967,14,Chhattisgarh,Durg,rice,547000.0,747.71,18.69275,8.97252,16.44962,10224934.25,4907968.44,8997942.14,25,80,6.5,1200,2.0,18
4,1,1967,14,Chhattisgarh,Durg,maize,3000.0,1000.0,27.0,12.0,17.0,81000.0,36000.0,51000.0,22,70,6.0,800,2.5,20


In [None]:
# Distinct crop labels (the classes the model will predict), in order of first appearance
pd.unique(df['Crop'])

In [None]:
# TARGET: the label the model recommends
target_col = 'Crop'

# FEATURES: per-hectare N/P/K requirement columns + weather + pH.
# 'N_req_kg_per_ha' is used rather than 'Total_N_kg' because the totals depend on Area, which varies.
numeric_features = [
    'N_req_kg_per_ha',
    'P_req_kg_per_ha',
    'K_req_kg_per_ha',
    'Temperature_C',
    'Humidity_%',
    'pH',
    'Rainfall_mm'
]

# State is used as a coarse location feature.
categorical_features = ['State Name']

print(f"Numeric Features: {numeric_features}")
print(f"Categorical Features: {categorical_features}")

# Columns removed from X together with the target (yield, area and the area-scaled totals).
excluded_columns = ["Yield_kg_per_ha", "Total_N_kg", "Area_ha", "Total_P_kg", "Total_K_kg"]

# Fail fast with a readable message if the CSV schema does not contain
# the columns the preprocessing step selects by name.
required_columns = [target_col, *numeric_features, *categorical_features]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns: {missing_columns}")

# NOTE(review): in the rows previewed by df.head(), Temperature_C, Humidity_%, pH and
# Rainfall_mm take one fixed value per crop (e.g. both rice rows are 25 / 80 / 6.5 / 1200).
# If that holds for the whole file, these features identify the label directly and a
# perfect score says nothing about generalisation. TODO confirm with
# df.groupby(target_col)[numeric_features].nunique().

# Create X and y. X keeps the identifier columns too; only the columns listed in
# numeric_features / categorical_features are picked up by the ColumnTransformer.
X = df.drop(columns=[target_col, *excluded_columns])
y = df[target_col]

# Stratified 80/20 split so each crop keeps its share in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Data: {X_train.shape}")
print(f"Testing Data: {X_test.shape}")

# Show both cardinalities: only a cell's last expression is displayed, so the
# district count used to be computed and silently discarded.
X[['Dist Name', 'State Name']].nunique()

Numeric Features: ['N_req_kg_per_ha', 'P_req_kg_per_ha', 'K_req_kg_per_ha', 'Temperature_C', 'Humidity_%', 'pH', 'Rainfall_mm']
Categorical Features: ['State Name']
Training Data: (40612, 14)
Testing Data: (10153, 14)


20

In [None]:
# --- Numeric branch: fill gaps with the column mean, then standardise ---
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# --- Categorical branch: fill gaps with the most frequent value, then one-hot encode ---
# handle_unknown='ignore' lets prediction proceed on categories unseen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# --- Route each column group to its branch ---
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Preprocessing + random forest as one estimator ---
# Step names matter: the hyper-parameter grid addresses 'classifier__...'.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

print("‚úÖ Pipeline constructed!")


‚úÖ Pipeline constructed!


In [None]:
# Display the assembled pipeline object as this cell's output.
pipeline

In [None]:
# Baseline: fit the untuned pipeline and measure accuracy on the hold-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# The former pipeline.score(X_test, y_test) call was dropped: its value was never
# displayed (only the last expression is) and it repeated the test-set prediction.
# NOTE(review): the recorded result is 1.0 -- a perfect score on 10k rows usually
# means the features encode the label; verify before trusting this model.
accuracy_score(y_test, y_pred)

1.0

In [None]:
# Sanity check: none of these columns may remain in X (an empty set is the expected output).
forbidden_columns = {"Crop", "Yield_kg_per_ha", "Total_N_kg", "Total_P_kg", "Total_K_kg"}
forbidden_columns.intersection(X.columns)


set()

In [None]:
# Candidate settings for the random forest; the "classifier__" prefix
# addresses the pipeline step of that name.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

print("üîé Starting Grid Search...")

# Exhaustive search over the grid with 3-fold cross-validation on the training split only
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

best_params, best_cv_score = grid_search.best_params_, grid_search.best_score_
print(f"‚úÖ Best Params: {best_params}")
print(f"‚úÖ Best Score: {best_cv_score:.4f}")


üîé Starting Grid Search...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
‚úÖ Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
‚úÖ Best Score: 1.0000


In [None]:
# Take the winning estimator from the grid search and score it on the hold-out split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"üèÜ Test Accuracy: {test_accuracy:.2%}")

print("\nClassification Report:")
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

# Persist the whole pipeline (preprocessing + classifier) as one artifact
CROP_MODEL_PATH = "/content/crop_recommender_pipeline.pkl"
joblib.dump(best_model, CROP_MODEL_PATH)
print("‚úÖ Model saved successfully!")


üèÜ Test Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

    chickpea       1.00      1.00      1.00      2769
      cotton       1.00      1.00      1.00      1561
       maize       1.00      1.00      1.00      2827
        rice       1.00      1.00      1.00      2996

    accuracy                           1.00     10153
   macro avg       1.00      1.00      1.00     10153
weighted avg       1.00      1.00      1.00     10153

‚úÖ Model saved successfully!


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df_irri = pd.read_csv("/content/plant_vase1.csv")
df_irri.columns = df_irri.columns.str.strip()

# Target column, spelled exactly as this notebook reads it from the CSV.
irrigation_col = "irrgation"

# Accepted textual encodings of the boolean target -> 0/1.
# "1.0"/"0.0" cover the case where pandas parsed the column as float.
label_map = {"true": 1, "false": 0, "1": 1, "0": 0, "1.0": 1, "0.0": 0}

# Normalise to trimmed lower-case text before mapping so " True" / "FALSE" also match.
raw_labels = df_irri[irrigation_col].astype(str).str.strip().str.lower()
encoded_labels = raw_labels.map(label_map)

# Fail fast: an unmapped value would otherwise become NaN and only surface
# later as an obscure error inside the split or the classifier.
unmapped_values = raw_labels[encoded_labels.isna()].unique().tolist()
if unmapped_values:
    raise ValueError(
        f"Unrecognised values in '{irrigation_col}': {unmapped_values}"
    )

# target: stored as 0/1 integers
df_irri[irrigation_col] = encoded_labels.astype("int64")

features = ["moisture0","moisture1","moisture2","moisture3","moisture4"]
X = df_irri[features].copy()
y = df_irri[irrigation_col].copy()

# The recorded evaluation output of this notebook lists only class 0
# (support 824, confusion matrix [[824]]); make that situation visible
# here instead of reporting a meaningless perfect score downstream.
class_counts = y.value_counts()
if len(class_counts) < 2:
    print(
        f"WARNING: target '{irrigation_col}' contains a single class "
        f"({class_counts.to_dict()}); the trained model cannot learn a decision."
    )

# Stratified 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from scipy.stats import randint

# Preprocessing for the moisture readings: mean-impute, then standardise
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])
preprocessor = ColumnTransformer([("num", num_pipe, features)])

# Full estimator; the "clf" step name is what the search space keys refer to
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42)),
])

# Search space: scipy's randint(low, high) samples integers from low up to high - 1
param_dist = dict(
    clf__n_estimators=randint(50, 250),
    clf__max_depth=[None, 5, 10, 20],
    clf__min_samples_split=randint(2, 6),
    clf__min_samples_leaf=randint(1, 4),
)

# 20 random draws, each scored by F1 under 3-fold cross-validation; fit() returns the search object
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="f1",
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

best_model = search.best_estimator_
print("Best params:", search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 4, 'clf__n_estimators': 156}


In [None]:
# Hold-out evaluation of the tuned irrigation model
y_pred = best_model.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", holdout_accuracy)

# Per-class report first, then the confusion matrix
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test, y_pred))

# Persist the fitted pipeline; the dump call is the cell's last expression, so its result is displayed
IRRIGATION_MODEL_PATH = "/content/irrigation_model_pipeline.pkl"
joblib.dump(best_model, IRRIGATION_MODEL_PATH)


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       824

    accuracy                           1.00       824
   macro avg       1.00      1.00      1.00       824
weighted avg       1.00      1.00      1.00       824

[[824]]


['/content/irrigation_model_pipeline.pkl']