Dummy Data With 5000 Entries (CSV File)

In [None]:
import pandas as pd
import random
import numpy as np

# Soil types and multiple crops
soil_types = ['Clay', 'Sandy', 'Loamy', 'Black', 'Red']
crop_choices = {
    'Clay': ['Wheat', 'Barley', 'Mustard', 'Lentils'],
    'Sandy': ['Maize', 'Peanut', 'Watermelon', 'Cucumber'],
    'Loamy': ['Rice', 'Sugarcane', 'Potato', 'Tomato'],
    'Black': ['Cotton', 'Soybean', 'Sunflower', 'Chickpeas'],
    'Red': ['Millet', 'Pulses', 'Sorghum', 'Groundnut']
}

data = []

for _ in range(5000):
    rainfall = round(random.uniform(200, 1200), 2)
    temperature = round(random.uniform(15, 40), 2)
    soil = random.choice(soil_types)
    land = round(random.uniform(0.5, 10), 2)

    # Yield logic
    base_yield = (rainfall * 0.01) + (temperature * 0.3) + (land * 2)
    noise = np.random.normal(0, 3)  # Random noise
    yield_tons = round(base_yield + noise, 2)

    crop = random.choice(crop_choices[soil])

    data.append([rainfall, temperature, soil, land, yield_tons, crop])

# Create DataFrame
df = pd.DataFrame(data, columns=['rainfall', 'temperature', 'soil_type', 'land_hectares', 'yield_tons', 'crop'])

# Save to CSV
df.to_csv('agri_dataset.csv', index=False)

df.head()


Unnamed: 0,rainfall,temperature,soil_type,land_hectares,yield_tons,crop
0,419.02,33.59,Sandy,7.81,28.09,Peanut
1,753.81,31.39,Clay,2.67,24.24,Lentils
2,825.97,36.15,Loamy,5.23,35.95,Tomato
3,682.27,39.66,Loamy,5.27,30.9,Potato
4,411.2,32.26,Red,1.52,18.24,Groundnut


Installing Requirements

In [None]:
!pip install pandas scikit-learn joblib




Data Preprocessing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("agri_dataset.csv")

# Encode soil type and crop
le_soil = LabelEncoder()
df['soil_type_encoded'] = le_soil.fit_transform(df['soil_type'])

le_crop = LabelEncoder()
df['crop_encoded'] = le_crop.fit_transform(df['crop'])

df.head()


Unnamed: 0,rainfall,temperature,soil_type,land_hectares,yield_tons,crop,soil_type_encoded,crop_encoded
0,419.02,33.59,Sandy,7.81,28.09,Peanut,4,9
1,753.81,31.39,Clay,2.67,24.24,Lentils,1,5
2,825.97,36.15,Loamy,5.23,35.95,Tomato,2,17
3,682.27,39.66,Loamy,5.27,30.9,Potato,2,10
4,411.2,32.26,Red,1.52,18.24,Groundnut,3,4


Training Regrssion Model (Randam Forest Regressor is used)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import numpy as np

# Features & target
X_yield = df[['rainfall', 'temperature', 'soil_type_encoded', 'land_hectares']]
y_yield = df['yield_tons']

# Split data
X_train_y, X_test_y, y_train_y, y_test_y = train_test_split(X_yield, y_yield, test_size=0.2, random_state=42)

# Train model
model_yield = RandomForestRegressor()
model_yield.fit(X_train_y, y_train_y)

# Evaluate
preds = model_yield.predict(X_test_y)
# Calculate RMSE manually using NumPy if 'squared' argument is unavailable
rmse = np.sqrt(mean_squared_error(y_test_y, preds))
print("Yield Model RMSE:", rmse)

# Save model
joblib.dump(model_yield, "yield_model.pkl")

Yield Model RMSE: 3.218494840469686


['yield_model.pkl']

Training Classification Model (Randam Forest Classifier is used)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features & target
X_crop = df[['rainfall', 'temperature', 'soil_type_encoded']]
y_crop = df['crop_encoded']

# Split data
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_crop, y_crop, test_size=0.2, random_state=42)

# Train model
model_crop = RandomForestClassifier()
model_crop.fit(X_train_c, y_train_c)

# Evaluate
preds_crop = model_crop.predict(X_test_c)
print("Crop Model Accuracy:", accuracy_score(y_test_c, preds_crop))

# Save model & label encoder
joblib.dump(model_crop, "crop_model.pkl")
joblib.dump(le_crop, "crop_label_encoder.pkl")
joblib.dump(le_soil, "soil_label_encoder.pkl")


Crop Model Accuracy: 0.253


['soil_label_encoder.pkl']

Final One

In [2]:
# Install required packages
!pip install pandas scikit-learn joblib

# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import joblib

# STEP 3: Load dataset
df = pd.read_csv('/content/agri_dataset.csv')

# Encode soil_type and crop
soil_encoder = LabelEncoder()
df['soil_encoded'] = soil_encoder.fit_transform(df['soil_type'])

crop_encoder = LabelEncoder()
df['crop_encoded'] = crop_encoder.fit_transform(df['crop'])

# Features and targets
features = ['rainfall', 'temperature', 'soil_encoded', 'land_hectares']

# For Yield Prediction
X_yield = df[features]
y_yield = df['yield_tons']

# For Crop Classification
X_crop = df[features]
y_crop = df['crop_encoded']

# Train-test split
X_yield_train, X_yield_test, y_yield_train, y_yield_test = train_test_split(X_yield, y_yield, test_size=0.2, random_state=42)
X_crop_train, X_crop_test, y_crop_train, y_crop_test = train_test_split(X_crop, y_crop, test_size=0.2, random_state=42)

# Train Models
yield_model = RandomForestRegressor(n_estimators=100, random_state=42)
yield_model.fit(X_yield_train, y_yield_train)

crop_model = RandomForestClassifier(n_estimators=100, random_state=42)
crop_model.fit(X_crop_train, y_crop_train)

# Save models and encoders
joblib.dump(yield_model, 'yield_model.pkl')
joblib.dump(crop_model, 'crop_model.pkl')
joblib.dump(soil_encoder, 'soil_label_encoder.pkl')
joblib.dump(crop_encoder, 'crop_label_encoder.pkl')

print(" All models and encoders saved successfully!")


 All models and encoders saved successfully!
