In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import joblib

In [2]:
# Load the crop-yield dataset from CSV.
filename = '../../dataset/crops_dataset/yield_df.csv'
yield_dataframe = pd.read_csv(filename)

# Optional preview of the first rows (disabled).
# print(yield_dataframe.head())

# Show the column names, then the distinct crop names and the row count per crop.
print(
    yield_dataframe.columns,
    yield_dataframe['Item'].unique(),
    yield_dataframe['Item'].value_counts(),
    sep='\n',
)

# Shorten the 'Rice, paddy' label to 'Rice'. This runs after the prints above,
# so they still report the original label.
yield_dataframe['Item'] = yield_dataframe['Item'].replace({'Rice, paddy': 'Rice'})

# Work on a copy so the following steps do not modify the frame loaded above.
dataframe = yield_dataframe.copy()

Index(['Unnamed: 0', 'Area', 'Item', 'Year', 'hg/ha_yield',
       'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp'],
      dtype='object')
['Maize' 'Potatoes' 'Rice, paddy' 'Sorghum' 'Soybeans' 'Wheat' 'Cassava'
 'Sweet potatoes' 'Plantains and others' 'Yams']
Potatoes                4276
Maize                   4121
Wheat                   3857
Rice, paddy             3388
Soybeans                3223
Sorghum                 3039
Sweet potatoes          2890
Cassava                 2045
Yams                     847
Plantains and others     556
Name: Item, dtype: int64


In [3]:
# Dataset cleaning.
# The aim is a single general model, so the country a record comes from is
# treated as irrelevant.

# Optional cleaning / inspection steps, currently disabled:
# dataframe.drop(columns="Unnamed: 0", inplace=True)   # drop the index column
# dataframe.drop_duplicates(inplace=True)              # remove duplicate rows
# print(dataframe.isnull().sum())                      # count null values

# Show the available columns and their dtypes.
print(dataframe.columns, dataframe.dtypes, sep='\n')

# Frequency-encode the crop name: each 'Item' value is mapped to the number of
# rows that crop has in the frame, stored in a new 'crop_frequency' column.
# NOTE(review): the counts are taken over the full frame, before the
# train/test split performed in a later cell.
crop_counts = dataframe['Item'].value_counts()
item_frequency = crop_counts.to_dict()
dataframe['crop_frequency'] = dataframe['Item'].map(item_frequency)

Index(['Unnamed: 0', 'Area', 'Item', 'Year', 'hg/ha_yield',
       'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp'],
      dtype='object')
Unnamed: 0                         int64
Area                              object
Item                              object
Year                               int64
hg/ha_yield                        int64
average_rain_fall_mm_per_year    float64
pesticides_tonnes                float64
avg_temp                         float64
dtype: object


In [4]:
# Drop the columns that are not kept for modelling.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns have already been removed.
dataframe = dataframe.drop(columns=["Year", "Area", "Unnamed: 0"], errors="ignore")

# Confirm the remaining columns and the per-crop row counts.
print(dataframe["Item"].value_counts())
print(dataframe.columns)
# print(dataframe.head())
# dataframe.corr(numeric_only=True)

Potatoes                4276
Maize                   4121
Wheat                   3857
Rice                    3388
Soybeans                3223
Sorghum                 3039
Sweet potatoes          2890
Cassava                 2045
Yams                     847
Plantains and others     556
Name: Item, dtype: int64
Index(['Item', 'hg/ha_yield', 'average_rain_fall_mm_per_year',
       'pesticides_tonnes', 'avg_temp', 'crop_frequency'],
      dtype='object')


In [5]:
# --- Feature scaling, training and evaluation ---
# Features are standardized. Fitting the scaler on the whole dataset would let
# statistics of the test rows leak into training (data leakage) and make the
# evaluation overly optimistic, so the data is split first and the scaler is
# fitted on the training rows only.

# One seed shared by the split and the forest, so the two cannot drift apart.
RANDOM_STATE = 43

# Features and target.
X = dataframe[["crop_frequency", "pesticides_tonnes", "avg_temp", "average_rain_fall_mm_per_year"]]
Y = dataframe["hg/ha_yield"]

# The model is fitted on (x_train, y_train). To test it, it is asked to
# predict the output for x_test, and those predictions are then compared with
# the held-out y_test values.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling.
# fit:       the scaler computes the mean and standard deviation of the
#            training data.
# transform: the scaler applies z = (x - mean) / std using the statistics
#            already computed during fit; nothing is recomputed, which is why
#            the test set is only transformed, never fitted.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Model training.
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model.fit(x_train_scaled, y_train)

# Predict on the held-out, scaled test features.
y_pred = model.predict(x_test_scaled)

# Model evaluation.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error: 5696.93
R² Score: 0.97


In [6]:
# Persist the three objects produced above with joblib, each to its own
# pickle file in the current working directory.

# Crop name -> row-count mapping used to build the 'crop_frequency' feature.
joblib.dump(value=item_frequency, filename='crop_frequency_mapping.pkl')

# Scaler fitted on the training features.
joblib.dump(value=scaler, filename='scaler.pkl')

# Trained model. Kept as the last expression so the notebook displays the
# list of file names returned by joblib.dump.
joblib.dump(value=model, filename='crop_yield_model.pkl')


['crop_yield_model.pkl']