# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import joblib

In [2]:
# Load the raw crop-yield table from a CSV file expected in the working directory.
yield_data = pd.read_csv("yield_df.csv")

In [3]:
# Preview the first rows of the raw table.
yield_data.head()

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [4]:
# Summarise the column dtypes and non-null counts of the raw table.
yield_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28242 entries, 0 to 28241
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     28242 non-null  int64  
 1   Area                           28242 non-null  object 
 2   Item                           28242 non-null  object 
 3   Year                           28242 non-null  int64  
 4   hg/ha_yield                    28242 non-null  int64  
 5   average_rain_fall_mm_per_year  28242 non-null  float64
 6   pesticides_tonnes              28242 non-null  float64
 7   avg_temp                       28242 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 1.7+ MB


In [5]:
# Drop the 'Unnamed: 0' column (it appears to be a row index that was saved
# into the CSV). Rebinding the name instead of using inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: the in-place version
# raised KeyError on a second execution because the column was already gone.
yield_data = yield_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# List the column names currently present in the table.
yield_data.columns

Index(['Area', 'Item', 'Year', 'hg/ha_yield', 'average_rain_fall_mm_per_year',
       'pesticides_tonnes', 'avg_temp'],
      dtype='object')

In [7]:
# Give the columns descriptive names that include their units. rename() returns
# a new frame, so the table is rebound rather than mutated in place.
yield_data = yield_data.rename(columns={
    'Area': 'Country',
    'Item': 'Crop',
    'hg/ha_yield': 'Crop_yield(hg/ha)',
    'average_rain_fall_mm_per_year': 'Average_annual_rainfall(mm)',
    'pesticides_tonnes': 'Pesticides_used(tonnes)',
    'avg_temp': 'Average_temperature(degree C)',
})

# Keep the feature columns first and the yield column last (a later cell
# slices the last column off as the target). 'Year' is not listed here and is
# therefore dropped from the table.
columns = [
    'Country',
    'Crop',
    'Average_annual_rainfall(mm)',
    'Pesticides_used(tonnes)',
    'Average_temperature(degree C)',
    'Crop_yield(hg/ha)',
]
yield_data = yield_data[columns]

yield_data.head()

Unnamed: 0,Country,Crop,Average_annual_rainfall(mm),Pesticides_used(tonnes),Average_temperature(degree C),Crop_yield(hg/ha)
0,Albania,Maize,1485.0,121.0,16.37,36613
1,Albania,Potatoes,1485.0,121.0,16.37,66667
2,Albania,"Rice, paddy",1485.0,121.0,16.37,23333
3,Albania,Sorghum,1485.0,121.0,16.37,12500
4,Albania,Soybeans,1485.0,121.0,16.37,7000


In [8]:
# Display the renamed and reordered table.
yield_data

Unnamed: 0,Country,Crop,Average_annual_rainfall(mm),Pesticides_used(tonnes),Average_temperature(degree C),Crop_yield(hg/ha)
0,Albania,Maize,1485.0,121.00,16.37,36613
1,Albania,Potatoes,1485.0,121.00,16.37,66667
2,Albania,"Rice, paddy",1485.0,121.00,16.37,23333
3,Albania,Sorghum,1485.0,121.00,16.37,12500
4,Albania,Soybeans,1485.0,121.00,16.37,7000
...,...,...,...,...,...,...
28237,Zimbabwe,"Rice, paddy",657.0,2550.07,19.76,22581
28238,Zimbabwe,Sorghum,657.0,2550.07,19.76,3066
28239,Zimbabwe,Soybeans,657.0,2550.07,19.76,13142
28240,Zimbabwe,Sweet potatoes,657.0,2550.07,19.76,22222


### Counting the rows for each crop

In [9]:
# Number of rows per crop, most frequent first.
yield_data['Crop'].value_counts()

Crop
Potatoes                4276
Maize                   4121
Wheat                   3857
Rice, paddy             3388
Soybeans                3223
Sorghum                 3039
Sweet potatoes          2890
Cassava                 2045
Yams                     847
Plantains and others     556
Name: count, dtype: int64

In [10]:
# Features are every column except the last; the target is the last column
# (the crop yield). X is an explicit copy because a later cell assigns encoded
# values into its columns; writing into a bare slice of yield_data can raise
# pandas' SettingWithCopyWarning and makes it unclear which frame is modified.
X = yield_data.iloc[:, :-1].copy()
y = yield_data.iloc[:, -1]

### Encoding the Independent Variables

In [11]:
# Preview the feature matrix before the categorical columns are encoded.
X.head()

Unnamed: 0,Country,Crop,Average_annual_rainfall(mm),Pesticides_used(tonnes),Average_temperature(degree C)
0,Albania,Maize,1485.0,121.0,16.37
1,Albania,Potatoes,1485.0,121.0,16.37
2,Albania,"Rice, paddy",1485.0,121.0,16.37
3,Albania,Sorghum,1485.0,121.0,16.37
4,Albania,Soybeans,1485.0,121.0,16.37


In [12]:
# Integer-encode the two categorical features. The encoders are fitted on the
# untouched string columns of yield_data rather than on X: this cell overwrites
# X['Country'] / X['Crop'], so fitting on X would refit the encoders on
# already-encoded integers if the cell is run a second time, and the encoders
# saved later could no longer map country / crop names.
le_co = LabelEncoder()
X['Country'] = le_co.fit_transform(yield_data['Country'])

le_cr = LabelEncoder()
X['Crop'] = le_cr.fit_transform(yield_data['Crop'])

X.head()

Unnamed: 0,Country,Crop,Average_annual_rainfall(mm),Pesticides_used(tonnes),Average_temperature(degree C)
0,0,1,1485.0,121.0,16.37
1,0,3,1485.0,121.0,16.37
2,0,4,1485.0,121.0,16.37
3,0,5,1485.0,121.0,16.37
4,0,6,1485.0,121.0,16.37


# Model Testing And Evaluation

In [13]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Fit a random forest on the training split. random_state pins the forest's
# internal randomness so the metrics printed below are reproducible from run
# to run (the unseeded model gave slightly different scores every time).
rfc_reg = RandomForestRegressor(random_state=42)
rfc_reg.fit(X_train, y_train)

y_pred_rfc = rfc_reg.predict(X_test)

# R^2 on the held-out split, followed by the mean absolute error
# (in the same units as the target column).
print(f"Test set score: {r2_score(y_test, y_pred_rfc)}")
print(mean_absolute_error(y_test, y_pred_rfc))

Test set score: 0.9729016587627226
5654.610969820732


In [15]:
# Train a gradient-boosted model with default settings on the same split so it
# can be compared against the random forest.
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)

# Score on the held-out rows: R^2 first, then mean absolute error.
y_pred_xgb = xgb_reg.predict(X_test)
xgb_r2 = r2_score(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)

print(xgb_r2)
print(xgb_mae)

0.9659671783447266
8730.2119140625


## Re-training the Model on the entire dataset

In [16]:
# Refit the random forest on every row (train + test) before it is saved.
# After this call the held-out test rows have been seen by the model, so the
# earlier test-set metrics describe the previous fit, not this one.
rfc_reg.fit(X, y)
# Disabled: the XGBoost model is not refitted on the full data.
# xgb_reg.fit(X, y)

In [17]:
# NOTE: this scores the model on the same data it was just fitted on, so it is
# a training-set R^2 and not an estimate of performance on unseen data.
print(rfc_reg.score(X, y))

0.9964024587486324


# Saving the Model 

In [18]:
# Persist the fitted random forest, LZMA-compressed at level 3.
joblib.dump(
    rfc_reg,
    'rfc_model_c.joblib',
    compress=('lzma', 3),
)

# Persist both label encoders next to the model, uncompressed, so that the
# same string-to-integer mapping can be reapplied when the model is used.
for encoder, encoder_path in ((le_co, "country_le"), (le_cr, "crop_le")):
    with open(encoder_path, "wb") as f:
        joblib.dump(encoder, f)

### Exporting the Cleaned Data For 'app.py'

In [19]:
# Write the cleaned table (renamed columns, unencoded country / crop names)
# to CSV without the row index. The original line assigned `output` twice and
# rebuilt the frame from itself with its own columns; a plain copy is
# equivalent and keeps the `output` name available.
output = yield_data.copy()
output.to_csv('new_yield_df.csv', index=False)

- Push the saved model to the Hugging Face Hub

In [20]:
import os

from huggingface_hub import HfApi, login

# SECURITY: the access token must never be written into the notebook. A token
# that was previously hard-coded here should be treated as leaked and revoked
# in the Hugging Face account settings. The token is read from the HF_TOKEN
# environment variable instead.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face token "
        "with write access before running this cell."
    )
login(token=hf_token)

api = HfApi()

# Upload the compressed random-forest artifact to the model repository; the
# returned commit info is displayed as the cell output.
api.upload_file(
    path_or_fileobj="rfc_model_c.joblib",
    path_in_repo="rfc_model_c.joblib",
    repo_id="Sad1m/agric_advisor",
    repo_type="model",
    commit_message="Update RFC model after retraining",
)

rfc_model_c.joblib:   0%|          | 0.00/25.5M [00:00<?, ?B/s]

'(ProtocolError('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None)), '(Request ID: 00c98159-3883-45c0-9714-a20b829172ea)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/35/b1/35b102fc0c5394b1aa6ded26cb6d6825be16767ba6452517b4a28d3d3f3ae5ee/89b0cf99d754aba1240c78cb076ad3e6774ecf15c25d2807e7925ee361288d6e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250724%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250724T041809Z&X-Amz-Expires=86400&X-Amz-Signature=ca590aaa6627d07260ff16c781794ac625a7785a79a9d07c354a6e6dbec4ab93&X-Amz-SignedHeaders=host&partNumber=1&uploadId=M7xpPYwdkK73Or0NBGXb4dNxsDCLrihtSBpkgyyPAsJVz81epZz3yBfncTrwF_1sVoAds00WsbTDaEl0ApXkwajR_Z9Xm0Vj85pH.XhnZHhlStIHDjjRFWLNFZUjwJ5q&x-id=UploadPart
Retrying in 1s [Retry 1/5].


CommitInfo(commit_url='https://huggingface.co/Sad1m/agric_advisor/commit/badc63ac30e3d7d1d444d35ead9c151285636a38', commit_message='Update RFC model after retraining', commit_description='', oid='badc63ac30e3d7d1d444d35ead9c151285636a38', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sad1m/agric_advisor', endpoint='https://huggingface.co', repo_type='model', repo_id='Sad1m/agric_advisor'), pr_revision=None, pr_num=None)