## LOADING DATASET, DOING LABEL ENCODING AND SPLITTING INTO TRAIN AND TEST DATASET

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# Read the raw crop records from disk.
data = pd.read_csv("Crop_Data.csv")

# One dedicated encoder per categorical column. They are kept as separate
# named objects because later cells reuse them to encode new inputs and to
# decode predictions, and they are saved alongside the trained model.
label_encoder_label = LabelEncoder()
label_encoder_country = LabelEncoder()
label_encoder_harvest_season = LabelEncoder()

# Replace each text column with its integer codes.
for column, encoder in (
    ('label', label_encoder_label),
    ('Country', label_encoder_country),
    ('harvest_season', label_encoder_harvest_season),
):
    data[column] = encoder.fit_transform(data[column])

# Target: the encoded harvest season. Features: every remaining column.
y_season = data['harvest_season']
X = data.drop(columns=['harvest_season'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_season_train, y_season_test = train_test_split(
    X, y_season, test_size=0.2, random_state=42
)

## STANDARDIZING THE TRAIN AND TEST DATASET

In [3]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transform to both splits so the test set never influences
# the scaler.
# NOTE: X_train / X_test are overwritten with their scaled versions, so
# re-running this cell without re-running the split would fit the scaler on
# already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## TRAINING A SUPPORT VECTOR CLASSIFIER ON THE DATASET, SAVING THE TRAINED MODEL (MODEL SERIALIZATION) AND PERFORMING EVALUATION

In [4]:
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from joblib import dump
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train the harvest-season classifier: an SVM with an RBF kernel and C=1.0
# (the same values the grid-search cell further down reports as best).
model_harvest = SVC(kernel='rbf', C=1.0)
model_harvest.fit(X_train, y_season_train)

# Predict on the held-out test split and score the predictions.
y_pred_harvest = model_harvest.predict(X_test)
accuracy_season = accuracy_score(y_season_test, y_pred_harvest)

# The target has several classes, so precision / recall / F1 are computed
# per class and combined with a support-weighted average.
precision_harvest = precision_score(y_season_test, y_pred_harvest, average='weighted')
recall_harvest = recall_score(y_season_test, y_pred_harvest, average='weighted')
f1_harvest = f1_score(y_season_test, y_pred_harvest, average='weighted')

print(f"Model Accuracy for harvest pred: {accuracy_season}")
print(f"Precision for crop pred model: {precision_harvest}")
print(f"Recall for crop pred model: {recall_harvest}")
print(f"F1 Score for crop pred model: {f1_harvest}")

# Per-class breakdown of the same metrics.
print("Classification Report:")
print(classification_report(y_season_test, y_pred_harvest))

# Persist the artefacts needed at inference time. Each output path is defined
# once and reused, so the file that is written and the file name that is
# reported can never drift apart.
harvest_model_filename = 'harvest_prediction_model.joblib'
scaler_filename = 'standard_scaler.joblib'

# The model is bundled with the three fitted encoders so new inputs can be
# encoded, and predictions decoded, exactly as during training.
dump(
    {
        'model': model_harvest,
        'label_encoder_label': label_encoder_label,
        'label_encoder_harvest_season': label_encoder_harvest_season,
        'label_encoder_country': label_encoder_country,
    },
    harvest_model_filename,
)
# The fitted scaler is stored in its own file.
dump(scaler, scaler_filename)

print(f"Model saved as {harvest_model_filename}")
print("Standard Scaler has also been saved as joblib file")

Model Accuracy for harvest pred: 0.9107142857142857
Precision for crop pred model: 0.926433601609658
Recall for crop pred model: 0.9107142857142857
F1 Score for crop pred model: 0.9081689020217822
Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       117
           1       1.00      1.00      1.00        20
           2       1.00      0.97      0.98        62
           3       1.00      0.72      0.83        81

    accuracy                           0.91       280
   macro avg       0.96      0.92      0.93       280
weighted avg       0.93      0.91      0.91       280

Model saved as harvest_prediction_model.joblib
Standard Scaler has also been saved as joblib file


## OPTIMIZING MODEL PERFORMANCE WITH A GRID SEARCH FOR THE HYPERPARAMETERS THAT GIVE THE HIGHEST ACCURACY SCORE

In [5]:
from sklearn.model_selection import GridSearchCV
# Candidate regularisation strengths and kernels; every combination is
# evaluated with 5-fold cross-validation on the training split, using all
# available CPU cores.
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_season_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so best_estimator_ is already trained and must not be
# fitted a second time.
best_svm_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split.
y_pred_grid = best_svm_model.predict(X_test)

accuracy_grid = accuracy_score(y_season_test, y_pred_grid)
print("Accuracy after Grid Search:", accuracy_grid)


Best Parameters: {'C': 1, 'kernel': 'rbf'}
Accuracy after Grid Search: 0.9107142857142857


#### After the best hyperparameters were found, we went back to the training cell above, set those values there, retrained the model with them, and saved the resulting model

## PERFORMING A QUICK TEST WITH CUSTOM INPUTS TO THE MODEL TO SEE HOW OUR MODEL PERFORMS

In [6]:
# Sample input values for a single harvest-season prediction
sample_input = {
    'temperature': 25.0,
    'humidity': 75.0,
    'ph': 7.0,
    'water_availability': 250.0,
    'label': 'rice',
    'country': 'Nigeria'
}

# Encode the categorical inputs with the encoders fitted on the training data
label_encoded = label_encoder_label.transform([sample_input['label']])[0]
country_encoded = label_encoder_country.transform([sample_input['country']])[0]

# Build the row under the training column names and order it by X.columns,
# so the feature order cannot silently drift from what the scaler and model
# were fitted on (a missing or misnamed column raises a KeyError instead).
sample_features = pd.DataFrame([{
    'temperature': sample_input['temperature'],
    'humidity': sample_input['humidity'],
    'ph': sample_input['ph'],
    'water availability': sample_input['water_availability'],
    'label': label_encoded,
    'Country': country_encoded,
}])[list(X.columns)]

# Standardize the sample with the scaler fitted on the training split
sample_input_scaled = scaler.transform(sample_features)

# Predict the encoded harvest season and map it back to its original name
harvest_season_prediction = model_harvest.predict(sample_input_scaled)[0]
predicted_harvest_season = label_encoder_harvest_season.inverse_transform([harvest_season_prediction])[0]

print(f"Best_Harvest_Season: {predicted_harvest_season}")


Best_Harvest_Season: rainy




In [7]:
# Number of distinct values in the harvest_season column, i.e. how many
# classes the classifier has to separate.
data["harvest_season"].nunique()

4

In [1]:
import pandas as pd
# Read the raw CSV and preview its first rows.
# NOTE: this rebinds `data`; if the encoding cell earlier in the notebook was
# run in the same kernel, its label-encoded frame is replaced by the raw
# (string-valued) one from here on.
data = pd.read_csv('Crop_Data.csv')
data.head()

Unnamed: 0,temperature,humidity,ph,water availability,harvest_season,label,Country
0,20.879744,82.002744,6.502985,202.935536,rainy,rice,Nigeria
1,21.770462,80.319644,7.038096,226.655537,rainy,rice,Nigeria
2,23.004459,82.320763,7.840207,263.964248,rainy,rice,Nigeria
3,26.491096,80.158363,6.980401,242.864034,rainy,rice,Nigeria
4,20.130175,81.604873,7.628473,262.71734,rainy,rice,Nigeria


In [2]:
# Non-null value count for each column (quick check for missing data).
data.count()

temperature           1400
humidity              1400
ph                    1400
water availability    1400
harvest_season        1400
label                 1400
Country               1400
dtype: int64

In [3]:
# Number of distinct column names in the frame.
data.columns.nunique()

7

In [4]:
# The distinct column names themselves.
data.columns.unique()

Index(['temperature', 'humidity', 'ph', 'water availability', 'harvest_season',
       'label', 'Country'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max); with the default
# arguments only the numeric columns are summarised.
data.describe()

Unnamed: 0,temperature,humidity,ph,water availability
count,1400.0,1400.0,1400.0,1400.0
mean,24.971621,64.611062,6.565246,91.784651
std,4.081622,22.753785,0.835101,58.682258
min,15.330426,14.25804,3.504752,20.211267
25%,22.178239,56.824217,6.068795,51.546542
50%,25.140245,68.288321,6.524478,72.379183
75%,27.963227,82.710409,7.042343,107.428334
max,36.977944,94.962187,9.935091,298.560117


In [7]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   temperature         1400 non-null   float64
 1   humidity            1400 non-null   float64
 2   ph                  1400 non-null   float64
 3   water availability  1400 non-null   float64
 4   harvest_season      1400 non-null   object 
 5   label               1400 non-null   object 
 6   Country             1400 non-null   object 
dtypes: float64(4), object(3)
memory usage: 76.7+ KB
