In [2]:
# Google colab link for the report: https://colab.research.google.com/drive/1T9ZKrF6FeLyR1dkLdHS-dyFvR-rxl46F?usp=sharing
# url = 'https://github.com/SeivenBell/Data_science_tools/blob/main/Assignment_2/yield_prediction.csv'

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the yield-prediction CSV; the relative path is resolved against the
# notebook's current working directory.
data_path = "yield_prediction.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows to get a feel for the columns and their values
data.head(5)

Unnamed: 0,id,water,uv,area,fertilizer_usage,yield,pesticides,region,categories
0,0,0.072,80.179,9.414,0,29.878,2.231,6,c
1,1,5.413,58.359,9.681,3,53.416,1.81,6,c
2,2,9.731,78.506,7.189,1,63.391,2.455,1,d
3,3,10.995,69.248,1.738,3,17.984,0.603,2,a
4,4,2.617,87.658,9.706,1,49.768,2.91,6,c


In [3]:
# Drop the 'id' column: it is only a row identifier and is unlikely to help prediction.
#
# The result is reassigned (rather than using inplace=True) and errors="ignore" is
# passed so that this cell is idempotent: re-running it after 'id' has already been
# removed is a no-op instead of raising KeyError.
data = data.drop(columns="id", errors="ignore")

# Count missing values per column
missing_data = data.isnull().sum()

missing_data

water               42
uv                   0
area                 0
fertilizer_usage     0
yield                0
pesticides           0
region               0
categories           0
dtype: int64

The dataset has 42 missing values in the water column. We'll handle these missing values by imputing them with the median of the water column, since the median is a robust statistic that is less affected by outliers than the mean.

After addressing the missing data, we will identify and handle outliers in the numerical features using the Interquartile Range (IQR) method: values below $Q_1 - 1.5 \cdot \mathrm{IQR}$ or above $Q_3 + 1.5 \cdot \mathrm{IQR}$ are treated as outliers and capped at those bounds.
Reference: https://www.analyticsvidhya.com/blog/2021/05/detecting-and-treating-outliers-treating-the-odd-one-out/

In [4]:
# Impute missing values in the 'water' column with its median value
water_median = data["water"].median()
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on data["water"]. The in-place form operates on an
# intermediate object (chained assignment): it emits a FutureWarning on recent
# pandas 2.x and no longer updates `data` when Copy-on-Write is enabled
# (the default from pandas 3.0).
data["water"] = data["water"].fillna(water_median)


# Identify and handle outliers using the IQR method for numerical columns.
# NOTE(review): the median above and the IQR bounds below are computed on the full
# dataset, before the train/validation/test split performed later in the notebook,
# and the target column 'yield' is capped as well. Consider deriving these
# statistics from the training split only to avoid leaking information — confirm
# whether that matters for this assignment.
numerical_cols = ["water", "uv", "area", "fertilizer_usage", "yield", "pesticides"]

for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1

    # Define bounds for outliers (1.5 * IQR beyond the quartiles)
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Cap the outliers: clip() replaces every value below the lower bound with the
    # lower bound and every value above the upper bound with the upper bound.
    data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

# Confirm that no missing values remain
missing_data_after = data.isnull().sum()

missing_data_after, data.describe()

(water               0
 uv                  0
 area                0
 fertilizer_usage    0
 yield               0
 pesticides          0
 region              0
 categories          0
 dtype: int64,
              water           uv         area  fertilizer_usage        yield  \
 count  1000.000000  1000.000000  1000.000000       1000.000000  1000.000000   
 mean      6.655903    73.943218     8.092453          2.294000    58.595117   
 std       2.768134     9.996096     2.668264          1.554986    24.073466   
 min       0.072000    45.320125     0.892125          0.000000     2.843000   
 25%       4.695500    66.493000     6.297000          1.000000    40.698000   
 50%       6.476000    73.700000     7.987500          3.000000    55.602500   
 75%       8.611000    80.608250     9.900250          3.000000    73.645500   
 max      14.484250   101.781125    15.305125          5.000000   123.066750   
 
         pesticides       region  
 count  1000.000000  1000.000000  
 mean    

In [5]:
# Separate the target ('yield') from the predictors (every remaining column).
y = data["yield"]
X = data.drop(columns="yield")

# One-hot encode the categorical 'categories' column; drop_first=True leaves out
# the first level so the dummy columns are not perfectly collinear.
# NOTE(review): 'region' is left as an integer code here — confirm whether it
# should be treated as categorical and encoded as well.
X = pd.get_dummies(X, columns=["categories"], drop_first=True)

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the held-out 30% in half, giving validation and test sets of
# equal size.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Next step: fit a linear regression model on X_train / y_train, then evaluate
# it on X_val / y_val and X_test / y_test.

# Report the number of rows in each partition to confirm the split.
sizes = {
    label: len(part)
    for label, part in (
        ("Training Set", X_train),
        ("Validation Set", X_val),
        ("Test Set", X_test),
    )
}

sizes

{'Training Set': 700, 'Validation Set': 150, 'Test Set': 150}