### **Load Library**

In [1]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; a later cell changes the working
# directory to a project folder located under this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
import os
import kagglehub
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from google.colab import userdata
from matplotlib import pyplot as plt

In [3]:
# Read the project folder path from Colab user data (key "FOLDER_PATH")
# instead of hard-coding it in the notebook.
FOLDER_PATH = userdata.get("FOLDER_PATH")
# Make that folder the working directory so relative paths used later
# (e.g. the "processed" folder) resolve inside it.
%cd $FOLDER_PATH

/content/gdrive/MyDrive/AI Models/SupervisedLearning/regression-crop-yield


### **Load Dataset**

In [4]:
# Helper for reading one CSV file out of a dataset folder
def pandas_df(path: str, file_name: str) -> pd.DataFrame:
    """Read a CSV file located inside a directory into a DataFrame.

    Args:
        path: Directory that contains the file.
        file_name: Name of the CSV file inside ``path``.

    Returns:
        The parsed contents of the file as a ``pd.DataFrame``.
    """
    # Build the full location in its own variable so the ``path`` argument
    # keeps meaning "the directory" for the whole function body.
    csv_path = os.path.join(path, file_name)

    return pd.read_csv(csv_path)


In [5]:
# Dataset page: https://www.kaggle.com/datasets/samuelotiattakorah/agriculture-crop-yield

# Fetch the dataset files with kagglehub, then read the CSV from the returned folder
PATH = kagglehub.dataset_download("samuelotiattakorah/agriculture-crop-yield")
df = pandas_df(PATH, 'crop_yield.csv')

# Lower-case every column name so later cells can refer to them uniformly
df = df.rename(columns=str.lower)
df.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/samuelotiattakorah/agriculture-crop-yield?dataset_version_number=1...


100%|██████████| 33.4M/33.4M [00:01<00:00, 18.7MB/s]

Extracting files...





Unnamed: 0,region,soil_type,crop,rainfall_mm,temperature_celsius,fertilizer_used,irrigation_used,weather_condition,days_to_harvest,yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


### **Data Overview**

In [6]:
# Column names (already lower-cased when the dataset was loaded)
print(df.columns)

Index(['region', 'soil_type', 'crop', 'rainfall_mm', 'temperature_celsius',
       'fertilizer_used', 'irrigation_used', 'weather_condition',
       'days_to_harvest', 'yield_tons_per_hectare'],
      dtype='object')


In [7]:
# Data dimension as a (rows, columns) tuple
print("Dimension: ", df.shape)

Dimension:  (1000000, 10)


In [8]:
# Data information: per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   region                  1000000 non-null  object 
 1   soil_type               1000000 non-null  object 
 2   crop                    1000000 non-null  object 
 3   rainfall_mm             1000000 non-null  float64
 4   temperature_celsius     1000000 non-null  float64
 5   fertilizer_used         1000000 non-null  bool   
 6   irrigation_used         1000000 non-null  bool   
 7   weather_condition       1000000 non-null  object 
 8   days_to_harvest         1000000 non-null  int64  
 9   yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB


In [9]:
# Missing values per column, followed by the number of fully duplicated rows
missing_per_column = df.isna().sum()
duplicate_rows = df.duplicated().sum()
separator = "-" * 30

print("\nNumber of missing values")
print(separator)
print(missing_per_column)

print("\nNumber of duplicated values")
print(separator)
print(duplicate_rows)


Number of missing values
------------------------------
region                    0
soil_type                 0
crop                      0
rainfall_mm               0
temperature_celsius       0
fertilizer_used           0
irrigation_used           0
weather_condition         0
days_to_harvest           0
yield_tons_per_hectare    0
dtype: int64

Number of duplicated values
------------------------------
0


In [10]:
# Data Summary: one describe() table per dtype family
# (numerical, string/object, boolean), each preceded by a banner line.
summary_sections = (
    ("Numerical Summary:", "number"),
    ("\nString Summary:", "object"),
    ("\nBoolean Summary:", "boolean"),
)

for heading, dtype_family in summary_sections:
    print("*"*60)
    print(heading.upper())
    print(df.describe(include=dtype_family))

************************************************************
NUMERICAL SUMMARY:
          rainfall_mm  temperature_celsius  days_to_harvest  \
count  1000000.000000       1000000.000000   1000000.000000   
mean       549.981901            27.504965       104.495025   
std        259.851320             7.220608        25.953412   
min        100.000896            15.000034        60.000000   
25%        324.891090            21.254502        82.000000   
50%        550.124061            27.507365       104.000000   
75%        774.738520            33.753267       127.000000   
max        999.998098            39.999997       149.000000   

       yield_tons_per_hectare  
count          1000000.000000  
mean                 4.649472  
std                  1.696572  
min                 -1.147613  
25%                  3.417637  
50%                  4.651808  
75%                  5.879200  
max                  9.963372  
************************************************************

ST

### **Data processing & cleaning**

In [15]:
# Copy Dataframe: the cleaning steps below work on `dataframe`,
# leaving the loaded `df` unchanged.
dataframe = df.copy()

In [16]:
# Renaming columns: shorten the target column name.
# Reassign the result instead of using inplace=True so the cell produces a
# new frame rather than mutating the existing object.
dataframe = dataframe.rename(columns={"yield_tons_per_hectare": "crop_yield"})

In [17]:
# Lower-case the values of every categorical (object-dtype) column
text_columns = dataframe.select_dtypes('object').columns
for column in text_columns:
    lowered = dataframe[column].str.lower()
    dataframe[column] = lowered

In [18]:
# Feature Engineering
# Flag rows whose yield is strictly positive. np.where evaluates the whole
# column at once instead of calling a Python lambda for each row.
dataframe['yield_success'] = np.where(dataframe["crop_yield"] > 0, "yes", "no")

# Recode every boolean column as "yes"/"no" strings, again column-wise.
bool_columns = dataframe.select_dtypes("boolean").columns
for c in bool_columns:
    dataframe[c] = np.where(dataframe[c], "yes", "no")

In [19]:
# Move the engineered flag and the target to the end of the frame,
# keeping every other column in its current order.
add_cols = ["yield_success", "crop_yield"]
column_names = list(dataframe.columns)

sorted_columns = []
for name in column_names:
    if name in add_cols:
        continue
    sorted_columns.append(name)
sorted_columns.extend(add_cols)

dataframe = dataframe[sorted_columns]
dataframe.head()

Unnamed: 0,region,soil_type,crop,rainfall_mm,temperature_celsius,fertilizer_used,irrigation_used,weather_condition,days_to_harvest,yield_success,crop_yield
0,west,sandy,cotton,897.077239,27.676966,no,yes,cloudy,122,yes,6.555816
1,south,clay,rice,992.673282,18.026142,yes,yes,rainy,140,yes,8.527341
2,north,loam,barley,147.998025,29.794042,no,no,sunny,106,yes,1.127443
3,north,sandy,soybean,986.866331,16.64419,no,yes,rainy,146,yes,6.517573
4,south,silt,wheat,730.379174,31.620687,yes,yes,cloudy,110,yes,7.248251


### **Save Processed Data**

In [137]:
%%bash
# Create the processed folder only if it does not exist in the working directory.
FILE_DIR="processed"

if [[ ! -d "$FILE_DIR" ]]; then
    echo "Create '$FILE_DIR' folder ..."
    # Use the variable rather than a second hard-coded copy of the name, so
    # the existence check and the creation always refer to the same folder.
    mkdir -p "$FILE_DIR"
else
    echo "Folder '$FILE_DIR' already exists."
fi

Folder 'processed' already exists.


In [138]:
# Naming file output
FILE_NAME = "crop_yield_dataset.csv"
FILE_OUTPUT = os.path.join("processed", FILE_NAME)

# Log existing files into file_log.txt
!find . -maxdepth 2 -type f > file_log.txt

# Load existing files into a pyton list
with open("file_log.txt", "r") as f:
    list_files = [file.split("/")[-1].rstrip() for file in f.readlines()]
    print(list_files)

['notebook_profiling_clean.ipynb', 'description', 'packed-refs', 'HEAD', 'config', 'COMMIT_EDITMSG', 'index', 'FETCH_HEAD', 'ORIG_HEAD', 'README.md', '.gitignore', 'gitCLI.ipynb', 'notebook_dev.ipynb', 'notebook_analysis.ipynb', 'file_log.txt', 'crop_yield_dataset.csv']


In [139]:
# Check the output path itself rather than matching a bare file name:
# a same-named file in any other folder would otherwise suppress the save.
if not os.path.exists(FILE_OUTPUT):
    print("Saving processed dataset ...")
    # index=False keeps the row index out of the CSV.
    dataframe.to_csv(FILE_OUTPUT, index=False)
else:
    print(f"File '{FILE_OUTPUT}' already created.")

File 'processed/crop_yield_dataset.csv' already created.
