### Importing libraries and data

In [1]:
# Import libraries

import pandas as pd
import numpy as np

In [2]:
# Import Data

# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("../data/crop_yield_train.csv")

In [3]:
df.head()

Unnamed: 0,id,soil_ph,soil_moisture,avg_temperature,total_rainfall,fertilizer_amount,pesticide_usage,sunlight_hours,nitrogen_content,phosphorus_content,potassium_content,irrigation_frequency,crop_type,region,season,harvest_date,field_id,yield_tpha
0,0,7.831719,37.192725,32.270297,145.627849,124.830027,11.13249,1911.116978,1.445159,0.698938,1.074339,3,Soybean,South,Autumn,2021-03-09,F0138,3.790277
1,1,6.685905,23.715684,25.852201,599.005355,120.168428,11.846171,2011.488102,0.525983,1.137722,1.718454,3,Soybean,South,Spring,2021-07-18,F0393,5.660778
2,2,8.338307,20.4814,18.202587,333.247698,270.799112,10.588497,1929.725597,2.996184,1.07968,1.386601,5,Soybean,North,Summer,2021-11-20,F0066,7.098251
3,3,7.21468,42.446504,13.758647,523.610747,99.013588,1.222238,2231.228584,2.680929,0.764238,0.814746,5,Corn,East,Summer,2021-03-02,F0150,5.461535
4,4,5.129093,16.614817,14.444958,1005.931705,169.955045,6.3501,2826.831668,1.298629,1.487599,0.910354,3,Corn,Central,Summer,2021-10-24,F0498,6.336988


In [4]:
df.tail()

Unnamed: 0,id,soil_ph,soil_moisture,avg_temperature,total_rainfall,fertilizer_amount,pesticide_usage,sunlight_hours,nitrogen_content,phosphorus_content,potassium_content,irrigation_frequency,crop_type,region,season,harvest_date,field_id,yield_tpha
4795,4795,5.875831,17.025059,15.661515,878.536336,158.700801,3.257848,1285.428969,1.113319,0.202119,0.766489,6,Barley,East,Summer,2021-12-23,F0223,6.546929
4796,4796,7.303234,16.138273,27.927987,273.478409,151.216406,3.962554,1333.150567,1.817108,1.491216,0.666138,5,Wheat,Central,Summer,2021-10-01,F0091,7.003236
4797,4797,6.833229,38.105084,24.207628,575.764272,184.379434,13.942368,2009.721557,0.975856,0.856878,0.586865,2,Barley,North,Summer,2021-08-16,F0164,5.834223
4798,4798,5.057801,10.438546,28.840547,946.630937,109.656487,14.144132,1155.118005,1.542474,0.446792,0.908985,5,Corn,North,Autumn,2021-11-15,F0391,4.761456
4799,4799,7.60211,12.876269,18.356159,402.219536,75.920073,2.463546,1883.719248,1.293796,0.488368,1.297505,1,Rice,South,Autumn,2021-10-16,F0021,4.0929


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4800 non-null   int64  
 1   soil_ph               4800 non-null   float64
 2   soil_moisture         4800 non-null   float64
 3   avg_temperature       4800 non-null   float64
 4   total_rainfall        4800 non-null   float64
 5   fertilizer_amount     4800 non-null   float64
 6   pesticide_usage       4800 non-null   float64
 7   sunlight_hours        4800 non-null   float64
 8   nitrogen_content      4800 non-null   float64
 9   phosphorus_content    4800 non-null   float64
 10  potassium_content     4800 non-null   float64
 11  irrigation_frequency  4800 non-null   int64  
 12  crop_type             4800 non-null   object 
 13  region                4800 non-null   object 
 14  season                4800 non-null   object 
 15  harvest_date         

* Columns with the object data type hold categorical values, except the harvest date field.
* We will need to convert the object-type categorical values into numerical values.
* We will also convert the object-type harvest date to pandas datetime.

### Data Cleaning

In [6]:
# checking null values

In [7]:
# no null values
df.isna().sum(axis=0)

id                      0
soil_ph                 0
soil_moisture           0
avg_temperature         0
total_rainfall          0
fertilizer_amount       0
pesticide_usage         0
sunlight_hours          0
nitrogen_content        0
phosphorus_content      0
potassium_content       0
irrigation_frequency    0
crop_type               0
region                  0
season                  0
harvest_date            0
field_id                0
yield_tpha              0
dtype: int64

* There are no null values.

### Data processing

In [8]:
# Object-typed columns hold categorical data,
# so we need to encode them as numerical values.
# The date column should be converted to a datetime dtype rather than kept as object.

In [9]:
# convert the datetime object format to pandas datetime

In [10]:
# Parse the YYYY-MM-DD strings into pandas datetime values.
df["harvest_date"] = pd.to_datetime(
    arg=df["harvest_date"],
    format="%Y-%m-%d",
)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    4800 non-null   int64         
 1   soil_ph               4800 non-null   float64       
 2   soil_moisture         4800 non-null   float64       
 3   avg_temperature       4800 non-null   float64       
 4   total_rainfall        4800 non-null   float64       
 5   fertilizer_amount     4800 non-null   float64       
 6   pesticide_usage       4800 non-null   float64       
 7   sunlight_hours        4800 non-null   float64       
 8   nitrogen_content      4800 non-null   float64       
 9   phosphorus_content    4800 non-null   float64       
 10  potassium_content     4800 non-null   float64       
 11  irrigation_frequency  4800 non-null   int64         
 12  crop_type             4800 non-null   object        
 13  region            

* harvest date is now in datetime format

In [12]:
def encodecategorical(data):
    """Label-encode every object-dtype column of ``data`` with 1-based integers.

    Within each column, codes are assigned in the order values are returned
    by ``Series.unique()``, starting at 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place: every object-dtype column
        is overwritten with its integer codes.

    Returns
    -------
    tuple
        ``(allunq_mapper, data)``. ``allunq_mapper`` is a list holding one
        ``{original_value: code}`` dict per encoded column, in column order.
        ``data`` is the same DataFrame object that was passed in.

    Notes
    -----
    Because the frame is mutated, calling this again on an already-encoded
    frame finds no object columns and returns an empty mapper list.
    """
    # Only object-dtype columns are treated as categorical.
    objectcols = [col for col in data.columns if data[col].dtype == "O"]

    allunq_mapper = []
    for col in objectcols:
        # Build the mapping from the frame that was passed in (not from a
        # notebook-global variable) so the function works on any DataFrame.
        unq_mapper = {
            unq: unq_id
            for unq_id, unq in enumerate(data[col].unique(), start=1)
        }
        data[col] = data[col].map(unq_mapper)
        allunq_mapper.append(unq_mapper)

    return (allunq_mapper, data)

In [13]:
# map_dict receives a list of per-column {value: code} dicts;
# data_update is the same DataFrame object as df, now encoded.
map_dict, data_update = encodecategorical(data=df)

In [14]:
data_update

Unnamed: 0,id,soil_ph,soil_moisture,avg_temperature,total_rainfall,fertilizer_amount,pesticide_usage,sunlight_hours,nitrogen_content,phosphorus_content,potassium_content,irrigation_frequency,crop_type,region,season,harvest_date,field_id,yield_tpha
0,0,7.831719,37.192725,32.270297,145.627849,124.830027,11.132490,1911.116978,1.445159,0.698938,1.074339,3,1,1,1,2021-03-09,1,3.790277
1,1,6.685905,23.715684,25.852201,599.005355,120.168428,11.846171,2011.488102,0.525983,1.137722,1.718454,3,1,1,2,2021-07-18,2,5.660778
2,2,8.338307,20.481400,18.202587,333.247698,270.799112,10.588497,1929.725597,2.996184,1.079680,1.386601,5,1,2,3,2021-11-20,3,7.098251
3,3,7.214680,42.446504,13.758647,523.610747,99.013588,1.222238,2231.228584,2.680929,0.764238,0.814746,5,2,3,3,2021-03-02,4,5.461535
4,4,5.129093,16.614817,14.444958,1005.931705,169.955045,6.350100,2826.831668,1.298629,1.487599,0.910354,3,2,4,3,2021-10-24,5,6.336988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,4795,5.875831,17.025059,15.661515,878.536336,158.700801,3.257848,1285.428969,1.113319,0.202119,0.766489,6,3,3,3,2021-12-23,210,6.546929
4796,4796,7.303234,16.138273,27.927987,273.478409,151.216406,3.962554,1333.150567,1.817108,1.491216,0.666138,5,4,4,3,2021-10-01,243,7.003236
4797,4797,6.833229,38.105084,24.207628,575.764272,184.379434,13.942368,2009.721557,0.975856,0.856878,0.586865,2,3,2,3,2021-08-16,360,5.834223
4798,4798,5.057801,10.438546,28.840547,946.630937,109.656487,14.144132,1155.118005,1.542474,0.446792,0.908985,5,2,2,1,2021-11-15,180,4.761456
