# ANALAYSIS ON ARABLE PRODUCTION OF CEREAL CROPS IN IRELAND


## A CRISP-DM approach was used in this research

### STEP 1: Importing Relevant Libraries for Data Exploration and Analysis

In [1]:
# Importing the relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
sns.set(color_codes=True)
import statistics
import warnings
warnings.filterwarnings('ignore')

### STEP 2: Loading & Exploring Data on Cereal Crops in Ireland(source: Central Statistics Office)

In [2]:
# loading data into the data frame
df_cereals = pd.read_csv("AQA04.20221222T221238.csv")

In [3]:
# Displaying the top five rows in the data set
df_cereals.head()

Unnamed: 0,Statistic Label,Year,Type of Crop,UNIT,VALUE
0,Area under Crops,2008,Winter wheat,000 Hectares,87.5
1,Area under Crops,2008,Spring wheat,000 Hectares,23.2
2,Area under Crops,2008,Winter oats,000 Hectares,18.7
3,Area under Crops,2008,Spring oats,000 Hectares,4.2
4,Area under Crops,2008,Winter barley,000 Hectares,21.1


In [4]:
# Displaying the last five rows in the data set
df_cereals.tail()

Unnamed: 0,Statistic Label,Year,Type of Crop,UNIT,VALUE
247,Crop Production,2021,Spring wheat,000 Tonnes,52.7
248,Crop Production,2021,Winter oats,000 Tonnes,126.9
249,Crop Production,2021,Spring oats,000 Tonnes,111.4
250,Crop Production,2021,Winter barley,000 Tonnes,638.8
251,Crop Production,2021,Spring barley,000 Tonnes,917.6


In [5]:
# Displaying the info of the data set
# checking for data types because sometimes variables may be stored as string or an object
df_cereals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  252 non-null    object 
 1   Year             252 non-null    int64  
 2   Type of Crop     252 non-null    object 
 3   UNIT             252 non-null    object 
 4   VALUE            252 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 10.0+ KB


In [6]:
# Checking for missing values
print(df_cereals.isnull().sum()) 

Statistic Label    0
Year               0
Type of Crop       0
UNIT               0
VALUE              0
dtype: int64


In [7]:
# Function to Count the frequency of all values in a column
def count_values(df, column):
    counts = df[column].value_counts()
    return counts

df = df_cereals
counts = count_values(df, 'Statistic Label')
print(counts)


Area under Crops          84
Crop Yield per Hectare    84
Crop Production           84
Name: Statistic Label, dtype: int64


In [8]:
# Create a pivot table with the "Statistic Label", "Year", "UNIT" columns as the index,
# the "Type of Crop" column as the columns, and the 'VALUE' column as the values
irish_cer = df_cereals.pivot_table(index=["Statistic Label","Year","UNIT"], columns="Type of Crop", values="VALUE")



In [9]:
# Loading the pivotted table
irish_cer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Type of Crop,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
Statistic Label,Year,UNIT,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Area under Crops,2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
Area under Crops,2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
Area under Crops,2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
Area under Crops,2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
Area under Crops,2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6


In [10]:
# resetting the index to view & manipulate data
irish_cer.reset_index(inplace = True)
irish_cer

Type of Crop,Statistic Label,Year,UNIT,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops,2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops,2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops,2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops,2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops,2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6
5,Area under Crops,2013,000 Hectares,183.5,21.3,15.2,36.0,5.4,45.4
6,Area under Crops,2014,000 Hectares,155.6,8.5,6.5,60.1,10.1,65.1
7,Area under Crops,2015,000 Hectares,133.0,12.1,10.1,69.8,11.4,55.3
8,Area under Crops,2016,000 Hectares,114.6,10.0,7.5,74.6,13.2,60.4
9,Area under Crops,2017,000 Hectares,115.2,10.0,6.8,65.0,14.4,60.3


In [11]:
irish_cer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  42 non-null     object 
 1   Year             42 non-null     int64  
 2   UNIT             42 non-null     object 
 3   Spring barley    42 non-null     float64
 4   Spring oats      42 non-null     float64
 5   Spring wheat     42 non-null     float64
 6   Winter barley    42 non-null     float64
 7   Winter oats      42 non-null     float64
 8   Winter wheat     42 non-null     float64
dtypes: float64(6), int64(1), object(2)
memory usage: 3.1+ KB


In [12]:
# frequency of values in the "Static label" column
df = irish_cer
counts = count_values(df, 'Statistic Label')
print(counts)

Area under Crops          14
Crop Production           14
Crop Yield per Hectare    14
Name: Statistic Label, dtype: int64


In [13]:
# Replacing 'Statistic Label' values with more descriptive names
irish_cer['Statistic Label'] = irish_cer['Statistic Label'].replace({'Area under Crops': 'Area under Crops(000ha)', 
                                                                   'Crop Production': 'Crop Production(000tonnes)',
                                                                   'Crop Yield per Hectare': 'Crop Yield per ha(tonnes)'})


In [14]:
irish_cer

Type of Crop,Statistic Label,Year,UNIT,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops(000ha),2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops(000ha),2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops(000ha),2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops(000ha),2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops(000ha),2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6
5,Area under Crops(000ha),2013,000 Hectares,183.5,21.3,15.2,36.0,5.4,45.4
6,Area under Crops(000ha),2014,000 Hectares,155.6,8.5,6.5,60.1,10.1,65.1
7,Area under Crops(000ha),2015,000 Hectares,133.0,12.1,10.1,69.8,11.4,55.3
8,Area under Crops(000ha),2016,000 Hectares,114.6,10.0,7.5,74.6,13.2,60.4
9,Area under Crops(000ha),2017,000 Hectares,115.2,10.0,6.8,65.0,14.4,60.3


In [15]:
# Dropping the "UNIT" column because its description has been added to the "Statistic Label" column
irish_cer = irish_cer.drop(columns=["UNIT"])
irish_cer

Type of Crop,Statistic Label,Year,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops(000ha),2008,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops(000ha),2009,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops(000ha),2010,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops(000ha),2011,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops(000ha),2012,151.8,13.8,13.5,41.0,9.9,84.6
5,Area under Crops(000ha),2013,183.5,21.3,15.2,36.0,5.4,45.4
6,Area under Crops(000ha),2014,155.6,8.5,6.5,60.1,10.1,65.1
7,Area under Crops(000ha),2015,133.0,12.1,10.1,69.8,11.4,55.3
8,Area under Crops(000ha),2016,114.6,10.0,7.5,74.6,13.2,60.4
9,Area under Crops(000ha),2017,115.2,10.0,6.8,65.0,14.4,60.3


### STEP 3: Loading & Exploring Data on Cereal Crops Worldwide in Comparison to Ireland(source: World Bank)

#### DATA SET 1: Cereal Production in Metric Tonnes
* Load the Data set
* Data Cleaning

In [16]:
# loading data set on cereal production in metric tonnes
df_prod = pd.read_csv("API_AG.PRD.CREL.MT_DS2_en_csv_v2_4772045.csv")

In [17]:
# Displaying the head of data set 1
df_prod.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,...,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0,
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,...,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0,
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,...,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0,
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,,544000.0,543000.0,515000.0,564000.0,562000.0,...,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0,


In [18]:
# Displaying the tail of data set 1
df_prod.tail()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
261,Kosovo,XKX,Cereal production (metric tons),AG.PRD.CREL.MT,,,,,,,...,,,,,,,,,,
262,"Yemen, Rep.",YEM,Cereal production (metric tons),AG.PRD.CREL.MT,,938400.0,963100.0,978600.0,1002700.0,995900.0,...,909741.0,863334.0,699962.0,459246.0,357068.0,358355.0,344648.0,456714.0,447496.0,
263,South Africa,ZAF,Cereal production (metric tons),AG.PRD.CREL.MT,,6696635.0,7089341.0,7458814.0,5838900.0,5844000.0,...,14556168.0,14154568.0,16617349.0,11908002.0,10193885.0,18860592.0,14971803.0,13323672.0,18237226.0,
264,Zambia,ZMB,Cereal production (metric tons),AG.PRD.CREL.MT,,766608.0,748934.0,676717.0,773214.0,816214.0,...,3210649.0,2901568.0,3652301.0,2900164.0,3110868.0,3895117.0,2602943.0,2223127.0,3685484.0,
265,Zimbabwe,ZWE,Cereal production (metric tons),AG.PRD.CREL.MT,,1266453.0,1225137.0,1030933.0,1028799.0,1134571.0,...,1284472.0,1152560.0,1230995.0,800286.0,676748.0,1748001.0,1777331.0,665633.0,1598038.0,


In [19]:
# Displaying the info of the data set 1
# checking for data types because sometimes variables may be stored as string or an object
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 66 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    266 non-null    object 
 1   Country Code    266 non-null    object 
 2   Indicator Name  266 non-null    object 
 3   Indicator Code  266 non-null    object 
 4   1960            0 non-null      float64
 5   1961            192 non-null    float64
 6   1962            192 non-null    float64
 7   1963            192 non-null    float64
 8   1964            192 non-null    float64
 9   1965            192 non-null    float64
 10  1966            193 non-null    float64
 11  1967            193 non-null    float64
 12  1968            194 non-null    float64
 13  1969            194 non-null    float64
 14  1970            194 non-null    float64
 15  1971            195 non-null    float64
 16  1972            195 non-null    float64
 17  1973            195 non-null    flo

In [20]:
# Check if there are 'Not A Number' (NaN) values, also known as missing data, in the dataset columns
df_prod_missing =df_prod.isna()
df_prod_missing.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,False,False,False,False,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [21]:
# Function to check for missing values and replace them with zeros
def fill_missing_values(df_prod):
    # Check for missing values
    missing = df_prod.isnull().sum()
    
    # Replace missing values with 0
    df_prod = df_prod.fillna(0)
      
    return df_prod


In [22]:
# Fill missing values with 0
df_prod = fill_missing_values(df_prod)
df_prod

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,...,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0,0.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,...,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0,0.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,...,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0,0.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,544000.0,543000.0,515000.0,564000.0,562000.0,...,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,"Yemen, Rep.",YEM,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,938400.0,963100.0,978600.0,1002700.0,995900.0,...,909741.0,863334.0,699962.0,459246.0,357068.0,358355.0,344648.0,456714.0,447496.0,0.0
263,South Africa,ZAF,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,6696635.0,7089341.0,7458814.0,5838900.0,5844000.0,...,14556168.0,14154568.0,16617349.0,11908002.0,10193885.0,18860592.0,14971803.0,13323672.0,18237226.0,0.0
264,Zambia,ZMB,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,766608.0,748934.0,676717.0,773214.0,816214.0,...,3210649.0,2901568.0,3652301.0,2900164.0,3110868.0,3895117.0,2602943.0,2223127.0,3685484.0,0.0


In [23]:
# checking on the description of the values of the column "1960"
df_prod["1960"].describe

<bound method NDFrame.describe of 0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
261    0.0
262    0.0
263    0.0
264    0.0
265    0.0
Name: 1960, Length: 266, dtype: float64>

In [24]:
# checking on the description of the values of the column "2021"
df_prod["2021"].describe

<bound method NDFrame.describe of 0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
261    0.0
262    0.0
263    0.0
264    0.0
265    0.0
Name: 2021, Length: 266, dtype: float64>

In [25]:
# dropping the column "1960" & "2021" because they have zeroes
df_prod = df_prod.drop(columns=["1960","2021"])
df_prod

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1961,1962,1963,1964,1965,1966,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,17799611.0,...,69875363.0,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,3489000.0,...,4681020.0,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,14568709.0,...,51820984.0,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,544000.0,543000.0,515000.0,564000.0,562000.0,502000.0,...,1412826.0,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,"Yemen, Rep.",YEM,Cereal production (metric tons),AG.PRD.CREL.MT,938400.0,963100.0,978600.0,1002700.0,995900.0,918300.0,...,816555.0,909741.0,863334.0,699962.0,459246.0,357068.0,358355.0,344648.0,456714.0,447496.0
263,South Africa,ZAF,Cereal production (metric tons),AG.PRD.CREL.MT,6696635.0,7089341.0,7458814.0,5838900.0,5844000.0,6183711.0,...,12928365.0,14556168.0,14154568.0,16617349.0,11908002.0,10193885.0,18860592.0,14971803.0,13323672.0,18237226.0
264,Zambia,ZMB,Cereal production (metric tons),AG.PRD.CREL.MT,766608.0,748934.0,676717.0,773214.0,816214.0,892081.0,...,3376059.0,3210649.0,2901568.0,3652301.0,2900164.0,3110868.0,3895117.0,2602943.0,2223127.0,3685484.0


In [26]:
# Using Melt to arrange the data set in a chronological order
# to be able to add more columns from new data sets and retain meaning
df_prod_melt=df_prod.melt(id_vars=["Country Name","Country Code","Indicator Name","Indicator Code"],
                    var_name="Date",
                    value_name="Cereals")

In [27]:
# loading the first five rows of the melted data
df_prod_melt.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Date,Cereals
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,1961,17059517.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,1961,3695000.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,15199359.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,1961,544000.0


In [28]:
# checking on the data types of after using the melt function
df_prod_melt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15960 entries, 0 to 15959
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    15960 non-null  object 
 1   Country Code    15960 non-null  object 
 2   Indicator Name  15960 non-null  object 
 3   Indicator Code  15960 non-null  object 
 4   Date            15960 non-null  object 
 5   Cereals         15960 non-null  float64
dtypes: float64(1), object(5)
memory usage: 748.2+ KB


In [29]:
# Rename the 'Cereals' column to 'Cereals(metric tons)' so as to drop colum "Indicator Name" & "Indicator Code"
df_prod1 = df_prod_melt.rename(columns={'Cereals': 'CerealProduction(metric tons)'})
df_prod1

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Date,CerealProduction(metric tons)
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,1961,17059517.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,1961,3695000.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,15199359.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,1961,544000.0
...,...,...,...,...,...,...
15955,Kosovo,XKX,Cereal production (metric tons),AG.PRD.CREL.MT,2020,0.0
15956,"Yemen, Rep.",YEM,Cereal production (metric tons),AG.PRD.CREL.MT,2020,447496.0
15957,South Africa,ZAF,Cereal production (metric tons),AG.PRD.CREL.MT,2020,18237226.0
15958,Zambia,ZMB,Cereal production (metric tons),AG.PRD.CREL.MT,2020,3685484.0


In [30]:
# dropping the column "Indicator Name" & "Indicator Code"
# because its description has been added to the "Cereals(metric tons)" column
# the description of the data set has been giving before loading the data hence no requirement for "Indicator code"
df_prodmt = df_prod1.drop(columns=["Indicator Name","Indicator Code"])
df_prodmt

Unnamed: 0,Country Name,Country Code,Date,CerealProduction(metric tons)
0,Aruba,ABW,1961,0.0
1,Africa Eastern and Southern,AFE,1961,17059517.0
2,Afghanistan,AFG,1961,3695000.0
3,Africa Western and Central,AFW,1961,15199359.0
4,Angola,AGO,1961,544000.0
...,...,...,...,...
15955,Kosovo,XKX,2020,0.0
15956,"Yemen, Rep.",YEM,2020,447496.0
15957,South Africa,ZAF,2020,18237226.0
15958,Zambia,ZMB,2020,3685484.0


In [31]:
# Changing the year column from an object into a string for analysis
df_prodmt["Date"] = pd.to_numeric(df_prodmt["Date"])
df_prodmt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15960 entries, 0 to 15959
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country Name                   15960 non-null  object 
 1   Country Code                   15960 non-null  object 
 2   Date                           15960 non-null  int64  
 3   CerealProduction(metric tons)  15960 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 498.9+ KB


#### DATA SET 2: Cereal Yields Per Hectare
* Load the Data set
* Data Cleaning

In [32]:
# loading data set on cereal yields per hectare
df_yield = pd.read_csv("API_AG.YLD.CREL.KG_DS2_en_csv_v2_4772233.csv")

In [33]:
# Displaying the head of data set 2
df_yield.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,997.345265,1026.535397,1019.769151,944.857573,935.63402,...,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.66776,1830.00208,
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,1115.1,1079.0,985.8,1082.8,1098.9,...,2029.6,2048.5,2017.5,2132.2,1980.4,2023.2,2162.6,2113.4,1979.9,
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,675.354815,702.244456,698.032526,691.134187,658.343442,...,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159,
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,828.0,830.3,798.4,875.8,932.0,...,552.0,814.4,888.2,982.4,865.4,806.2,939.1,952.8,906.6,


In [34]:
# Displaying the tail of data set 2
df_yield.tail()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
261,Kosovo,XKX,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,,,,,,...,,,,,,,,,,
262,"Yemen, Rep.",YEM,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,782.5,780.7,771.8,776.1,773.6,...,1064.4,1008.2,962.7,784.2,687.0,699.0,682.8,864.9,833.9,
263,South Africa,ZAF,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,1099.1,1142.1,1128.0,913.9,911.4,...,4243.5,4043.0,4899.6,3541.6,3821.2,5702.0,4938.8,4316.6,5407.2,
264,Zambia,ZMB,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,822.2,801.4,706.9,788.9,823.5,...,2708.8,2552.7,2774.9,3026.4,2432.2,2489.9,2168.2,2397.7,2479.6,
265,Zimbabwe,ZWE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,919.7,905.9,822.5,820.5,930.8,...,695.7,668.5,831.4,557.5,435.1,1202.7,1242.2,1173.7,1075.2,


In [35]:
# Check if there are 'Not A Number' (NaN) values, also known as missing data, in the dataset columns
df_yield_missing =df_yield.isna()
df_yield_missing.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,False,False,False,False,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [36]:
# Function to check for missing values and replace them with zeros
def fill_missing_values(df_yield):
    # Check for missing values
    missing = df_yield.isnull().sum()
    
    # Replace missing values with 0
    df_yield = df_yield.fillna(0)
      
    return df_yield

In [37]:
# Fill missing values with 0
df_yield = fill_missing_values(df_yield)
df_yield

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,997.345265,1026.535397,1019.769151,944.857573,935.634020,...,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.667760,1830.002080,0.0
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,1115.100000,1079.000000,985.800000,1082.800000,1098.900000,...,2029.600000,2048.500000,2017.500000,2132.200000,1980.400000,2023.200000,2162.600000,2113.400000,1979.900000,0.0
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,675.354815,702.244456,698.032526,691.134187,658.343442,...,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159,0.0
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,828.000000,830.300000,798.400000,875.800000,932.000000,...,552.000000,814.400000,888.200000,982.400000,865.400000,806.200000,939.100000,952.800000,906.600000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
262,"Yemen, Rep.",YEM,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,782.500000,780.700000,771.800000,776.100000,773.600000,...,1064.400000,1008.200000,962.700000,784.200000,687.000000,699.000000,682.800000,864.900000,833.900000,0.0
263,South Africa,ZAF,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,1099.100000,1142.100000,1128.000000,913.900000,911.400000,...,4243.500000,4043.000000,4899.600000,3541.600000,3821.200000,5702.000000,4938.800000,4316.600000,5407.200000,0.0
264,Zambia,ZMB,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,822.200000,801.400000,706.900000,788.900000,823.500000,...,2708.800000,2552.700000,2774.900000,3026.400000,2432.200000,2489.900000,2168.200000,2397.700000,2479.600000,0.0


In [38]:
# dropping the column "1960" & "2021" because they have zeroes
df_yield = df_yield.drop(columns=["1960","2021"])
df_yield.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1961,1962,1963,1964,1965,1966,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,997.345265,1026.535397,1019.769151,944.857573,935.63402,955.604131,...,1720.800433,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.66776,1830.00208
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,1115.1,1079.0,985.8,1082.8,1098.9,1012.3,...,1659.9,2029.6,2048.5,2017.5,2132.2,1980.4,2023.2,2162.6,2113.4,1979.9
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,675.354815,702.244456,698.032526,691.134187,658.343442,643.872508,...,1063.10061,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,828.0,830.3,798.4,875.8,932.0,824.3,...,662.4,552.0,814.4,888.2,982.4,865.4,806.2,939.1,952.8,906.6
