# ANALAYSIS ON ARABLE PRODUCTION OF CEREAL CROPS IN IRELAND


## A CRISP-DM approach was used in this research

### STEP 1: Importing Relevant Libraries for Data Exploration and Analysis

In [1]:
# Importing the relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
sns.set(color_codes=True)
import statistics
import warnings
warnings.filterwarnings('ignore')

### STEP 2: Loading & Exploring Data on Cereal Crops in Ireland(source: Central Statistics Office)

* Loading Data
* Cleaning and enriching Data
* Graphical Representation of the Data

#### (i.) Loading Data

In [2]:
# loading data into the data frame
df_cereals = pd.read_csv("AQA04.20221222T221238.csv")

In [3]:
# Displaying the top five rows in the data set
df_cereals.head()

Unnamed: 0,Statistic Label,Year,Type of Crop,UNIT,VALUE
0,Area under Crops,2008,Winter wheat,000 Hectares,87.5
1,Area under Crops,2008,Spring wheat,000 Hectares,23.2
2,Area under Crops,2008,Winter oats,000 Hectares,18.7
3,Area under Crops,2008,Spring oats,000 Hectares,4.2
4,Area under Crops,2008,Winter barley,000 Hectares,21.1


In [4]:
# Displaying the last five rows in the data set
df_cereals.tail()

Unnamed: 0,Statistic Label,Year,Type of Crop,UNIT,VALUE
247,Crop Production,2021,Spring wheat,000 Tonnes,52.7
248,Crop Production,2021,Winter oats,000 Tonnes,126.9
249,Crop Production,2021,Spring oats,000 Tonnes,111.4
250,Crop Production,2021,Winter barley,000 Tonnes,638.8
251,Crop Production,2021,Spring barley,000 Tonnes,917.6


In [5]:
# Displaying the info of the data set
# checking for data types because sometimes variables may be stored as string or an object
df_cereals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  252 non-null    object 
 1   Year             252 non-null    int64  
 2   Type of Crop     252 non-null    object 
 3   UNIT             252 non-null    object 
 4   VALUE            252 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 10.0+ KB


In [6]:
# Checking for missing values
print(df_cereals.isnull().sum()) 

Statistic Label    0
Year               0
Type of Crop       0
UNIT               0
VALUE              0
dtype: int64


In [7]:
# Function to Count the frequency of all values in a column
def count_values(df, column):
    counts = df[column].value_counts()
    return counts

df = df_cereals
counts = count_values(df, 'Statistic Label')
print(counts)

Area under Crops          84
Crop Yield per Hectare    84
Crop Production           84
Name: Statistic Label, dtype: int64


#### (ii.) Cleaning and enriching Data

In [8]:
# Create a pivot table with the "Statistic Label", "Year", "UNIT" columns as the index,
# the "Type of Crop" column as the columns, and the 'VALUE' column as the values
irish_cer = df_cereals.pivot_table(index=["Statistic Label","Year","UNIT"], columns="Type of Crop", values="VALUE")

In [9]:
# Loading the pivotted table
irish_cer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Type of Crop,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
Statistic Label,Year,UNIT,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Area under Crops,2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
Area under Crops,2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
Area under Crops,2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
Area under Crops,2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
Area under Crops,2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6


In [10]:
# resetting the index to view & manipulate data
irish_cer.reset_index(inplace = True)
irish_cer.head()

Type of Crop,Statistic Label,Year,UNIT,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops,2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops,2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops,2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops,2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops,2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6


In [11]:
irish_cer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  42 non-null     object 
 1   Year             42 non-null     int64  
 2   UNIT             42 non-null     object 
 3   Spring barley    42 non-null     float64
 4   Spring oats      42 non-null     float64
 5   Spring wheat     42 non-null     float64
 6   Winter barley    42 non-null     float64
 7   Winter oats      42 non-null     float64
 8   Winter wheat     42 non-null     float64
dtypes: float64(6), int64(1), object(2)
memory usage: 3.1+ KB


In [12]:
# frequency of values in the "Static label" column
df = irish_cer
counts = count_values(df, 'Statistic Label')
print(counts)

Area under Crops          14
Crop Production           14
Crop Yield per Hectare    14
Name: Statistic Label, dtype: int64


In [13]:
# Replacing 'Statistic Label' values with more descriptive names
irish_cer['Statistic Label'] = irish_cer['Statistic Label'].replace({'Area under Crops': 'Area under Crops(000ha)', 
                                                                   'Crop Production': 'Crop Production(000tonnes)',
                                                                   'Crop Yield per Hectare': 'Crop Yield per ha(tonnes)'})


In [14]:
irish_cer.head()

Type of Crop,Statistic Label,Year,UNIT,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops(000ha),2008,000 Hectares,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops(000ha),2009,000 Hectares,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops(000ha),2010,000 Hectares,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops(000ha),2011,000 Hectares,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops(000ha),2012,000 Hectares,151.8,13.8,13.5,41.0,9.9,84.6


In [15]:
# Dropping the "UNIT" column because its description has been added to the "Statistic Label" column
irish_cer = irish_cer.drop(columns=["UNIT"])
irish_cer.head()

Type of Crop,Statistic Label,Year,Spring barley,Spring oats,Spring wheat,Winter barley,Winter oats,Winter wheat
0,Area under Crops(000ha),2008,166.0,4.2,23.2,21.1,18.7,87.5
1,Area under Crops(000ha),2009,174.3,11.3,20.2,19.3,9.1,64.3
2,Area under Crops(000ha),2010,146.0,9.4,18.0,28.8,10.3,59.8
3,Area under Crops(000ha),2011,144.8,12.4,16.4,35.9,9.0,77.7
4,Area under Crops(000ha),2012,151.8,13.8,13.5,41.0,9.9,84.6


#### (iii.) Graphical Representation of the Data

* Bar Graph; Bar charts are one of the most common data visualizations. You can use them to quickly compare data across categories, highlight differences, show trends and outliers, and reveal historical highs and lows at a glance.
* Line Chart; The line chart, or line graph, connects several distinct data points, presenting them as one continuous evolution. Use line charts to view trends in data, usually over time
* Heatmap; Heat Map Chart, or Heatmap is a two-dimensional visual representation of data, where values are encoded in colors, delivering a convenient, insightful view of information
* Bubble Chart; Bubble charts are super useful types of graphs for making a comparison of the relationships between data in 3 numeric-data dimensions
* https://www.tableau.com/learn/whitepapers/which-chart-or-graph-is-right-for-you
* https://www.intellspot.com/types-graphs-charts/
* https://www.anychart.com/chartopedia/chart-type/heatmap/#:~:text=Heat%20Map%20Chart%2C%20or%20Heatmap,denoting%20different%20sets%20of%20categories.

#### Bubble Chart

### STEP 3: Loading & Exploring Data on Cereal Crops Worldwide in Comparison to Ireland(source: World Bank)

#### DATA SET 1: Cereal Production in Metric Tonnes
* Load the Data set
* Data Cleaning

In [16]:
# loading data set on cereal production in metric tonnes
df_prod = pd.read_csv("API_AG.PRD.CREL.MT_DS2_en_csv_v2_4772045.csv")

In [17]:
# Displaying the head of data set 1
df_prod.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,...,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0,
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,...,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0,
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,...,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0,
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,,544000.0,543000.0,515000.0,564000.0,562000.0,...,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0,


In [18]:
# Displaying the tail of data set 1
df_prod.tail()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
261,Kosovo,XKX,Cereal production (metric tons),AG.PRD.CREL.MT,,,,,,,...,,,,,,,,,,
262,"Yemen, Rep.",YEM,Cereal production (metric tons),AG.PRD.CREL.MT,,938400.0,963100.0,978600.0,1002700.0,995900.0,...,909741.0,863334.0,699962.0,459246.0,357068.0,358355.0,344648.0,456714.0,447496.0,
263,South Africa,ZAF,Cereal production (metric tons),AG.PRD.CREL.MT,,6696635.0,7089341.0,7458814.0,5838900.0,5844000.0,...,14556168.0,14154568.0,16617349.0,11908002.0,10193885.0,18860592.0,14971803.0,13323672.0,18237226.0,
264,Zambia,ZMB,Cereal production (metric tons),AG.PRD.CREL.MT,,766608.0,748934.0,676717.0,773214.0,816214.0,...,3210649.0,2901568.0,3652301.0,2900164.0,3110868.0,3895117.0,2602943.0,2223127.0,3685484.0,
265,Zimbabwe,ZWE,Cereal production (metric tons),AG.PRD.CREL.MT,,1266453.0,1225137.0,1030933.0,1028799.0,1134571.0,...,1284472.0,1152560.0,1230995.0,800286.0,676748.0,1748001.0,1777331.0,665633.0,1598038.0,


In [19]:
# Displaying the info of the data set 1
# checking for data types because sometimes variables may be stored as string or an object
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 66 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    266 non-null    object 
 1   Country Code    266 non-null    object 
 2   Indicator Name  266 non-null    object 
 3   Indicator Code  266 non-null    object 
 4   1960            0 non-null      float64
 5   1961            192 non-null    float64
 6   1962            192 non-null    float64
 7   1963            192 non-null    float64
 8   1964            192 non-null    float64
 9   1965            192 non-null    float64
 10  1966            193 non-null    float64
 11  1967            193 non-null    float64
 12  1968            194 non-null    float64
 13  1969            194 non-null    float64
 14  1970            194 non-null    float64
 15  1971            195 non-null    float64
 16  1972            195 non-null    float64
 17  1973            195 non-null    flo

In [20]:
# Check if there are 'Not A Number' (NaN) values, also known as missing data, in the dataset columns
df_prod_missing =df_prod.isna()
df_prod_missing.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,False,False,False,False,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [21]:
# Function to check for missing values and replace them with zeros
def fill_missing_values(df_prod):
    # Check for missing values
    missing = df_prod.isnull().sum()
    
    # Replace missing values with 0
    df_prod = df_prod.fillna(0)
      
    return df_prod


In [22]:
# Fill missing values with 0
df_prod = fill_missing_values(df_prod)
df_prod.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,...,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0,0.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,...,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0,0.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,...,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0,0.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,544000.0,543000.0,515000.0,564000.0,562000.0,...,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0,0.0


In [23]:
# checking on the description of the values of the column "1960"
df_prod["1960"].describe

<bound method NDFrame.describe of 0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
261    0.0
262    0.0
263    0.0
264    0.0
265    0.0
Name: 1960, Length: 266, dtype: float64>

In [24]:
# checking on the description of the values of the column "2021"
df_prod["2021"].describe

<bound method NDFrame.describe of 0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
261    0.0
262    0.0
263    0.0
264    0.0
265    0.0
Name: 2021, Length: 266, dtype: float64>

In [25]:
# dropping the column "1960" & "2021" because they have zeroes
df_prod = df_prod.drop(columns=["1960","2021"])
df_prod.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1961,1962,1963,1964,1965,1966,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,17059517.0,17673768.0,18093367.0,16734227.0,17018761.0,17799611.0,...,69875363.0,76507660.0,80427044.0,90152808.0,79223273.0,82273939.0,94106309.0,94650120.0,90200861.0,97955319.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,3695000.0,3696000.0,3378000.0,3732000.0,3785000.0,3489000.0,...,4681020.0,6379000.0,6520329.0,6748023.0,5808288.0,5532695.0,4894365.0,4134191.0,5583461.0,6025977.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,15199359.0,16249447.0,16419357.0,16367923.0,16309479.0,14568709.0,...,51820984.0,60132437.0,57631047.0,62793664.0,66126114.0,71055876.0,70550388.0,76159885.0,77816712.0,78462572.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,544000.0,543000.0,515000.0,564000.0,562000.0,502000.0,...,1412826.0,509545.0,1674598.0,1823231.0,2019544.0,2363287.0,2496354.0,2876750.0,2920433.0,2427955.0


In [26]:
# Using Melt to arrange the data set in a chronological order
# to be able to add more columns from new data sets and retain meaning
df_prod_melt=df_prod.melt(id_vars=["Country Name","Country Code","Indicator Name","Indicator Code"],
                    var_name="Date",
                    value_name="Cereals")

In [27]:
# loading the first five rows of the melted data
df_prod_melt.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Date,Cereals
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,1961,17059517.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,1961,3695000.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,15199359.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,1961,544000.0


In [28]:
# Rename the 'Cereals' column to 'Cereals(metric tons)' so as to drop colum "Indicator Name" & "Indicator Code"
df_prod1 = df_prod_melt.rename(columns={'Cereals': 'CerealProduction(metric tons)'})
df_prod1.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Date,CerealProduction(metric tons)
0,Aruba,ABW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,0.0
1,Africa Eastern and Southern,AFE,Cereal production (metric tons),AG.PRD.CREL.MT,1961,17059517.0
2,Afghanistan,AFG,Cereal production (metric tons),AG.PRD.CREL.MT,1961,3695000.0
3,Africa Western and Central,AFW,Cereal production (metric tons),AG.PRD.CREL.MT,1961,15199359.0
4,Angola,AGO,Cereal production (metric tons),AG.PRD.CREL.MT,1961,544000.0


In [29]:
# dropping the column "Indicator Name" & "Indicator Code"
# because their description has been added to the "Cereals(metric tons)" column
# the description of the data set information(mark down) has been giving before loading the data hence
df_prodmt = df_prod1.drop(columns=["Indicator Name","Indicator Code"])
df_prodmt.head()

Unnamed: 0,Country Name,Country Code,Date,CerealProduction(metric tons)
0,Aruba,ABW,1961,0.0
1,Africa Eastern and Southern,AFE,1961,17059517.0
2,Afghanistan,AFG,1961,3695000.0
3,Africa Western and Central,AFW,1961,15199359.0
4,Angola,AGO,1961,544000.0


In [30]:
# Changing the year column from an object into a string for analysis
df_prodmt["Date"] = pd.to_numeric(df_prodmt["Date"])
df_prodmt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15960 entries, 0 to 15959
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country Name                   15960 non-null  object 
 1   Country Code                   15960 non-null  object 
 2   Date                           15960 non-null  int64  
 3   CerealProduction(metric tons)  15960 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 498.9+ KB


#### DATA SET 2: Cereal Yields Per Hectare
* Load the Data set
* Data Cleaning

In [31]:
# loading data set on cereal yields per hectare
df_yield = pd.read_csv("API_AG.YLD.CREL.KG_DS2_en_csv_v2_4772233.csv")

In [32]:
# Displaying the head of data set 2
df_yield.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,997.345265,1026.535397,1019.769151,944.857573,935.63402,...,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.66776,1830.00208,
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,1115.1,1079.0,985.8,1082.8,1098.9,...,2029.6,2048.5,2017.5,2132.2,1980.4,2023.2,2162.6,2113.4,1979.9,
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,675.354815,702.244456,698.032526,691.134187,658.343442,...,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159,
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,828.0,830.3,798.4,875.8,932.0,...,552.0,814.4,888.2,982.4,865.4,806.2,939.1,952.8,906.6,


In [33]:
# Function to check for missing values and replace them with zeros
def fill_missing_values(df_yield):
    # Check for missing values
    missing = df_yield.isnull().sum()
    
    # Replace missing values with 0
    df_yield = df_yield.fillna(0)
      
    return df_yield

In [34]:
# Fill missing values with 0
df_yield = fill_missing_values(df_yield)
df_yield.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,997.345265,1026.535397,1019.769151,944.857573,935.63402,...,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.66776,1830.00208,0.0
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,1115.1,1079.0,985.8,1082.8,1098.9,...,2029.6,2048.5,2017.5,2132.2,1980.4,2023.2,2162.6,2113.4,1979.9,0.0
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,675.354815,702.244456,698.032526,691.134187,658.343442,...,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159,0.0
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,828.0,830.3,798.4,875.8,932.0,...,552.0,814.4,888.2,982.4,865.4,806.2,939.1,952.8,906.6,0.0


In [35]:
# dropping the column "1960" & "2021" because they have zeroes
df_yield = df_yield.drop(columns=["1960","2021"])
df_yield.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1961,1962,1963,1964,1965,1966,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,997.345265,1026.535397,1019.769151,944.857573,935.63402,955.604131,...,1720.800433,1651.989944,1536.668389,1644.104399,1637.055779,1511.575987,1785.798699,1730.276067,1745.66776,1830.00208
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,1115.1,1079.0,985.8,1082.8,1098.9,1012.3,...,1659.9,2029.6,2048.5,2017.5,2132.2,1980.4,2023.2,2162.6,2113.4,1979.9
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,675.354815,702.244456,698.032526,691.134187,658.343442,643.872508,...,1063.10061,1189.862001,1113.771391,1193.819795,1238.347637,1234.519705,1202.694484,1247.204466,1280.327082,1295.738159
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,828.0,830.3,798.4,875.8,932.0,824.3,...,662.4,552.0,814.4,888.2,982.4,865.4,806.2,939.1,952.8,906.6


In [36]:
# Using Melt to arrange the data set in a chronological order
# to be able to add more columns from new data sets and retain meaning
df_yield_melt=df_yield.melt(id_vars=["Country Name","Country Code","Indicator Name","Indicator Code"],
                    var_name="Date",
                    value_name="CerealYield(Kg per ha)")

In [37]:
# dropping the column "Indicator Name" & "Indicator Code"
# because their description has been added to the "CerealYield(Kg per ha)" column
# the description of the data set information(mark down) has been giving before loading the data hence
df_yieldkg = df_yield_melt.drop(columns=["Indicator Name","Indicator Code"])
df_yieldkg.head()

Unnamed: 0,Country Name,Country Code,Date,CerealYield(Kg per ha)
0,Aruba,ABW,1961,0.0
1,Africa Eastern and Southern,AFE,1961,997.345265
2,Afghanistan,AFG,1961,1115.1
3,Africa Western and Central,AFW,1961,675.354815
4,Angola,AGO,1961,828.0


In [38]:
# Changing the year column from an object into a string for analysis
df_yieldkg["Date"] = pd.to_numeric(df_yieldkg["Date"])
df_yieldkg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15960 entries, 0 to 15959
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country Name            15960 non-null  object 
 1   Country Code            15960 non-null  object 
 2   Date                    15960 non-null  int64  
 3   CerealYield(Kg per ha)  15960 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 498.9+ KB


#### DATA SET 3: Land Under Cereal Production
* Load the Data set
* Data Cleaning

In [39]:
# loading data set on land under cereal production
df_land = pd.read_csv("API_AG.LND.CREL.HA_DS2_en_csv_v2_4772643.csv")

In [40]:
# Displaying the head of data set 3
df_land.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Land under cereal production (hectares),AG.LND.CREL.HA,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Land under cereal production (hectares),AG.LND.CREL.HA,,17104771.0,17217085.0,17742374.0,17711146.0,18189682.0,...,46312728.0,52338380.0,54834082.0,48393034.0,54429019.0,52696861.0,54702114.0,51671232.0,53527497.0,
2,Afghanistan,AFG,Land under cereal production (hectares),AG.LND.CREL.HA,,3313500.0,3425500.0,3426500.0,3446500.0,3444500.0,...,3143000.0,3182922.0,3344733.0,2724070.0,2793694.0,2419097.0,1911658.0,2641911.0,3043589.0,
3,Africa Western and Central,AFW,Land under cereal production (hectares),AG.LND.CREL.HA,,22506121.0,23139749.0,23521897.0,23682246.0,24774354.0,...,50537475.0,51743944.0,52599401.0,53398425.0,57557405.0,58661008.0,61064789.0,60778932.0,60554797.0,
4,Angola,AGO,Land under cereal production (hectares),AG.LND.CREL.HA,,657000.0,654000.0,645000.0,644000.0,603000.0,...,923085.0,2056204.0,2052732.0,2055726.0,2730899.0,3096317.0,3063255.0,3064970.0,2678045.0,


In [41]:
# Function to check for missing values and replace them with zeros
def fill_missing_values(df_land):
    # Check for missing values
    missing = df_land.isnull().sum()
    
    # Replace missing values with 0
    df_land = df_land.fillna(0)
      
    return df_land

In [42]:
# Fill missing values with 0
df_land = fill_missing_values(df_land)
df_land.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,17104771.0,17217085.0,17742374.0,17711146.0,18189682.0,...,46312728.0,52338380.0,54834082.0,48393034.0,54429019.0,52696861.0,54702114.0,51671232.0,53527497.0,0.0
2,Afghanistan,AFG,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,3313500.0,3425500.0,3426500.0,3446500.0,3444500.0,...,3143000.0,3182922.0,3344733.0,2724070.0,2793694.0,2419097.0,1911658.0,2641911.0,3043589.0,0.0
3,Africa Western and Central,AFW,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,22506121.0,23139749.0,23521897.0,23682246.0,24774354.0,...,50537475.0,51743944.0,52599401.0,53398425.0,57557405.0,58661008.0,61064789.0,60778932.0,60554797.0,0.0
4,Angola,AGO,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,657000.0,654000.0,645000.0,644000.0,603000.0,...,923085.0,2056204.0,2052732.0,2055726.0,2730899.0,3096317.0,3063255.0,3064970.0,2678045.0,0.0


In [43]:
# dropping the column "1960" & "2021" to balance the years with the data set 1 and data set 2 when merging
df_land = df_land.drop(columns=["1960","2021"])
df_land.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1961,1962,1963,1964,1965,1966,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Land under cereal production (hectares),AG.LND.CREL.HA,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Africa Eastern and Southern,AFE,Land under cereal production (hectares),AG.LND.CREL.HA,17104771.0,17217085.0,17742374.0,17711146.0,18189682.0,18626794.0,...,40606548.0,46312728.0,52338380.0,54834082.0,48393034.0,54429019.0,52696861.0,54702114.0,51671232.0,53527497.0
2,Afghanistan,AFG,Land under cereal production (hectares),AG.LND.CREL.HA,3313500.0,3425500.0,3426500.0,3446500.0,3444500.0,3446500.0,...,2820000.0,3143000.0,3182922.0,3344733.0,2724070.0,2793694.0,2419097.0,1911658.0,2641911.0,3043589.0
3,Africa Western and Central,AFW,Land under cereal production (hectares),AG.LND.CREL.HA,22506121.0,23139749.0,23521897.0,23682246.0,24774354.0,22626585.0,...,48745766.0,50537475.0,51743944.0,52599401.0,53398425.0,57557405.0,58661008.0,61064789.0,60778932.0,60554797.0
4,Angola,AGO,Land under cereal production (hectares),AG.LND.CREL.HA,657000.0,654000.0,645000.0,644000.0,603000.0,609000.0,...,2132989.0,923085.0,2056204.0,2052732.0,2055726.0,2730899.0,3096317.0,3063255.0,3064970.0,2678045.0


In [44]:
# Using Melt to arrange the data set in a chronological order
# to be able to add more columns from new data sets and retain meaning
df_land_melt=df_land.melt(id_vars=["Country Name","Country Code","Indicator Name","Indicator Code"],
                    var_name="Date",
                    value_name="LandUnderCereal(ha)")

In [45]:
# dropping the column "Indicator Name" & "Indicator Code"
# because their description has been added to the "LandUnderCereal(ha)" column
# the description of the data set information(mark down) has been giving before loading the data hence
df_landha = df_land_melt.drop(columns=["Indicator Name","Indicator Code"])
df_landha.head()

Unnamed: 0,Country Name,Country Code,Date,LandUnderCereal(ha)
0,Aruba,ABW,1961,0.0
1,Africa Eastern and Southern,AFE,1961,17104771.0
2,Afghanistan,AFG,1961,3313500.0
3,Africa Western and Central,AFW,1961,22506121.0
4,Angola,AGO,1961,657000.0


In [46]:
# Changing the year column from an object into a string for analysis
df_landha["Date"] = pd.to_numeric(df_landha["Date"])
df_landha.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15960 entries, 0 to 15959
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country Name         15960 non-null  object 
 1   Country Code         15960 non-null  object 
 2   Date                 15960 non-null  int64  
 3   LandUnderCereal(ha)  15960 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 498.9+ KB


### Merging Data Set 1, Data Set 2 & Data Set 3 to one Data set(World Data on Arable Production of Cereals)

In [47]:
# Adding the columns "CerealYield(Kg per ha)","LandUnderCereal(ha)" from df_yieldkg & df_landha to df_prodmt
df_prodmt["CerealYield(Kg per ha)"] = df_yieldkg["CerealYield(Kg per ha)"]
df_prodmt["LandUnderCereal(ha)"] = df_landha["LandUnderCereal(ha)"]

# Saving the resulting data set to a new CSV file
df_prodmt.to_csv("arable-production-of-cereal-crops", index=False)


In [48]:
df_arable = df_prodmt
df_arable

Unnamed: 0,Country Name,Country Code,Date,CerealProduction(metric tons),CerealYield(Kg per ha),LandUnderCereal(ha)
0,Aruba,ABW,1961,0.0,0.000000,0.0
1,Africa Eastern and Southern,AFE,1961,17059517.0,997.345265,17104771.0
2,Afghanistan,AFG,1961,3695000.0,1115.100000,3313500.0
3,Africa Western and Central,AFW,1961,15199359.0,675.354815,22506121.0
4,Angola,AGO,1961,544000.0,828.000000,657000.0
...,...,...,...,...,...,...
15955,Kosovo,XKX,2020,0.0,0.000000,0.0
15956,"Yemen, Rep.",YEM,2020,447496.0,833.900000,536626.0
15957,South Africa,ZAF,2020,18237226.0,5407.200000,3372793.0
15958,Zambia,ZMB,2020,3685484.0,2479.600000,1486294.0
