### Task 1

#### Visualisation and Analytics of Geospatial Data

In [1]:
#importing the necessary python libraries

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
#reading the cereal dataset into cereal dataframe

cereal = pd.read_csv('API_AG.YLD.CREL.KG_DS2_en_csv_v2_5734359.csv', skiprows= 4)
cereal.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,924.111737,931.506715,934.015106,879.750864,869.584522,...,1636.00973,1616.362162,1490.807738,1764.116707,1728.295922,1717.894885,1838.762607,1840.899744,,
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,1115.1,1079.0,985.8,1082.8,1098.9,...,2017.5,2132.2,1980.4,2022.5,2162.0,2113.4,1979.9,2154.7,,
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,675.354816,702.244456,698.032526,691.134187,658.343442,...,1226.442299,1268.162021,1314.142638,1287.72092,1334.53572,1343.46279,1381.643141,1341.959411,,
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,828.0,830.3,798.4,875.8,932.0,...,888.2,982.4,865.4,806.2,941.4,958.8,992.5,1000.3,,


##### Data cleaning

In [4]:
# getting the columns from 1960 to 2022 in a list

columns = cereal.columns[4:-1]

In [5]:
columns

Index(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
      dtype='object')

In [6]:
# fill the nan with the previous column value for columns 1960 to 2022. This is done to reduce large variations in the yield data

for i in columns:
    cereal[i].fillna(method='ffill', inplace=True)

cereal.head()

  cereal[i].fillna(method='ffill', inplace=True)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,924.111737,931.506715,934.015106,879.750864,869.584522,...,1636.00973,1616.362162,1490.807738,1764.116707,1728.295922,1717.894885,1838.762607,1840.899744,,
2,Afghanistan,AFG,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,1115.1,1079.0,985.8,1082.8,1098.9,...,2017.5,2132.2,1980.4,2022.5,2162.0,2113.4,1979.9,2154.7,,
3,Africa Western and Central,AFW,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,675.354816,702.244456,698.032526,691.134187,658.343442,...,1226.442299,1268.162021,1314.142638,1287.72092,1334.53572,1343.46279,1381.643141,1341.959411,,
4,Angola,AGO,Cereal yield (kg per hectare),AG.YLD.CREL.KG,,828.0,830.3,798.4,875.8,932.0,...,888.2,982.4,865.4,806.2,941.4,958.8,992.5,1000.3,,


In [7]:
# checking null values

cereal[columns].isna().sum()

Unnamed: 0,0
1960,266
1961,1
1962,1
1963,1
1964,1
...,...
2018,1
2019,1
2020,1
2021,1


In [8]:
# finding the columns with full nan value (total rows in cereal is 266)

a=cereal[columns].isna().sum()
print(a[a==266].index)

Index(['1960', '2022'], dtype='object')


In [9]:
# drop columns not useful

cereal.drop(['1960','2022','Indicator Name', 'Indicator Code','Unnamed: 67'],axis=1,inplace=True)

In [10]:
cereal.columns

Index(['Country Name', 'Country Code', '1961', '1962', '1963', '1964', '1965',
       '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974',
       '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
       '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021'],
      dtype='object')

In [11]:
# remove 1960 and 2022 from the columns list

columns= columns[1:-1]

In [12]:
# finding the country with no cereal yield value for all years

cereal[cereal.isna().any(axis=1)]['Country Name']

Unnamed: 0,Country Name
0,Aruba


In [13]:
#removing the country with no cereal yield value

cereal =  cereal[cereal['Country Name'] != 'Aruba']

In [14]:
# checking for null values in the dataframe

cereal[columns].isna().sum()

Unnamed: 0,0
1961,0
1962,0
1963,0
1964,0
1965,0
...,...
2017,0
2018,0
2019,0
2020,0


In [15]:
#importing the geopandas dataset 'naturalearth_lowres' for geometry

loc = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

  loc = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [16]:
loc.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,889953.0,Oceania,Fiji,FJI,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,58005463.0,Africa,Tanzania,TZA,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,37589262.0,North America,Canada,CAN,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,328239523.0,North America,United States of America,USA,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [17]:
# rename Country Code column to iso_a3 to merge cereal and loc dataframe to create a geopandas dataframe

cereal.rename(columns={'Country Code':'iso_a3'},inplace=True)

In [18]:
# drop columns not used in loc

loc.drop(['pop_est', 'continent', 'name','gdp_md_est'], axis=1, inplace=True)

In [19]:
# merge cereal and loc dataframe

cereal_geo = pd.merge(loc,cereal, on='iso_a3')

In [20]:
# Create choropleth map for 2019

fig = px.choropleth(cereal_geo,
                          geojson=cereal_geo['Country Name'],
                          locations = cereal_geo['iso_a3'],
                          color='2019',
                          hover_name='Country Name',
                          custom_data=[cereal_geo['Country Name'],cereal_geo['2019']],
                          title='Cereal Yield (kg/ha) - 2019',
                          color_continuous_scale='sunset')

fig.update_geos(fitbounds="locations", visible=False)

fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Cereal yield: %{customdata[1]:.2f}<extra></extra>')

fig.show()

In [21]:
# Create choropleth map for 2020
fig = px.choropleth(cereal_geo,
                          geojson=cereal_geo['Country Name'],
                          locations = cereal_geo['iso_a3'],
                          color='2020',
                          hover_name='Country Name',
                          custom_data=[cereal_geo['Country Name'],cereal_geo['2020']],
                          title='Cereal Yield (kg/ha) - 2020',
                          color_continuous_scale='sunset')

fig.update_geos(fitbounds="locations", visible=False)

fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Cereal yield: %{customdata[1]:.2f}<extra></extra>')

fig.show()

According to data on grain yields for 2019 and 2020, Oman and the United Arab Emirates are major producers of cereals. Their yields of cereals have changed, meanwhile, with Oman reporting an increase of 3000 kg/ha and the United Arab Emirates suffering a decrease of about 2000 kg/ha. This change is a reflection of how dynamic agricultural output is and is subject to the effect of a number of variables, including climate, farming methods, and policy.

The data also reveals geographical differences in cereal yield, with Greenland and the African continent reporting the lowest cereal yields. Geospatial analysts may use these insights to better understand how cereal production patterns are changing and the causes behind these changes. This information can help them make decisions about trade, agricultural development, and food security.

### Task 1.2

In [24]:
#reading the population dataset into pop dataframe

pop = pd.read_csv('API_SP.POP.TOTL_DS2_en_csv_v2_4485025.csv', skiprows=4)
pop.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,...,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0,107195.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,...,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0,694665117.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,...,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0,39835428.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,...,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0,470898870.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,...,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0,33933611.0,


#### Data Cleaning

In [25]:
# getting the columns from 1960 to 2022 in a list and checking for null values

col = pop.columns[4:-1]
pop[col].isna().sum()

Unnamed: 0,0
1960,2
1961,2
1962,2
1963,2
1964,2
...,...
2017,2
2018,2
2019,2
2020,2


In [26]:
# fill the nan with the previous column value for columns 1960 to 2021. This is done to reduce large variations in the population data

for i in col:
    pop[i].fillna(method='ffill', inplace=True)

pop.head()


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,...,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0,107195.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,...,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0,694665117.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,...,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0,39835428.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,...,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0,470898870.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,...,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0,33933611.0,


In [27]:
# checking for null values

pop[col].isna().sum()

Unnamed: 0,0
1960,0
1961,0
1962,0
1963,0
1964,0
...,...
2017,0
2018,0
2019,0
2020,0


In [28]:
# remove columns that are not useful

pop.drop(['Indicator Name', 'Indicator Code','Unnamed: 66'],axis=1,inplace=True)

In [29]:
# rename Country Code column to iso_a3 for using the pop dataset with cereal dataset for analysis

pop.rename(columns={'Country Code':'iso_a3'},inplace=True)

In [30]:
# change the Country name in pop to that in the cereal using iso_a3, this is done to remove misinterpretations during analysis

iso_a3_to_country = dict(zip(cereal['iso_a3'], cereal['Country Name']))
pop['Country Name'] = pop['iso_a3'].map(iso_a3_to_country).combine_first(pop['Country Name'])

In [31]:
# changing the crs to EPSG 3857 for plotting maps

cereal_geo = cereal_geo.to_crs('EPSG: 3857')

### Task 1.2.1:

For the year 2021, generate choropleth maps of cereal yield for only the countries having a population less than or equal to 67326569.

In [32]:
# identifying the countries in cereal with popluation <= 67326569 in 2021 using iso_a3

a = cereal_geo[cereal_geo['iso_a3'].isin(pop.loc[pop['2021']<= 67326569,'iso_a3'])]

In [53]:
# Create choropleth map to plot the identified countries in dataframe a

fig = px.choropleth(a,
                    geojson=a['Country Name'],
                    locations=a['iso_a3'],
                    color='2021',
                    hover_name='Country Name',
                    custom_data=[a['Country Name'],a['2021']],
                    title='Cereal Yield in 2021 for Countries with Population <= 60000000',
                    color_continuous_scale='ylorbr',
                    projection= 'natural earth')

fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Cereal yield: %{customdata[1]:.2f}<extra></extra>')

fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white", visible=False)

fig.show()

For countries having a population less than or equal to 67326569 in 2021, the plot illustrates that United Arab Emirates (26226.2 kg/ha), Oman (16461.4 kg/ha) have exceptionally high cereal yields, suggesting advanced agricultural practices, the presence of favorable climate conditions for certain crops.

Niger (349.6 kg/ha): Niger has the lowest cereal yield, likely due to challenges such as arid climate conditions, limited access to modern farming techniques, and socio-economic factors affecting agricultural productivity.

The high cereal yield in the UAE and Oman is a result of a combination of strategic investments, favorable environmental conditions, technological advancements, and government policies that prioritize and support the agricultural sector.

### Task 1.2.2:

For the year 2021, generate choropleth maps of cereal yield for only the countries having a population greater than or equal to 331893745.


In [34]:
# identifying the countries in cereal with popluation >= 331893745 in 2021 using iso_a3

b = cereal_geo[cereal_geo['iso_a3'].isin(pop.loc[pop['2021']>= 331893745,'iso_a3'])]

In [52]:
# Create choropleth map to plot the identified countries in dataframe b

fig = px.choropleth(b,
                    geojson=b['Country Name'],
                    locations=b['iso_a3'],
                    color='2021',
                    hover_name='Country Name',
                    custom_data=[b['Country Name'],b['2021']],
                    title='Cereal Yield in 2021 for Countries with Population <= 300000000',
                    color_continuous_scale='ylorbr',
                    projection= 'natural earth')

fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Cereal yield: %{customdata[1]:.2f}<extra></extra>')

fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white", visible=False)

# Show the interactive map
fig.show()

For countries having a population greater than or equal to 331893745 in 2021, the plot illustrates:

the United States exhibits a high cereal yield, reflecting its advanced agricultural sector.

India has a moderate cereal yield, influenced by a combination of factors. The country has a diverse agricultural landscape, with variations in climate, soil types, and farming practices.

Factors such as population pressure, small landholdings, and varying levels of technological adoption contribute to the moderate yield.

China demonstrates a high cereal yield, reflecting its commitment to modernizing agriculture and increasing productivity.

The Chinese government has implemented policies and invested in technology, infrastructure, and research to enhance agricultural efficiency.

Eritrea has a lower cereal yield compared to the other countries. This may be attributed to challenges such as arid or semi-arid conditions, limited access to modern agricultural practices, and potential socio-economic factors impacting agricultural productivity.

### Task 1.2.3:

For the year 2021, generate choropleth maps of cereal yield for only the countries having a population between 10269022 and 1393409034.

In [54]:
# identifying the countries in cereal with popluation between 10269022 and 1393409034 in 2021 using iso_a3

c = cereal_geo[cereal_geo['iso_a3'].isin(pop.loc[pop['2021'].between(10000000,1000000000),'iso_a3'])]

In [56]:
# Create choropleth map to plot the identified countries in dataframe c

fig = px.choropleth(c,
                    geojson=c['Country Name'],
                    locations=c['iso_a3'],
                    color='2021',
                    hover_name='Country Name',
                    custom_data=[c['Country Name'],c['2021']],
                    title='Cereal Yield in 2021 for Countries with Population between 10000000 and 1000000000',
                    color_continuous_scale='ylorbr',
                    projection= 'natural earth')

fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Cereal yield: %{customdata[1]:.2f}<extra></extra>')

fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white", visible=False)

# Show the interactive map
fig.show()

For countries having a population between 10269022 and 1393409034 in 2021, the plot shows that there is significant variability in cereal yield, ranging from a few hundred to several thousand kg/ha.

Some countries, such as the United States, China, and several European nations, have relatively high cereal yields (e.g., above 5,000 kg/ha).

Other countries, particularly in Africa and parts of Asia, exhibit lower cereal yields (e.g., below 2,000 kg/ha).

Countries with higher cereal yields often invest heavily in agricultural research and technology. The United States, Netherlands, and France, for example, have advanced agricultural practices, modern machinery, and access to cutting-edge technologies that contribute to increased productivity.

In some regions, traditional subsistence farming practices may prevail, focusing on meeting immediate food needs rather than maximizing yields.

Niger is located in the Sahel region, which is susceptible to climate variability and extreme weather events. Erratic rainfall patterns and prolonged droughts can negatively impact crop production.

### Task 1.2.4

Plot (scatter or line plot) the percentage change in cereal yield from 2011 to 2021, for the country having the highest population in 2021. In this question, you must consider the cereal yield for each year between 2011 and 2021.

In [38]:
# identifying the country with highest populations from 2011 to 2021

high_pop = pop[pop['Country Name'].isin(cereal_geo['Country Name'])].nlargest(1, '2021')['Country Name'].iloc[0]

In [39]:
high_pop

'China'

In [40]:
# creating a dataframe for the identified highest populated country from 2010 to 2021. 2010 is considered so that percentage change for 2011 from 2010 can be used.

cereal1 = cereal[cereal['Country Name'] == high_pop]
cereal1 = cereal1.loc[:, '2010':'2021']

In [41]:
# transposing the cereal dataframe to use it for line plot

cereal1 = cereal1.transpose()
cereal1.columns=['Cereal_yield']
cereal1

Unnamed: 0,Cereal_yield
2010,5526.1
2011,5709.4
2012,5827.1
2013,5894.1
2014,5893.2
2015,5985.7
2016,6017.6
2017,6111.3
2018,6125.4
2019,6265.9


In [42]:
# calculating the percentage change and storing it in a column 'change' in cereal1.

cereal1['change'] = cereal1.pct_change()
cereal1

Unnamed: 0,Cereal_yield,change
2010,5526.1,
2011,5709.4,0.03317
2012,5827.1,0.020615
2013,5894.1,0.011498
2014,5893.2,-0.000153
2015,5985.7,0.015696
2016,6017.6,0.005329
2017,6111.3,0.015571
2018,6125.4,0.002307
2019,6265.9,0.022937


In [43]:
# remove the year 2010 from the cereal1 dataframe

cereal1 = cereal1.drop('2010', axis=0)
cereal1

Unnamed: 0,Cereal_yield,change
2011,5709.4,0.03317
2012,5827.1,0.020615
2013,5894.1,0.011498
2014,5893.2,-0.000153
2015,5985.7,0.015696
2016,6017.6,0.005329
2017,6111.3,0.015571
2018,6125.4,0.002307
2019,6265.9,0.022937
2020,6314.2,0.007708


In [44]:
# plotting the line graph using the cereal1 dataframe for the percentage change in cereal yield from 2011 to 2021

fig = px.line(cereal1, x=cereal1.index, y='change',
              markers=True,
              color_discrete_sequence=['darkseagreen'],
              labels={'change': 'Percentage Change'},
              custom_data=['Cereal_yield', 'change'],
              title="Percentage Change in Cereal Yield (2011-2021)")

fig.update_layout(xaxis_title='Year', yaxis_title='Percentage Change',
                  title_x=0.5, hovermode='x unified')

fig.update_traces(hovertemplate='<b>Year</b>: %{x}<br>'
                                '<b>Cereal Yield</b>: %{customdata[0]:,.2f}<br>'
                                '<b>Change</b>: %{customdata[1]:,.6f}',
                 line=dict(width=2.5),
                 marker=dict(size=8,color='green'))

fig.show()

The cereal yield has generally shown a positive trend over the years, with an increasing percentage change.

In 2011, the cereal yield was 5709.4 kg/ha, and it experienced a steady upward trend over the subsequent years.

The highest percentage change occurred in 2019, with a substantial increase of 2.29%.

Despite a slight dip in 2014, the overall trend remained positive, indicating a consistent improvement in cereal yield.

In 2021, the cereal yield reached 6320.8 kg/ha, reflecting a continuous positive trajectory.

The country has made significant progress in improving cereal yields over the analyzed period, with a focus on agricultural productivity and potentially adopting more efficient farming practices or technologies.

The positive percentage changes signify a positive trend in cereal production, contributing to food security and agricultural sustainability.

### Task 1.2.5:

Present a scatter plot between the mean population of each country and the mean cereal yield from the year 2011 until 2021.

In [45]:
# creating cereal2 dataframe whose countries are available in loc dataframe and inserting a column 'mean' calculating mean ceral yield of countries from 2011 to 2021.

cereal2 = cereal[cereal['iso_a3'].isin(loc['iso_a3'])]
cereal2 = cereal2.set_index(['Country Name'])
cereal2 = cereal2.iloc[:,51:]

cereal2['cereal_mean'] = cereal2.mean(axis=1)
cereal2 = cereal2.iloc[:,11:]
cereal2

Unnamed: 0_level_0,cereal_mean
Country Name,Unnamed: 1_level_1
Afghanistan,2027.327273
Angola,860.363636
Albania,4919.336364
United Arab Emirates,25778.954545
Argentina,4938.290909
...,...
Vanuatu,604.836364
"Yemen, Rep.",857.145455
South Africa,4409.563636
Zambia,2573.190909


In [46]:
# checking for null values in cereal2

cereal2.isna().sum()

Unnamed: 0,0
cereal_mean,0


In [47]:
# creating pop1 dataframe whose countries are available in loc dataframe and inserting a column 'mean' calculating mean population of countries from 2011 to 2021.

pop1 = pop[pop['iso_a3'].isin(loc['iso_a3'])]
pop1 = pop1.set_index(['Country Name'])
pop1 = pop1.iloc[:,52:]

pop1['pop_mean'] = pop1.mean(axis=1)
pop1 = pop1.iloc[:,11:]
pop1

Unnamed: 0_level_0,pop_mean
Country Name,Unnamed: 1_level_1
Afghanistan,3.518085e+07
Angola,2.893316e+07
Albania,2.871830e+06
United Arab Emirates,9.444956e+06
Argentina,4.356846e+07
...,...
Vanuatu,2.783465e+05
"Yemen, Rep.",2.715723e+07
South Africa,5.612482e+07
Zambia,1.640259e+07


In [48]:
# checking for null values in pop1

pop1.isna().sum()

Unnamed: 0,0
pop_mean,0


In [49]:
# creating a dataframe with country name, cereal mean and population mean by merging cereal2 and pop1

mean_df = cereal2.join(pop1)
mean_df.reset_index(inplace=True)
mean_df

Unnamed: 0,Country Name,cereal_mean,pop_mean
0,Afghanistan,2027.327273,3.518085e+07
1,Angola,860.363636,2.893316e+07
2,Albania,4919.336364,2.871830e+06
3,United Arab Emirates,25778.954545,9.444956e+06
4,Argentina,4938.290909,4.356846e+07
...,...,...,...
164,Vanuatu,604.836364,2.783465e+05
165,"Yemen, Rep.",857.145455,2.715723e+07
166,South Africa,4409.563636,5.612482e+07
167,Zambia,2573.190909,1.640259e+07


In [50]:
# checking for null values in merged dataframe

mean_df.isna().sum()

Unnamed: 0,0
Country Name,0
cereal_mean,0
pop_mean,0


In [51]:
# plotting the mean population and mean cereal yield of countries from 2011 to 2021

correlation = mean_df[['cereal_mean', 'pop_mean']].corr().iloc[0, 1]

fig = px.scatter(mean_df, x='cereal_mean', y='pop_mean', color='Country Name', custom_data=['Country Name', 'cereal_mean', 'pop_mean'])

fig.update_traces(hovertemplate='Country: %{customdata[0]}<br>Cereal yield Mean: %{customdata[1]:.2f}<br>Population Mean: %{customdata[2]:,.0f}')
fig.update_layout(title="Population Vs Cereal yield mean of countries from 2011 to 2021",title_x=0.5, annotations=[{'x': 0.5,'y': 1.05,'showarrow': False,'text': f'Correlation: {correlation:.2f}','xref': 'paper','yref': 'paper'}])

fig.show()

The above scatter plot of Population Vs Cereal yield mean of countries from 2011 to 2021 illustrates:

both India and China, with populations exceeding 1 billion, exhibit mean cereal yields below 10,000 kg/ha.

This suggests that despite their large populations, the average productivity in cereal cultivation (yield per hectare) is relatively low.

Larger-population countries might have more pressure on arable land and resources, leading to suboptimal agricultural practices and lower yields.

The United Arab Emirates (UAE), with a population of less than 50 million, stands out with the highest mean cereal yield, surpassing 25,000 kg/ha.

This indicates that the UAE has a more efficient and productive cereal cultivation system, leading to higher yields despite a smaller population.

Smaller-population countries may have more flexibility and resources to implement advanced agricultural technologies and practices, resulting in higher yields.

A general trend is observed where the majority of countries with populations below 50 million tend to have mean cereal yields below 7,000 kg/ha.

This suggests that, on average, smaller populations are associated with lower cereal yields.

While there is a slight tendency for countries with higher populations to have slightly higher mean cereal yields, the correlation(0.06) is weak, suggesting that population size alone is not a strong predictor of cereal yield.

Overall, the data on cereal yield and population can provide valuable insights for policymakers, researchers, and organizations working in agriculture and food security. Understanding the factors influencing these trends can lead to more informed decision-making and sustainable practices in the agricultural sector.
