In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


Load the data

In [2]:
data = pd.read_csv("/content/drive/MyDrive/Dataset/FAOSTAT_data_en_5-14-2023.csv")
data.head()

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QCL,Crops and livestock products,4,Afghanistan,5510,Production,113,Rice,2017,2017,tonnes,338420.0,A,Official figure
1,QCL,Crops and livestock products,4,Afghanistan,5510,Production,113,Rice,2018,2018,tonnes,352177.0,A,Official figure
2,QCL,Crops and livestock products,4,Afghanistan,5510,Production,113,Rice,2019,2019,tonnes,382500.0,A,Official figure
3,QCL,Crops and livestock products,4,Afghanistan,5510,Production,113,Rice,2020,2020,tonnes,439549.0,A,Official figure
4,QCL,Crops and livestock products,4,Afghanistan,5510,Production,113,Rice,2021,2021,tonnes,458571.64,I,Imputed value


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702 entries, 0 to 701
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Domain Code       702 non-null    object 
 1   Domain            702 non-null    object 
 2   Area Code (M49)   702 non-null    int64  
 3   Area              702 non-null    object 
 4   Element Code      702 non-null    int64  
 5   Element           702 non-null    object 
 6   Item Code (CPC)   702 non-null    int64  
 7   Item              702 non-null    object 
 8   Year Code         702 non-null    int64  
 9   Year              702 non-null    int64  
 10  Unit              702 non-null    object 
 11  Value             702 non-null    float64
 12  Flag              702 non-null    object 
 13  Flag Description  702 non-null    object 
dtypes: float64(1), int64(5), object(8)
memory usage: 76.9+ KB


In [3]:
## Let's check out the data using the describe method

data.describe()

Unnamed: 0,Area Code (M49),Element Code,Item Code (CPC),Year Code,Year,Value
count,702.0,702.0,702.0,702.0,702.0,702.0
mean,427.47151,5510.0,113.0,2019.051282,2019.051282,6965645.0
std,253.4512,0.0,0.0,1.396017,1.396017,30145580.0
min,4.0,5510.0,113.0,2017.0,2017.0,0.0
25%,196.0,5510.0,113.0,2018.0,2018.0,2690.59
50%,413.5,5510.0,113.0,2019.0,2019.0,160292.5
75%,643.0,5510.0,113.0,2020.0,2020.0,1243050.0
max,894.0,5510.0,113.0,2021.0,2021.0,214429900.0


In [8]:
for col in data.columns:
    print(f"Column: {col}\nNumber of Unique elements: {data[col].nunique()}")
    print(f"Unique elements: {data[col].unique()}\n" + "--" * 80)

Column: Domain Code
Number of Unique elements: 1
Unique elements: ['QCL']
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Column: Domain
Number of Unique elements: 1
Unique elements: ['Crops and livestock products']
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Column: Area Code (M49)
Number of Unique elements: 144
Unique elements: [  4   8  12  24  32  51  36  31  50  56  84 204  64  68  76  96 100 854
 108 116 120 140 148 152 159 344 156 158 170 174 178 188 384 191 192 196
 203 408 180 208 214 218 818 222 233 748 231 242 246 250 266 270 276 288
 300 320 324 624 328 332 340 348 356 360 364 368 372 380 388 392 398 404
 417 418 428 430 440 442 450 454 458 466 470 478 480 484 583 504 508 104
 524 528 554 558 562 566 807 586 591 598 600 604 608 616 620 630 410 498


Let's first drop a few column and change a few column names

In [11]:
data.columns

Index(['Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code (CPC)', 'Item', 'Year Code', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description'],
      dtype='object')

In [13]:
data.drop(['Domain Code', 'Domain', 'Element Code', 'Item Code (CPC)', 'Item', 'Year Code', 'Unit'], axis=1, inplace=True)

In [14]:
print(data.columns)

Index(['Area Code (M49)', 'Area', 'Element', 'Year', 'Value', 'Flag',
       'Flag Description'],
      dtype='object')


In [15]:
data.head(1)

Unnamed: 0,Area Code (M49),Area,Element,Year,Value,Flag,Flag Description
0,4,Afghanistan,Production,2017,338420.0,A,Official figure


In [19]:
data.rename(columns={'Value': 'Value (tonnes)'}, inplace=True)

In [20]:
data.head()

Unnamed: 0,Area Code (M49),Area,Element,Year,Value (tonnes),Flag,Flag Description
0,4,Afghanistan,Production,2017,338420.0,A,Official figure
1,4,Afghanistan,Production,2018,352177.0,A,Official figure
2,4,Afghanistan,Production,2019,382500.0,A,Official figure
3,4,Afghanistan,Production,2020,439549.0,A,Official figure
4,4,Afghanistan,Production,2021,458571.64,I,Imputed value


In [22]:
data[data['Value (tonnes)'] == data['Value (tonnes)'].max()]

Unnamed: 0,Area Code (M49),Area,Element,Year,Value (tonnes),Flag,Flag Description
119,159,China,Production,2017,214429900.0,E,Estimated value


What are the top 5 countries that produces rice in terms of quantity

In [28]:
data.loc[data['Year'] == 2021].sort_values(by='Value (tonnes)', ascending=False).head(10)

Unnamed: 0,Area Code (M49),Area,Element,Year,Value (tonnes),Flag,Flag Description
123,159,China,Production,2021,214403900.0,E,Estimated value
133,156,"China, mainland",Production,2021,212843000.0,A,Official figure
306,356,India,Production,2021,195425000.0,A,Official figure
44,50,Bangladesh,Production,2021,56944550.0,I,Imputed value
311,360,Indonesia,Production,2021,54415290.0,A,Official figure
691,704,Viet Nam,Production,2021,43852730.0,A,Official figure
626,764,Thailand,Production,2021,33582000.0,A,Official figure
436,104,Myanmar,Production,2021,24910000.0,T,Unofficial figure
500,608,Philippines,Production,2021,19960170.0,A,Official figure
475,586,Pakistan,Production,2021,13984010.0,A,Official figure
