In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/world-population-data/world_population_data.csv
/kaggle/input/world-data-population/world_population_data.csv


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px
from datetime import datetime as dt

# 1. Data

In [3]:
# Load the world-population table from the Kaggle input directory, then show
# its (rows, columns) shape as the cell output.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/world-population-data/world_population_data.csv',
)
df.shape


(234, 17)

In [4]:
# Print a per-column summary of the loaded frame: column name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rank              234 non-null    int64  
 1   cca3              234 non-null    object 
 2   country           234 non-null    object 
 3   continent         234 non-null    object 
 4   2023 population   234 non-null    int64  
 5   2022 population   234 non-null    int64  
 6   2020 population   234 non-null    int64  
 7   2015 population   234 non-null    int64  
 8   2010 population   234 non-null    int64  
 9   2000 population   234 non-null    int64  
 10  1990 population   234 non-null    int64  
 11  1980 population   234 non-null    int64  
 12  1970 population   234 non-null    int64  
 13  area (km²)        234 non-null    float64
 14  density (km²)     234 non-null    int64  
 15  growth rate       234 non-null    object 
 16  world percentage  234 non-null    object 
dt

In [5]:
# Report, for every column, how many distinct values it holds.
separator = '_' * 50
for column_name, n_unique in df.nunique().items():
    print(separator)
    print(column_name)
    print(f'uniques: {n_unique}')


__________________________________________________
rank
uniques: 234
__________________________________________________
cca3
uniques: 234
__________________________________________________
country
uniques: 234
__________________________________________________
continent
uniques: 6
__________________________________________________
2023 population
uniques: 234
__________________________________________________
2022 population
uniques: 234
__________________________________________________
2020 population
uniques: 234
__________________________________________________
2015 population
uniques: 234
__________________________________________________
2010 population
uniques: 234
__________________________________________________
2000 population
uniques: 234
__________________________________________________
1990 population
uniques: 234
__________________________________________________
1980 population
uniques: 234
__________________________________________________
1970 population
uniques: 23

In [6]:
# Preview the first three rows of the loaded table.
df.head(3)

Unnamed: 0,rank,cca3,country,continent,2023 population,2022 population,2020 population,2015 population,2010 population,2000 population,1990 population,1980 population,1970 population,area (km²),density (km²),growth rate,world percentage
0,1,IND,India,Asia,1428627663,1417173173,1396387127,1322866505,1240613620,1059633675,870452165,696828385,557501301,3287590.0,481,0.81%,17.85%
1,2,CHN,China,Asia,1425671352,1425887337,1424929781,1393715448,1348191368,1264099069,1153704252,982372466,822534450,9706961.0,151,-0.02%,17.81%
2,3,USA,United States,North America,339996563,338289857,335942003,324607776,311182845,282398554,248083732,223140018,200328340,9372610.0,37,0.50%,4.25%


# 2. Global Population

In [7]:
# World population in 2023: add up the 2023 figure of every row,
# then print it with thousands separators.
global_pop = df.loc[:, '2023 population'].sum()
print(format(global_pop, ','))


8,043,615,390


In [8]:
# Total 2023 population per continent, one bar each.
continent_totals = (
    df.groupby('continent')['2023 population']
      .sum()
      .reset_index(name='Population')
)
px.bar(
    continent_totals,
    x='continent',
    y='Population',
    title='Population by continent',
    height=400,
    width=800,
)

# 3. Time Series Analysis

In [9]:
# Reshape the wide table (one "<YYYY> population" column per snapshot year) into a
# long table with one row per (country, continent, year).
#
# The year columns are selected by name instead of by position (`df.columns[4:13]`),
# so the reshape cannot silently pick up the wrong columns if the CSV layout changes.
cols = df.filter(regex=r'^\d{4} population$').columns
df2 = (
    # Aggregate first so repeated (country, continent) rows are summed.
    df.groupby(['country', 'continent'], as_index=False)[cols].sum()
      # Wide -> long: the former column name lands in `year`, its value in `population`.
      .melt(id_vars=['country', 'continent'], var_name='year', value_name='population')
      # "2023 population" -> "2023" -> Timestamp("2023-01-01")
      .assign(year=lambda long: pd.to_datetime(long['year'].str[0:4], format='%Y'))
      # Countries A-Z with the newest snapshot first within each country, and a fresh 0..n-1 index.
      .sort_values(['country', 'continent', 'year'],
                   ascending=[True, True, False],
                   ignore_index=True)
)
df2.head(3)

Unnamed: 0,country,continent,year,population
0,Afghanistan,Asia,2023-01-01,42239854
1,Afghanistan,Asia,2022-01-01,41128771
2,Afghanistan,Asia,2020-01-01,38972230


In [10]:
# One line per continent: total population at each snapshot year.
continent_trend = (
    df2.groupby(['continent', 'year'])['population']
       .sum()
       .reset_index()
)
px.line(
    continent_trend,
    x='year',
    y='population',
    color='continent',
    title='Population growth',
    height=600,
    width=1000,
)

In [11]:
# Draw one figure per continent, with one line per country in that continent.
continents = ['Asia', 'Europe', 'Africa', 'Oceania', 'North America','South America']
for continent in continents:
    # Rows belonging to the current continent only.
    in_continent = df2[df2['continent'] == continent]
    country_trend = (
        in_continent.groupby(['country', 'year'])['population']
                    .sum()
                    .reset_index()
    )
    fig = px.line(
        country_trend,
        x='year',
        y='population',
        color='country',
        title=f'Population growth by {continent} Countries',
        height=800,
        width=1000,
    )
    fig.show()

# 4. World Population Variation

In [12]:
# World population at each snapshot year, and how it changed between snapshots.
df3 = df2.groupby('year')['population'].sum().reset_index()

# Raw change since the previous snapshot (0 for the first one, which has no predecessor).
df3['population variation'] = df3['population'].diff().fillna(0)

# The snapshots are unevenly spaced (1970, 1980, 1990, 2000, 2010, 2015, 2020, 2022, 2023),
# so the raw change mostly reflects the length of each gap. Dividing by the number of
# years elapsed gives a per-year figure that can be compared across the whole series.
df3['annual variation'] = df3['population'].diff() / df3['year'].dt.year.diff()

px.line(df3.dropna(subset=['annual variation']),  # the first snapshot has no rate to plot
        x='year',
        y='annual variation',
        title='population variation (per year)',
        height=600,
        width=900)


In [13]:
# Population per continent and snapshot year; the groupby returns rows sorted by
# continent and then by year, so each continent's rows are in chronological order.
df4 = df2.groupby(['continent','year'])['population'].sum().reset_index()

# Continents to plot; this order also fixes the legend order below.
continents = ['Asia', 'Europe', 'Africa', 'Oceania', 'North America','South America']

# A grouped diff() computes the change within each continent in a single pass,
# instead of filtering, re-aggregating and concatenating one table per continent.
df5 = df4[df4['continent'].isin(continents)].copy()
pop_change = df5.groupby('continent')['population'].diff()
years_elapsed = df5['year'].dt.year.groupby(df5['continent']).diff()

# Raw change since the previous snapshot (0 for each continent's first snapshot).
df5['variation'] = pop_change.fillna(0)
# The snapshots are unevenly spaced (10-, 5-, 2- and 1-year gaps), so normalise the
# change to a per-year figure before plotting.
df5['annual variation'] = pop_change / years_elapsed

fig = px.line(df5.dropna(subset=['annual variation']),  # first snapshot has no rate
              x='year',
              y='annual variation',
              color='continent',
              category_orders={'continent': continents},
              title= 'population variation by continent (per year)',
              height=600,
              width=900)
fig.show()


# 5. Map Visualization

In [14]:
# Keep only the rows of the 2023 snapshot and display them.
df6 = df2[df2['year'] == '2023-01-01']
df6

Unnamed: 0,country,continent,year,population
0,Afghanistan,Asia,2023-01-01,42239854
9,Albania,Europe,2023-01-01,2832439
18,Algeria,Africa,2023-01-01,45606480
27,American Samoa,Oceania,2023-01-01,43914
36,Andorra,Europe,2023-01-01,80088
...,...,...,...,...
2061,Wallis and Futuna,Oceania,2023-01-01,11502
2070,Western Sahara,Africa,2023-01-01,587259
2079,Yemen,Asia,2023-01-01,34449825
2088,Zambia,Africa,2023-01-01,20569737


In [15]:
# World map coloured by population, using the snapshot held in df6.
#
# Locations are matched through the dataset's three-letter `cca3` codes (IND, CHN, USA, ...)
# rather than free-text country names: name matching only works when plotly recognises
# each exact spelling, whereas code matching does not depend on how a country is written.
# NOTE(review): assumes `cca3` holds ISO 3166-1 alpha-3 codes - confirm against the data source.
map_data = (
    df6.groupby('country')['population'].sum().reset_index()
       # Attach the code of each country; the join must stay one row per country.
       .merge(df[['country', 'cca3']], on='country', how='left', validate='one_to_one')
)
fig = px.choropleth(map_data,
                    locations='cca3',
                    locationmode='ISO-3',
                    color= 'population',
                    hover_name='country',
                    color_continuous_scale='Rainbow')
fig.update_geos(projection_type="natural earth", showcoastlines=True)
fig.update_layout(title_text='World Map - population')
fig.show()