In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
warnings.filterwarnings("ignore")

In [2]:
# Location of the CSV inside the Kaggle input mount (file name kept exactly as published).
DATA_PATH = '/kaggle/input/world-population-growth-rate-by-cities-2024/Wprld population growth rate by cities 2024.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               801 non-null    object 
 1   Country            801 non-null    object 
 2   Continent          790 non-null    object 
 3   Population (2024)  801 non-null    int64  
 4   Population (2023)  801 non-null    int64  
 5   Growth Rate        801 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 37.7+ KB


In [4]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Population (2024),801.0,2654327.0,3723253.0,750036.0,990931.0,1379368.0,2570980.0,37115040.0
Population (2023),801.0,2604461.0,3661201.0,722836.0,969804.0,1363510.0,2514077.0,37194100.0
Growth Rate,801.0,0.02005131,0.01218011,-0.0249,0.0122,0.0197,0.0266,0.0582


In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0,City,Country,Continent,Population (2024),Population (2023),Growth Rate
0,Tokyo,Japan,Asia,37115035,37194105,-0.0021
1,Delhi,India,Asia,33807403,32941309,0.0263
2,Shanghai,China,Asia,29867918,29210808,0.0225
3,Dhaka,Bangladesh,Asia,23935652,23209616,0.0313
4,Sao Paulo,Brazil,South America,22806704,22619736,0.0083


In [6]:
# Missing values per column, worst column first.
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

Continent            11
City                  0
Country               0
Population (2024)     0
Population (2023)     0
Growth Rate           0
dtype: int64

In [7]:
# Distinct continent labels (NaN included), to spot spelling variants.
df['Continent'].unique()

array(['Asia', 'South America', 'Africa', 'North America', 'Europe',
       'Oceana', nan, 'Oceania'], dtype=object)

In [8]:
# Rows that have no continent recorded.
df.loc[df['Continent'].isna()]

Unnamed: 0,City,Country,Continent,Population (2024),Population (2023),Growth Rate
133,Santo Domingo,Dominican Republic,,3587402,3523890,0.018
146,Ouagadougou,Burkina Faso,,3358934,3203923,0.0484
184,Manchester,United Kingdom,,2811756,2791005,0.0074
187,Brazzaville,Republic of the Congo,,2724566,2637733,0.0329
313,Glasgow,United Kingdom,,1708147,1698088,0.0059
400,Pointe Noire,Republic of the Congo,,1379368,1336387,0.0322
424,Prague,Czech Republic,,1327947,1323339,0.0035
604,Bangui,Central African Republic,,985965,958335,0.0288
628,Southampton,United Kingdom,,959202,951531,0.0081
718,Newcastle Upon Tyne,United Kingdom,,828712,823431,0.0064


In [9]:
# True if any row is an exact duplicate of an earlier row.
df.duplicated().any()

False

In [10]:
# Fold the misspelt 'Oceana' label into 'Oceania'; all other values (NaN included) are untouched.
df['Continent'] = df['Continent'].replace('Oceana', 'Oceania')

In [11]:
# Continents for the rows whose 'Continent' is missing, listed in the same
# order as those rows appear in the frame.
con = ['North America', 'Africa', 'Europe', 'Africa', 'Europe', 'Africa', 'Europe', 'Africa', 'Europe', 'Europe', 'Europe']
ind = df.index[df['Continent'].isnull()]

# Nothing missing (e.g. the cell is being re-run) -> leave the frame alone.
# Otherwise refuse a positional fill whose length does not line up, instead of
# labelling some rows and then failing, or silently mislabelling them.
if len(ind) > 0:
  if len(ind) != len(con):
    raise ValueError(f'Expected {len(con)} rows with a missing Continent, found {len(ind)}')
  # Single aligned assignment: the k-th missing row receives con[k].
  df.loc[ind, 'Continent'] = con

In [12]:
# Constant helper column so the charts below can total rows with a 'sum of Count'.
df = df.assign(Count=1)

In [13]:
# Share of rows per continent, using the 'Count' column as the slice weight.
pie_title = dict(text='Distribution of Continents', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig = px.pie(df, values='Count', names='Continent')
fig.update_layout(title=pie_title, legend_title='Continent')
fig.show()

In [14]:
# One chart per continent: summed 'Count' per country, bars ordered by total.
for c in df['Continent'].unique():
  in_continent = df[df['Continent'] == c]
  fig = px.histogram(
      in_continent,
      x='Count',
      y='Country',
      color='Country',
      color_discrete_sequence=px.colors.qualitative.Dark24,
  )
  fig.update_yaxes(categoryorder='total ascending')
  fig.update_layout(
      title=dict(text=f'Distribution of Country in {c}', y=0.95, x=0.5, xanchor='center', yanchor='top'),
      xaxis_title='Sum of Count',
  )
  fig.show()

In [15]:
# Histogram with a marginal box plot for each numeric column.
numeric_columns = ['Population (2023)', 'Population (2024)', 'Growth Rate']
for n in numeric_columns:
  fig = px.histogram(df, x=n, y="Count", marginal="box", hover_data=df.columns)
  chart_title = dict(text=f'Distribution of {n}', y=0.95, x=0.5, xanchor='center', yanchor='top')
  fig.update_layout(title=chart_title, yaxis_title='Sum of Count')
  fig.show()

In [16]:
# Mean growth rate per country, averaged over that country's rows.
country = df.groupby('Country', as_index=False)[['Growth Rate']].mean()
# Key cities on (City, Country): a city name on its own is not guaranteed to be
# unique, and grouping on 'City' alone would add together the populations and
# average the growth rates of unrelated cities that share a name.
# The result keeps 'City', 'Population (2024)' and 'Growth Rate' and gains 'Country'.
city = df.groupby(['City', 'Country'], as_index=False).agg({'Population (2024)': 'sum', 'Growth Rate': 'mean'})

In [17]:
# Spread of growth rates within each continent.
box_title = dict(text='Growth Rate by Continent', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig = px.box(
    df,
    x='Continent',
    y='Growth Rate',
    color='Continent',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=box_title)
fig.show()

In [18]:
# Ten countries with the largest mean growth rate.
fastest_countries = country.sort_values('Growth Rate', ascending=False).head(10)
fig = px.bar(
    fastest_countries,
    x='Country',
    y='Growth Rate',
    color='Country',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 10 Countries with Highest Average Growth Rate', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [19]:
# Ten entries of the city aggregate with the largest growth rate.
fastest_cities = city.sort_values('Growth Rate', ascending=False).head(10)
fig = px.bar(
    fastest_cities,
    x='City',
    y='Growth Rate',
    color='City',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 10 Cities with Highest Average Growth Rate', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [20]:
# Ten countries with the smallest mean growth rate (ascending sort, first ten rows).
slowest_countries = country.sort_values('Growth Rate').head(10)
fig = px.bar(
    slowest_countries,
    x='Country',
    y='Growth Rate',
    color='Country',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 10 Countries with Lowest Growth Rate', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [21]:
# Ten entries of the city aggregate with the smallest growth rate.
slowest_cities = city.sort_values('Growth Rate').head(10)
fig = px.bar(
    slowest_cities,
    x='City',
    y='Growth Rate',
    color='City',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 10 Cities with Lowest Growth Rate', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [22]:
# Five entries of the city aggregate with the largest 2024 population.
largest_cities = city.sort_values('Population (2024)', ascending=False).head(5)
fig = px.bar(
    largest_cities,
    x='City',
    y='Population (2024)',
    color='City',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 5 Cities with Highest Population (2024)', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [23]:
# Five entries of the city aggregate with the smallest 2024 population.
smallest_cities = city.sort_values('Population (2024)').head(5)
fig = px.bar(
    smallest_cities,
    x='City',
    y='Population (2024)',
    color='City',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(title=dict(text='Top 5 Cities with Lowest Population (2024)', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [24]:
# Numeric columns only, without the constant 'Count' helper column.
df_num = df.select_dtypes(include=np.number).drop(columns=['Count'])
corr_matrix = df_num.corr()
fig = px.imshow(corr_matrix)
fig.update_layout(title=dict(text='Correlation Between Numerical Attributes', y=0.95, x=0.5, xanchor='center', yanchor='top'))
fig.show()

In [25]:
# Predictors and target for the regression below.
# NOTE(review): in the rows shown by df.head(), 'Growth Rate' matches
# Population (2024) / Population (2023) - 1, so it looks derived from the
# target; confirm whether it should be used as a feature.
feature_columns = ['Population (2023)', 'Growth Rate']
X = df[feature_columns]
y = df.loc[:, 'Population (2024)']

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Fit ordinary least squares on the training split and score it on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics are rounded to two decimals before printing; R2 is shown as a percentage.
r2_percent = round(r2_score(y_test, y_pred) * 100, 2)
mae = round(mean_absolute_error(y_test, y_pred), 2)
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)

print(f'R2 Score: {r2_percent}%')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}\n')

R2 Score: 99.99%
Mean Absolute Error: 18960.55
Root Mean Squared Error: 30038.44

