# Countries of the World


In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pickle

In [None]:
# Load the dataset; ',' is parsed as the decimal mark in numeric fields.
csv_path = "../input/countries-of-the-world/countries of the world.csv"
world = pd.read_csv(csv_path, decimal=',')

In [None]:
# Show summary statistics of the loaded frame as a first sanity check.
world.describe()

# Data Pre-processing

In [None]:
# Horizontal bar chart of how many values are missing in each column.
# reset_index() turns the per-column counts into a two-column frame whose
# columns are named 'index' (the variable) and 0 (the count).
missing_counts = world.isnull().sum().reset_index()
plt.figure(figsize=(12, 10))
sns.barplot(x=0, y='index', data=missing_counts)
plt.title('Missing Values Plot')
plt.xlabel('Missing value Count')
plt.ylabel('Variables')
plt.show()

## Missing values are filled with the median of the region a country belongs to (the mode for `Climate`), since countries that are geographically close are often similar in many ways

In [None]:
# Impute missing values region by region.
# 'Climate' is filled with the region's mode (the largest value when several
# modes tie); every other column is filled with the region's median.
columns_with_gaps = world.columns[world.isnull().any()]
for col in columns_with_gaps:
    grouped = world.groupby('Region')[col]
    if col == 'Climate':
        guess_values = grouped.apply(lambda x: x.mode().max())
    else:
        guess_values = grouped.median()
    # Assign the filled column back to the frame. The chained form
    # `world[col].loc[mask] = value` writes through an intermediate Series,
    # which triggers SettingWithCopyWarning and does nothing at all when
    # pandas Copy-on-Write is enabled.
    # Region -> guess lookup, aligned row by row with the frame.
    world[col] = world[col].fillna(world['Region'].map(guess_values))

In [None]:
# Re-count missing values per column to check the result of the imputation.
world.isnull().sum()

# Data Exploration
## Top Countries with highest GDP per capita

In [None]:
# Bar chart of the 20 countries with the largest GDP per capita, highest first.
richest = world.sort_values('GDP ($ per capita)', ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=richest, x='Country', y='GDP ($ per capita)', palette='Set3')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()

## Countries with the lowest GDP per capita

In [None]:
# Bar chart of the 20 countries with the smallest GDP per capita, lowest first.
poorest = world.sort_values('GDP ($ per capita)', ascending=True).head(20)
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=poorest, x='Country', y='GDP ($ per capita)', palette='Set3')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()

# Top Countries with highest Total GDP

In [None]:
# Derive each country's total GDP as population times GDP per capita and
# store it as a new column, then chart the ten largest values.
world['Total_GDP'] = world['Population'] * world['GDP ($ per capita)']
largest_economies = world.sort_values('Total_GDP', ascending=False).head(10)

plt.figure(figsize=(14, 10))
sns.barplot(data=largest_economies, x='Country', y='Total_GDP', palette='Set3')

# Share of Total GDP of Top 25 countries

In [None]:
# Pie chart of the 25 largest economies by Total_GDP. Only these 25 rows are
# passed to the pie, so the percentages are shares of their combined total.
gdp = world.sort_values('Total_GDP', ascending=False).head(25)
plt.subplots(figsize=(14, 10))
plt.pie(
    gdp['Total_GDP'],
    labels=gdp['Country'],
    autopct='%1.1f%%',
    shadow=True,
);

# GDP per capita by Region

In [None]:
# Box plot of the GDP-per-capita distribution within each region.
plt.figure(figsize=(14, 10))
sns.boxplot(
    data=world,
    x="Region",
    y="GDP ($ per capita)",
    palette="Set3",
    width=0.7,
    fliersize=5,
)
plt.xticks(rotation=90)  # region names are long; rotate so they do not overlap
plt.title("GDP BY REGİON", color="red")

# Correlation between Variables

In [None]:
# Annotated heatmap of pairwise correlations between the columns.
# The first two columns are skipped (presumably the Country and Region text
# columns, which have no numeric correlation - confirm against the CSV).
corr_matrix = world.iloc[:, 2:].corr()
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()

# Top Factors affecting GDP per capita
## We pick the five columns that correlate most strongly with GDP per capita and make scatter plots.

In [None]:
# GDP per capita against infant mortality, one colour per region.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x='Infant mortality (per 1000 births)',
    y='GDP ($ per capita)',
    hue='Region',
    data=world,
)

In [None]:
# GDP per capita against literacy rate, one colour per region.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x='Literacy (%)',
    y='GDP ($ per capita)',
    hue='Region',
    data=world,
)

In [None]:
# GDP per capita against phones per 1000 people, one colour per region.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x='Phones (per 1000)',
    y='GDP ($ per capita)',
    hue='Region',
    data=world,
)

In [None]:
# GDP per capita against the Agriculture column, one colour per region.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x='Agriculture',
    y='GDP ($ per capita)',
    hue='Region',
    data=world,
)

In [None]:
# GDP per capita against birthrate, one colour per region.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x='Birthrate',
    y='GDP ($ per capita)',
    hue='Region',
    data=world,
)

# Birthrate and Deathrate of the Top 100 Countries by GDP per capita

In [None]:
# Birthrate and deathrate as functions of GDP per capita for the 100 countries
# with the highest GDP per capita. Both lines share the same axes.
top_100 = world.sort_values('GDP ($ per capita)', ascending=False).head(100)
plt.figure(figsize=(16, 12))
sns.lineplot(data=top_100, x='GDP ($ per capita)', y='Birthrate', label='Birthrate', ci=None)
sns.lineplot(data=top_100, x='GDP ($ per capita)', y='Deathrate', label='Deathrate', ci=None)

# Birthrate and Deathrate of the Bottom 100 Countries by GDP per capita

In [None]:
# Birthrate and deathrate as functions of GDP per capita for the 100 countries
# with the lowest GDP per capita. Both lines share the same axes.
bottom_100 = world.sort_values('GDP ($ per capita)', ascending=True).head(100)
plt.figure(figsize=(16, 12))
sns.lineplot(data=bottom_100, x='GDP ($ per capita)', y='Birthrate', label='Birthrate', ci=None)
sns.lineplot(data=bottom_100, x='GDP ($ per capita)', y='Deathrate', label='Deathrate', ci=None)

#  Comparison of the economy structure for the ten countries with highest total GDP

In [None]:
# Stacked bars of the Agriculture / Industry / Service columns for the ten
# countries with the highest total GDP. `gdp` is the Total_GDP-sorted top-25
# frame built in the pie-chart cell, so head(10) yields the top ten.
# No plt.figure() call here: pandas opens its own figure for `figsize`, so a
# separate plt.figure() would only leave an empty figure in the output.
sector_shares = gdp[['Country', 'Agriculture', 'Industry', 'Service']].set_index('Country')
sector_shares.head(10).plot.bar(stacked=True, figsize=(10, 6))

# Modeling
# Training and Testing

In [None]:
# Encode the Region and Climate columns as integer labels in new columns.
# The same encoder object is refitted for each column, so after this cell it
# only remembers the classes of the last one (Climate).
LE = LabelEncoder()
for source_col, label_col in (('Region', 'Region_label'), ('Climate', 'Climate_label')):
    world[label_col] = LE.fit_transform(world[source_col])
world.head()

In [None]:
# Hold out 30% of the countries for testing. The split is seeded so that the
# scores reported below can be reproduced from run to run (same seed as the
# random forest further down).
train, test = train_test_split(world, test_size=0.3, shuffle=True, random_state=42)

# Model inputs. Region and Climate enter through their integer-encoded
# *_label columns.
training_features = ['Population', 'Area (sq. mi.)',
       'Pop. Density (per sq. mi.)', 'Coastline (coast/area ratio)',
       'Net migration', 'Infant mortality (per 1000 births)',
       'Literacy (%)', 'Phones (per 1000)',
       'Arable (%)', 'Crops (%)', 'Other (%)', 'Birthrate',
       'Deathrate', 'Region_label',
       'Climate_label']
# Column the models are trained to predict.
target = 'GDP ($ per capita)'

train_X, train_Y = train[training_features], train[target]
test_X, test_Y = test[training_features], test[target]

In [None]:
# Print the shape of each split: train features/target, then test features/target.
for split in (train_X, train_Y, test_X, test_Y):
    print(split.shape)

# Linear Regression Model

In [None]:
# Fit an ordinary linear regression on the training split and keep its
# predictions for both splits. fit() returns the estimator itself.
model1 = LinearRegression().fit(train_X, train_Y)
train_pred_Y, test_pred_Y = (model1.predict(features) for features in (train_X, test_X))

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the linear model on the training and held-out data.
# r2_score takes (y_true, y_pred) in that order and is not symmetric, so the
# true targets must come first. Predictions are recomputed from model1 here
# rather than read from test_pred_Y, which a later cell overwrites.
print('Training Score : ',model1.score(train_X,train_Y))
print('Test score : ',r2_score(test_Y,model1.predict(test_X)))

## Training Score : 73.71%
## Test score :  76.90%

# Visualization of Results

In [None]:
# Predicted versus actual GDP per capita for every country (train and test
# rows together), using the linear model, with a fitted trend line.
whole = model1.predict(world[training_features])
plt.figure(figsize=(18, 9))
sns.regplot(x=world[target], y=whole)

# Random Forest Regressor

In [None]:
# Fit a random forest on the training split and keep its predictions for both
# splits (this overwrites the linear model's train_pred_Y / test_pred_Y).
forest_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'min_weight_fraction_leaf': 0.05,
    'max_features': 0.8,
    'random_state': 42,
}
model = RandomForestRegressor(**forest_params)
model.fit(train_X, train_Y)
test_pred_Y = model.predict(test_X)
train_pred_Y = model.predict(train_X)

In [None]:
# R^2 of the random forest on the training and held-out data.
# r2_score takes (y_true, y_pred) in that order and is not symmetric, so the
# true targets must come first. Predictions are recomputed from the model
# here so the result does not depend on the shared test_pred_Y variable.
print('Training Score :',model.score(train_X,train_Y))
print('Test score : ',r2_score(test_Y,model.predict(test_X)))

## Training Score : 90.36%
## Test score :  76.15%

# Visualization of Results

In [None]:
# Predicted versus actual GDP per capita for every country (train and test
# rows together), using the random forest, with a fitted trend line.
whole = model.predict(world[training_features])
plt.figure(figsize=(18, 9))
sns.regplot(x=world[target], y=whole)



# We will try to predict GDP per capita of Two Imaginary Countries
# 1.DreamLand : Better than Average in all features
# 2.MeanLand : Mean of all Countries

In [None]:
# Empty frame whose columns are the model's input features, in training order.
df = pd.DataFrame(columns = training_features)

In [None]:
# Feature values for the two imaginary countries, one dict per row.
dreamland = {'Population':500000000, 'Area (sq. mi.)':3287263,
       'Pop. Density (per sq. mi.)':152.0, 'Coastline (coast/area ratio)':2.0,
       'Net migration':0.0, 'Infant mortality (per 1000 births)':5,
       'Literacy (%)':99.0, 'Phones (per 1000)':1000,
       'Arable (%)':60, 'Crops (%)':10, 'Other (%)':30, 'Birthrate':10,
       'Deathrate':5, 'Region_label':0,
       'Climate_label':0}
meanland = {'Population':28740280, 'Area (sq. mi.)':598227,
       'Pop. Density (per sq. mi.)':379, 'Coastline (coast/area ratio)':21.16,
       'Net migration':0.03, 'Infant mortality (per 1000 births)':35.28,
       'Literacy (%)':83.58, 'Phones (per 1000)':234.62,
       'Arable (%)':13.7, 'Crops (%)':4.3, 'Other (%)':81.67, 'Birthrate':22.06,
       'Deathrate':9.19, 'Region_label':0,
       'Climate_label':0}
# DataFrame.append was removed in pandas 2.0, so the frame is built in a
# single constructor call. Passing columns=training_features keeps the
# columns in the order the models were fitted with, and the values come out
# with numeric dtypes rather than object.
df = pd.DataFrame([dreamland, meanland], columns=training_features)
df

In [None]:
# Predict GDP per capita for the two imaginary countries with the random
# forest (`model`); the output order follows the row order of df.
model.predict(df)

# Predicted GDP per capita
# DreamLand :  \$20946.20
# MeanLand : \$5438.23

# Thank You