In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as pltimg

In [None]:
# Read the CIA country-facts CSV into a DataFrame and preview its first five rows.
cia = pd.read_csv(filepath_or_buffer='../../../DATA/CIA_Country_Facts.csv')
cia.head(n=5)

In [None]:
# Summary statistics, flipped so that each described column becomes a row.
cia.describe().T

In [None]:
# Histogram of Population, keeping only rows strictly below 200,000,000.
sns.histplot(
    x='Population',
    data=cia.loc[cia['Population'].lt(200_000_000)],
)

In [None]:
# Bar chart of 'GDP ($ per capita)' for each Region on a large, high-dpi canvas.
plt.figure(dpi=250, figsize=(12, 8))
sns.barplot(x='Region', y='GDP ($ per capita)', data=cia)
# Rotate the x tick labels to vertical; the trailing semicolon hides the
# return value of plt.xticks in the cell output.
plt.xticks(rotation=90);

In [None]:
# Scatter of 'GDP ($ per capita)' against 'Phones (per 1000)', coloured by Region.
plt.figure(dpi=250, figsize=(10, 8))
sns.scatterplot(
    x='GDP ($ per capita)',
    y='Phones (per 1000)',
    hue='Region',
    data=cia,
)
# Place the legend just outside the right edge of the axes.
plt.legend(loc=(1.05, 0.5))

## Exploring outliers

In [None]:
# Rows whose 'GDP ($ per capita)' is strictly greater than 50,000.
cia.loc[cia['GDP ($ per capita)'].gt(50000)]

In [None]:
# Scatter of 'GDP ($ per capita)' against 'Literacy (%)', coloured by Region.
plt.figure(dpi=250, figsize=(12, 8))
sns.scatterplot(
    x='GDP ($ per capita)',
    y='Literacy (%)',
    hue='Region',
    data=cia,
)
# Place the legend just outside the right edge of the axes.
plt.legend(loc=(1.05, 0.5))

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# releases silently dropped them). Country and Region appear to be text
# columns, so restrict to numeric dtypes explicitly; select_dtypes works on
# both old and new pandas.
plt.figure(dpi=250)
sns.heatmap(cia.select_dtypes(include='number').corr())

In [None]:
# Hierarchically clustered heatmap of the numeric-column correlations.
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# releases silently dropped them). Country and Region appear to be text
# columns, so restrict to numeric dtypes explicitly; select_dtypes works on
# both old and new pandas.
sns.clustermap(cia.select_dtypes(include='number').corr())

## Data preparation and discovery

In [None]:
# pct = 100 * (cia.isnull().sum() / len(cia))
# pct

In [None]:
# Country names for the rows whose Agriculture value is missing.
cia.loc[cia['Agriculture'].isnull(), 'Country']

In [None]:
# For every row lacking an Agriculture value, replace each missing entry in
# that row (all columns, not only Agriculture) with 0.
agriculture_missing = cia['Agriculture'].isnull()
cia[agriculture_missing] = cia[agriculture_missing].fillna(0)

In [None]:
# Country names for rows that are still missing Agriculture; this should be
# empty once those rows have been zero-filled.
cia.loc[cia['Agriculture'].isnull(), 'Country']

In [None]:
# cia.isnull().sum()
# pct = 100 * (cia.isnull().sum() / len(cia))
# pct

In [None]:
# Fill each missing Climate value with the mean Climate of that row's Region.
climate_region_mean = cia.groupby('Region')['Climate'].transform('mean')
cia['Climate'] = cia['Climate'].fillna(climate_region_mean)

In [None]:
# Fill each missing 'Literacy (%)' value with the mean literacy of that row's Region.
literacy_region_mean = cia.groupby('Region')['Literacy (%)'].transform('mean')
cia['Literacy (%)'] = cia['Literacy (%)'].fillna(literacy_region_mean)

In [None]:
# Percentage of missing values in each column: null count / row count * 100.
pct = cia.isnull().sum().div(len(cia)).mul(100)
pct

In [None]:
# Discard every row that contains at least one missing value.
cia = cia.dropna(axis=0, how='any')

In [None]:
# Print a concise summary of cia (column dtypes and non-null counts).
cia.info()

## Data feature preparation

In [None]:
# Build the feature matrix: remove the Country column, then one-hot encode
# whatever non-numeric columns remain. The bare X on the last line displays it.
X = pd.get_dummies(cia.drop(columns='Country'))
X

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a StandardScaler on X and use it to produce the scaled feature array.
scaler = StandardScaler()
scaled_X = scaler.fit(X).transform(X)
scaled_X

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit KMeans for k = 2..29 and record each fitted model's inertia_ so the
# values can be compared across k (elbow method).
# random_state is pinned so centroid initialisation, and therefore the
# recorded curve, is reproducible across kernel restarts; without it every
# run yields slightly different inertias.
ssd = []
for k in range(2, 30):
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(scaled_X)
    ssd.append(model.inertia_)

In [None]:
# Display the inertia values collected for each k, in the order they were fitted.
ssd

In [None]:
# Elbow plot: recorded inertia against k (2..29), drawn as circle markers
# joined by a dashed line.
plt.plot(range(2, 30), ssd, marker='o', linestyle='--')

In [None]:
# Bar chart of the change in inertia from one k to the next.
# Index the series by k (the sweep starts at k=2) so each bar is labelled
# with the cluster count it belongs to; the default 0-based index labelled
# the bars 0..27, which misreads as the number of clusters.
pd.Series(ssd, index=range(2, 2 + len(ssd))).diff().plot(kind='bar')