# Cas d'usage avec un jeu de données

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pour pouvoir exécuter la cellule suivante, vous devrez placer dans le répertoire où est exécuté votre notebook le fichier `Indicators.csv` téléchargeable sur https://www.kaggle.com/worldbank/world-development-indicators/data

In [None]:
# Load the indicators table; the CSV is expected next to the notebook.
csv_path = 'Indicators.csv'
data = pd.read_csv(csv_path)

# Cell output: (number of rows, number of columns).
data.shape

In [None]:
# Preview the first rows to see which columns the table provides.
data.head()

In [None]:
# Distinct country names as a plain list; the cell displays how many there are.
country_names = data['CountryName']
countries = country_names.unique().tolist()
len(countries)

In [None]:
# Distinct country codes as a plain list; the cell displays how many there are.
country_codes = data['CountryCode']
codes = country_codes.unique().tolist()
len(codes)

In [None]:
# Distinct indicator names as a plain list; the cell displays how many there are.
indicator_names = data['IndicatorName']
indicators = indicator_names.unique().tolist()
len(indicators)

In [None]:
# Distinct years present in the table as a plain list; the cell displays how many there are.
year_column = data['Year']
years = year_column.unique().tolist()
len(years)

In [None]:
# Earliest and latest year found in `years`, displayed as a pair.
first_year, last_year = min(years), max(years)
first_year, last_year

Filtrons les données pour ne garder que les émissions de CO2 par habitant des États-Unis.

In [None]:
# Regex matched against IndicatorName. It is written as a raw string so that
# "\(" reaches the regex engine as an escaped literal parenthesis; in a normal
# string literal "\(" is an invalid escape sequence that Python warns about.
# The resulting string value is unchanged.
indicator = r'CO2 emissions \(metric'
country = 'USA'

# Row masks over the full table:
#  - mask1: indicator name matches the regex above;
#  - mask2: country code is exactly `country` (an equality test states the
#    intent directly instead of a regex substring search).
mask1 = data['IndicatorName'].str.contains(indicator)
mask2 = data['CountryCode'] == country

# Keep the rows satisfying both conditions and preview them.
selection = data[mask1 & mask2]
selection.head()

In [None]:
# Count the distinct indicator names present in `selection`.
selected_indicator_names = selection['IndicatorName'].unique()
len(selected_indicator_names)

In [None]:
# Pull the two plotted columns out of `selection` as arrays.
# NOTE: this rebinds `years`, which an earlier cell set to the list of all
# years in the table; from here on it holds the selected rows' years.
selected_years = selection['Year']
selected_values = selection['Value']
years, values = selected_years.values, selected_values.values

# Bar chart of value per year in a fresh figure.
plt.figure()
plt.bar(x=years, height=values)

In [None]:
# Line plot of the selected values over the years, with labelled axes.
plt.figure()
plt.plot(years, values)
plt.xlabel('Year')
# The y label reuses the indicator's full name taken from the first selected row.
plt.ylabel(selection['IndicatorName'].iloc[0])
# Title fix: "CO2" is spelled with the letter O (it previously read "C02",
# with the digit zero).
plt.title('Emission de CO2 aux Etats-Unis')
# x spans the plotted years; y runs from 0 up to the largest plotted value.
plt.axis([min(years), max(years), 0, max(values)])

In [None]:
# Number of entries in `values` (the length of the plotted array).
len(values)

In [None]:
plt.figure()
plt.hist(values,10,density=False,facecolor='green')
plt.xlabel(selection['IndicatorName'].iloc[0])
plt.ylabel("Nombre d'années")
plt.title('Histogramme')
plt.grid(True)

Comparons les émissions de CO2 des États-Unis avec celles des autres pays en 2011.

In [None]:
# Reference year for the comparison across countries.
year = 2011

# Rows whose indicator name matches the `indicator` pattern, restricted to `year`.
mask1 = data['IndicatorName'].str.contains(indicator)
mask2 = data['Year'] == year
selection = data[mask1 & mask2]

selection.head()

In [None]:
# Number of rows in `selection`.
selection.shape[0]

In [None]:
# New figure; `fig` and `ax` are kept so a later cell can annotate this plot.
fig, ax = plt.subplots()

# Histogram of the selected values in 10 bins (raw counts), drawn on `ax`.
ax.hist(selection['Value'].values, bins=10, density=False, facecolor='green')
ax.set_xlabel(selection['IndicatorName'].iloc[0])
ax.set_ylabel('Nombre de pays')
ax.set_title('Histogramme des émissions de CO2 par habitant')
ax.grid(True)

In [None]:
# Value of the first row of `selection` whose country code contains 'USA'.
is_usa = selection['CountryCode'].str.contains('USA')
val_for_usa = selection.loc[is_usa, 'Value'].iloc[0]
print(val_for_usa)

In [None]:
# Mark the USA value on the axes held in `ax`: the label sits at y=30 and an
# arrow points down to y=5, both at x = val_for_usa (data coordinates).
arrow_style = {'arrowstyle': '->', 'connectionstyle': 'arc3'}
ax.annotate(
    "USA",
    xy=(val_for_usa, 5),
    xytext=(val_for_usa, 30),
    xycoords='data',
    textcoords='data',
    arrowprops=arrow_style,
)

In [None]:
# Both patterns are regexes matched against IndicatorName. They are raw
# strings so that "\(" reaches the regex engine as an escaped literal
# parenthesis; in a normal string literal "\(" is an invalid escape sequence
# that Python warns about. The string values themselves are unchanged.
indicator = r'CO2 emissions \(metric'
mask1 = data['IndicatorName'].str.contains(indicator)

# `indicator` is rebound on purpose: after this cell it holds the GDP pattern.
indicator = r'GDP per capita \(constant 2005'
mask2 = data['IndicatorName'].str.contains(indicator)

# Country code is exactly `country` (an equality test states the intent
# directly instead of a regex substring search).
mask3 = data['CountryCode'] == country

# CO2 rows for the chosen country; mask2 and mask3 are reused by the next cell.
selection1 = data[mask1 & mask3]
selection1.head()

In [None]:
# Rows kept by both mask2 and mask3, then a preview of the first ones.
gdp_rows = mask2 & mask3
selection2 = data[gdp_rows]
selection2.head()

In [None]:
# Line plot of selection2's values over its years, in a fresh figure.
plt.figure()
gdp_years = selection2['Year'].values
gdp_values = selection2['Value'].values
plt.plot(gdp_years, gdp_values)
plt.ylabel(selection2['IndicatorName'].iloc[0])
plt.xlabel('Year')
plt.title('GDP per capita in USA')

In [None]:
# First and last year covered by selection1.
selection1_years = selection1['Year']
selection1_years.min(), selection1_years.max()

In [None]:
# First and last year covered by selection2.
selection2_years = selection2['Year']
selection2_years.min(), selection2_years.max()

In [None]:
# Later of the two series' first years: the start of their common period.
first_years = (selection1['Year'].min(), selection2['Year'].min())
minYear = max(first_years)
minYear

In [None]:
# Earlier of the two series' last years: the end of their common period.
last_years = (selection1['Year'].max(), selection2['Year'].max())
maxYear = min(last_years)
maxYear

In [None]:
# Restrict both series to [minYear, maxYear]; `between` includes both bounds.
in_range1 = selection1['Year'].between(minYear, maxYear)
selection1_trunc = selection1[in_range1]

in_range2 = selection2['Year'].between(minYear, maxYear)
selection2_trunc = selection2[in_range2]

In [None]:
# Row counts of the two truncated selections, displayed side by side.
len(selection1_trunc), len(selection2_trunc)

In [None]:
# plt.subplots() creates the figure itself. The plt.figure() call that used to
# precede it only opened an additional, empty figure, so it has been removed.
fig, ax = plt.subplots()
ax.yaxis.grid(True)
ax.set_title('Emission de CO2 par habitant par rapport au PIB')
# Both labels are read from the truncated selections that are actually plotted.
ax.set_xlabel(selection2_trunc['IndicatorName'].iloc[0], fontsize=10)
ax.set_ylabel(selection1_trunc['IndicatorName'].iloc[0], fontsize=10)

# Pair the two series on Year instead of by row position, so each point
# combines values from the same year even if a year is missing on one side.
# validate='one_to_one' raises if either side has several rows for one year.
paired = selection2_trunc[['Year', 'Value']].merge(
    selection1_trunc[['Year', 'Value']],
    on='Year',
    suffixes=('_gdp', '_co2'),
    validate='one_to_one',
)
X = paired['Value_gdp']
Y = paired['Value_co2']

ax.scatter(X, Y)

In [None]:
# 2x2 correlation matrix between the two truncated value series
# (paired by position, selection2_trunc first).
gdp_values = selection2_trunc['Value']
co2_values = selection1_trunc['Value']
np.corrcoef(gdp_values, co2_values)