# Correction for the TP: Introduction to Python

In [1]:
%matplotlib notebook
import math                         # import a package
import random
import string
import sys
import time
import urllib
import urllib.request               # submodule must be imported explicitly to use urlretrieve
import zipfile
from inspect import getsourcelines  # get function source code
from os import mkdir, rmdir, path   # import a function

import matplotlib.pyplot as plt     # for plots
import numpy as np                  # import a package with specified name
import pandas as pd
import seaborn as sns               # for plots
from sklearn import linear_model    # import whole module

# Keep the first 8 colours of seaborn's "colorblind" palette for manual
# colouring, and make that palette the default for every figure.
color_blind_list = sns.color_palette("colorblind", 8)
sns.set_palette("colorblind")

# 1) Introduction: Python, Numpy et Scipy

## Question 1 : 
Écrire une fonction ```nextpower``` qui calcule la première puissance de 2 supérieure ou égale à un nombre $n$ (on veillera à ce que le type de sortie soit un <font color='red'>**int**</font>, tester cela avec <font color='red'>**type**</font> par exemple).

In [2]:
# Two possible solutions, non-exhaustive (but check at least on different values that they give the same results)
def nextpower(n):
    """Return the smallest power of 2 (with exponent >= 0) that is >= n.

    Parameters
    ----------
    n : int or float
        The number to bound from above.

    Returns
    -------
    int
        1 when n <= 1, otherwise the smallest 2 ** k with 2 ** k >= n.

    Notes
    -----
    The computation is done on integers: a floating-point logarithm can
    land just above the true exponent of an exact power of two, and
    rounding it up then returns a result that is twice too large.
    """
    if n <= 1:
        return 1
    # For an integer m >= 2, (m - 1).bit_length() is the smallest k such
    # that 2 ** k >= m; math.ceil maps a float n to the integer to bound.
    return 1 << (math.ceil(n) - 1).bit_length()


def nextpowerbis(n):
    """Return the smallest power of 2 that is >= n, by repeated doubling.

    Starts from 1 and doubles until the candidate reaches n, so the
    result is always an int and is 1 for any n <= 1.
    """
    candidate = 1
    while candidate < n:
        candidate += candidate
    return candidate

In [3]:
# Both implementations must agree on the value and on the output type.
print(nextpower(15))
print(nextpowerbis(15))
print(type(nextpower(15)))
print(type(nextpowerbis(15)))

# Time each implementation with time.perf_counter(), the clock meant for
# measuring short durations; only the call itself is timed, not the printing.
for power_function in (nextpower, nextpowerbis):
    start = time.perf_counter()
    result = power_function(10200000)
    elapsed = time.perf_counter() - start
    print(result)
    print(elapsed)


16
16
<class 'int'>
<class 'int'>
16777216
6.508827209472656e-05
16777216
6.699562072753906e-05


## Question 2 :
En partant du mot contenant toutes les lettres de l'alphabet, générer par une opération de *slicing* la chaîne de caractère ```cfilorux``` et, de deux façons différentes, la chaîne de caractère ```vxz```.

In [4]:
alphabet = string.ascii_lowercase
# One slice for 'cfilorux' (every third letter from index 2), then two
# equivalent slices for 'vxz' (positive start index, then negative start index).
for window in (slice(2, 25, 3), slice(21, 26, 2), slice(-5, None, 2)):
    print(alphabet[window])


cfilorux
vxz
vxz


## Question 3 :
Afficher le nombre $\pi$ avec 9 décimales après la virgule.

In [5]:
# %s inserts the name of the constant, %.9f prints it rounded to 9 decimals.
s = "Le nombre %s est égal à %.9f"
pi_sentence = s % ("pi", math.pi)
print(pi_sentence)

Le nombre pi est égal à 3.141592654


## Question 4 :
Compter le nombre d’occurrences de chaque caractère dans la chaîne de caractères ```s="HelLo WorLd!!!" ```. On renverra un dictionnaire qui à chaque lettre associe son nombre d’occurrences.

In [6]:
s_new = "HelLo WorLd!!!"
params = {}

# Count each character; dict.get supplies 0 the first time a character is seen.
for letter in s_new:
    params[letter] = params.get(letter, 0) + 1

print(params)

{'e': 1, ' ': 1, '!': 3, 'r': 1, 'W': 1, 'l': 1, 'd': 1, 'L': 2, 'o': 2, 'H': 1}


## Question 5 :
Écrire une fonction de codage par substitution de lettres (le code de César en est un cas particulier): chaque lettre d'un mot est remplacée par une (et une seule) autre. On se servira de la fonction ```shuffle``` sur la chaîne de caractères contenant tout l'alphabet.

In [7]:
# The substitution alphabet: the 26 lowercase letters followed by the 26 uppercase ones.
alphabet = string.ascii_lowercase + string.ascii_uppercase
print(alphabet)
type(alphabet)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


str

In [8]:
# Draw a random permutation of the positions in the alphabet.
indexes = list(range(len(alphabet)))
random.shuffle(indexes)

# Forward table: each letter -> the letter found at its permuted position.
rosette_forward = {
    letter: alphabet[indexes[position]]
    for position, letter in enumerate(alphabet)
}
# Backward table: the inverse mapping, used for decoding.
rosette_backward = {coded: plain for plain, coded in rosette_forward.items()}

# Spaces are left unchanged in both directions.
rosette_forward[" "] = " "
rosette_backward[" "] = " "

In [9]:
def coding(message, rosette, alphabet):
    """Apply a substitution table to a message.

    Parameters
    ----------
    message : str
        Text to encode (or decode).
    rosette : dict
        Substitution table mapping a character to its replacement; it must
        contain every character of `message` that belongs to `alphabet`.
    alphabet : str
        Characters that are substituted; any other character (punctuation,
        accented letters, ...) is copied unchanged.

    Returns
    -------
    str
        The message with every character of `alphabet` replaced.

    Raises
    ------
    KeyError
        If a character of `alphabet` found in `message` is missing from `rosette`.
    """
    # A single join avoids rebuilding the string at every character.
    return "".join(
        rosette[letter] if letter in alphabet else letter
        for letter in message
    )


# Sample text: accented letters and punctuation are outside the alphabet and
# must therefore go through the coding unchanged.
message = (
    "Tu les crois malades? Ça gémit ça rote... ça titube..."
    "ça pustule... Tu veux vider ta salle d'attente? Instantanément? "
    "même de ceux qui s'en étranglent à se ramoner les glaviots ?..."
    "Propose un coup de cinéma!... un apéro gratuit en face!... "
    "tu vas voir combien qu'il t'en reste...S'ils viennent te relancer "
    "T'en vois pas un la veille des fêtes..."
)

print("Message original:\n" + message + "\n")

# Encode with the forward table, then undo it with the backward table.
message_encoded = coding(message, rosette_forward, alphabet)
print("Message codé:" + "\n" + message_encoded + "\n")

message_decoded = coding(message_encoded, rosette_backward, alphabet)
print("Message décodé:" + "\n" + message_decoded + "\n")

# The round trip must give back exactly the original text.
round_trip_ok = message_decoded == message
print("Est ce que le message décodé est identique à l'original? " + str(round_trip_ok))


Message original:
Tu les crois malades? Ça gémit ça rote... ça titube...ça pustule... Tu veux vider ta salle d'attente? Instantanément? même de ceux qui s'en étranglent à se ramoner les glaviots ?...Propose un coup de cinéma!... un apéro gratuit en face!... tu vas voir combien qu'il t'en reste...S'ils viennent te relancer T'en vois pas un la veille des fêtes...

Message codé:
og qbO SIMlO EfqfFbO? Çf héElZ çf IMZb... çf ZlZgsb...çf xgOZgqb... og NbgQ NlFbI Zf Ofqqb F'fZZbUZb? wUOZfUZfUéEbUZ? EêEb Fb SbgQ Jgl O'bU éZIfUhqbUZ à Ob IfEMUbI qbO hqfNlMZO ?...kIMxMOb gU SMgx Fb SlUéEf!... gU fxéIM hIfZglZ bU zfSb!... Zg NfO NMlI SMEslbU Jg'lq Z'bU IbOZb...e'lqO NlbUUbUZ Zb IbqfUSbI o'bU NMlO xfO gU qf Nblqqb FbO zêZbO...

Message décodé:
Tu les crois malades? Ça gémit ça rote... ça titube...ça pustule... Tu veux vider ta salle d'attente? Instantanément? même de ceux qui s'en étranglent à se ramoner les glaviots ?...Propose un coup de cinéma!... un apéro gratuit en face!... tu vas voir combie

PS: un peu de lecture sur l'utf8: http://sametmax.com/lencoding-en-python-une-bonne-fois-pour-toute/

## Question 6 :
Calculer $ 2 \displaystyle\prod_{k=1}^{\infty}\frac{4 k^2}{4k^2-1}$ efficacement. On pourra utiliser ```time``` (ou ```%timeit```) pour déterminer la rapidité de votre méthode. Proposer une version sans boucle utilisant ```Numpy```.

In [10]:
def wallis1(n):
    """Wallis product truncated after n factors: naive version.

    Multiplies 2 by 4k^2 / (4k^2 - 1) for k = 1..n, recomputing the square
    for the numerator and for the denominator at every step.
    """
    product = 2.
    for k in range(1, n + 1):
        numerator = 4. * k ** 2
        denominator = 4. * k ** 2. - 1.
        product *= numerator / denominator
    return product

def wallis2(n):
    """Wallis product truncated after n factors: better version.

    Same product as the naive loop, 2 * prod_{k=1..n} 4k^2 / (4k^2 - 1),
    but 4k^2 is computed only once per factor.

    Parameters
    ----------
    n : int
        Number of factors; n <= 0 gives the empty product, i.e. 2.0.

    Returns
    -------
    float
        The truncated product.
    """
    result = 2.
    for k in range(1, n + 1):
        # Named `sqrpart` rather than `int`, so the builtin is not shadowed.
        sqrpart = 4 * k ** 2
        result *= sqrpart / (sqrpart - 1.)
    return result


def wallis3(n):
    """Wallis product truncated after n factors: lame (recursive) version.

    Parameters
    ----------
    n : int
        Number of factors. n <= 0 gives the empty product, i.e. 2.0; without
        that guard the recursion would never reach its base case.

    Returns
    -------
    float
        2 * prod_{k=1..n} 4k^2 / (4k^2 - 1).

    Notes
    -----
    One stack frame is used per factor, so n is limited by the interpreter's
    recursion limit.
    """
    if n <= 0:
        return 2.
    if n == 1:
        return 8. / 3.
    sqrpart = 4 * n ** 2
    return (sqrpart / (sqrpart - 1.) * wallis3(n - 1))
print(wallis3(50))


# Cache of already computed truncated products: memo[n] is the product with n factors.
memo = {1: 8. / 3.}


def wallis4(n):
    """Wallis product truncated after n factors: storage (memoized) version.

    Parameters
    ----------
    n : int
        Number of factors; n < 1 gives the empty product, i.e. 2.0.

    Returns
    -------
    float
        2 * prod_{k=1..n} 4k^2 / (4k^2 - 1), read from `memo` when available.

    Notes
    -----
    Missing entries are filled iteratively from the closest cached value
    below n, so a large n cannot exhaust the recursion limit. Every
    intermediate product is stored in the module-level `memo` dict.
    """
    if n < 1:
        return 2.
    if n not in memo:
        # Walk down to the largest cached index below n (0 means "nothing cached").
        k = n - 1
        while k >= 1 and k not in memo:
            k -= 1
        value = memo[k] if k >= 1 else 2.
        # Fill the cache upwards, one factor at a time.
        for j in range(k + 1, n + 1):
            sqrpart = 4 * j ** 2
            value = value * sqrpart / (sqrpart - 1)
            memo[j] = value
    return memo[n]

# perf_counter is the clock meant for timing; only the call is measured.
t = time.perf_counter()
result_50 = wallis4(50)
elapsed = time.perf_counter() - t
print(result_50)
print(elapsed)


def wallis5(n):
    """Wallis product truncated after n factors: no loop version (NumPy).

    Parameters
    ----------
    n : int
        Number of factors; n <= 0 gives the empty product, i.e. 2.0.

    Returns
    -------
    numpy.float64
        2 * prod_{k=1..n} 4k^2 / (4k^2 - 1).
    """
    # Float dtype from the start: squaring an integer array can silently
    # overflow for large n where the default integer type is 32-bit.
    k = np.arange(1, n + 1, dtype=float)
    sqrpart = 4. * k ** 2
    return 2 * np.prod(sqrpart / (sqrpart - 1.))



3.126078900215411
3.1260789002154112
0.0001251697540283203


In [11]:
# Time the five implementations on the same input with one loop instead of
# five copy-pasted stanzas. perf_counter is the clock meant for timing, and
# only the call itself is measured (not the printing).
# Note: wallis4 keeps its cache between calls, so a second run of this cell
# only measures a dictionary lookup for that version.
for version, wallis in enumerate((wallis1, wallis2, wallis3, wallis4, wallis5), start=1):
    print("Wallis version %d" % version)
    start = time.perf_counter()
    value = wallis(500)
    elapsed = time.perf_counter() - start
    print(value)
    print(elapsed)

Wallis version 1
3.1400238186006004
0.0004520416259765625
Wallis version 2
3.1400238186006004
0.00047707557678222656
Wallis version 3
3.1400238186006004
0.0010530948638916016
Wallis version 4
3.140023818600592
0.0008730888366699219
Wallis version 5
3.1400238186006004
0.0003159046173095703


Les commandes "magic" de type %timeit dans Jupyter peuvent aussi s'avérer utiles:

In [12]:
# Benchmark the naive loop version on 500 factors.
%timeit wallis1(500)

310 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
# Benchmark the loop version that computes 4k^2 once per factor.
%timeit wallis2(500)

266 µs ± 4.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# Benchmark the recursive version.
%timeit wallis3(500)

345 µs ± 1.69 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# Benchmark the memoized version. Beware: once memo[500] has been stored,
# every further call is a plain dictionary lookup, so this timing is not
# comparable with the other versions.
%timeit wallis4(500)

597 ns ± 2.19 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [16]:
# Benchmark the loop-free NumPy version.
%timeit wallis5(500)

15.5 µs ± 716 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Question 7 :
Créer une fonction ```quicksort``` qui trie une liste, en remplissant les éléments manquants dans le code suivant. On testera que la fonction est correcte sur l'exemple ```quicksort([-2, 3, 5, 1, 3])```:

In [17]:
def quicksort(ll):
    """Sort a list with the quicksort scheme, using the last element as pivot.

    Parameters
    ----------
    ll : list
        Elements to sort; they must be comparable with `<=`. The list is
        left untouched.

    Returns
    -------
    list
        A new list with the elements of `ll` in increasing order.
    """
    if len(ll) <= 1:
        return list(ll)
    # Read the pivot instead of popping it, so the caller's list is not modified.
    pivot = ll[-1]
    less = []
    greater = []
    for x in ll[:-1]:
        if x <= pivot:
            less.append(x)
        else:
            greater.append(x)
    return quicksort(less) + [pivot] + quicksort(greater)

In [18]:
# Check the sort on the example given in the question.
sorted_example = quicksort([-2, 3, 5, 1, 3])
print(sorted_example)

[-2, 1, 3, 3, 5]


## Question 8 : 
Sans utiliser de boucles ```for / while ```:
créer une matrice $M \in \mathbb{R}^{5\times6}$ aléatoire à coefficients uniformes dans $[-1, 1]$, puis remplacer une colonne sur deux par sa valeur moins le double de la colonne suivante. Remplacer enfin les valeurs négatives par 0 en utilisant un masque binaire.

In [19]:
matrice = np.random.uniform(-1, 1, (5, 6))
print(matrice)

# first solution: column manipulation.
# Strided slices handle all even columns at once: each of the columns
# 0, 2, 4 becomes itself minus twice the column that follows it.
m_sol1 = matrice.copy()
m_sol1[:, ::2] = matrice[:, ::2] - 2 * matrice[:, 1::2]
# zeroing negative values with a boolean mask
m_sol1[m_sol1 <= 0] = 0
print(m_sol1)

[[-0.18812855  0.53701568  0.66927081 -0.45528041  0.7033969  -0.09248132]
 [-0.35334935  0.9221197  -0.4166231  -0.13011516 -0.2047255   0.95765414]
 [ 0.19277357  0.15331259 -0.26999358 -0.096663   -0.26773127  0.94143655]
 [ 0.17514051  0.12392128  0.67413943 -0.14637633  0.07703954  0.5795254 ]
 [ 0.04846732  0.64108473  0.50876828  0.45959557 -0.94603988 -0.10260943]]
[[0.         0.53701568 1.57983164 0.         0.88835954 0.        ]
 [0.         0.9221197  0.         0.         0.         0.95765414]
 [0.         0.15331259 0.         0.         0.         0.94143655]
 [0.         0.12392128 0.96689209 0.         0.         0.5795254 ]
 [0.         0.64108473 0.         0.45959557 0.         0.        ]]
[[0.         0.53701568 1.57983164 0.         0.88835954 0.        ]
 [0.         0.9221197  0.         0.         0.         0.95765414]
 [0.         0.15331259 0.         0.         0.         0.94143655]
 [0.         0.12392128 0.96689209 0.         0.         0.5795254 ]
 [

In [20]:
# second solution: matrix multiplication.
# Right-multiplying by m_int keeps columns 1, 3, 5 unchanged and turns each
# of the columns 0, 2, 4 into itself minus twice the following column.
m_int = np.eye(6, 6)
m_int[[1, 3, 5], [0, 2, 4]] = -2
print(m_int)
m_sol2 = np.dot(matrice, m_int)
# zeroing negative values
negative_or_zero = m_sol2 <= 0
m_sol2[negative_or_zero] = 0
print(m_sol2)
np.allclose(m_sol1, m_sol2)

[[ 1.  0.  0.  0.  0.  0.]
 [-2.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.]
 [ 0.  0. -2.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0. -2.  1.]]
[[0.         0.53701568 1.57983164 0.         0.88835954 0.        ]
 [0.         0.9221197  0.         0.         0.         0.95765414]
 [0.         0.15331259 0.         0.         0.         0.94143655]
 [0.         0.12392128 0.96689209 0.         0.         0.5795254 ]
 [0.         0.64108473 0.         0.45959557 0.         0.        ]]


True

## Question 9 :
Créer une matrice $M \in \mathbb{R}^{5\times 20}$ aléatoire à coefficients uniformes dans $[-1, 1]$. Tester que $G=M^\top M$ est symétrique et que ses valeurs propres sont positives ou nulles (on parle alors de matrice semi-définie positive). Quel est le rang de $G$?
  
**Aide**: on utilisera par exemple ```np.allclose, np.logical_not, np.all``` pour les tests numériques.

In [21]:
matrice = np.random.uniform(-1, 1, (5, 20))
G = np.dot(matrice.T, matrice)
is_symmetric = np.allclose(G, G.T)
print(is_symmetric)

# Bad solution: the general solver `eig` does not exploit symmetry and may
# return (tiny) complex parts, which is unsatisfactory for a matrix that is
# known to be symmetric.
eig_val_bad = np.linalg.eig(G)[0]
print(eig_val_bad)

# solution 1: `eigh` is dedicated to symmetric matrices and returns real
# eigenvalues (together with eigenvectors, discarded here).
eig_val = np.linalg.eigh(G)[0]

# solution 2: no need of eigenvectors
eig_val = np.linalg.eigvalsh(G)

# Display the eigenvalues from the largest to the smallest.
idx = np.argsort(eig_val)[::-1]
print(eig_val[idx])


True
[ 1.22380048e+01+0.00000000e+00j  7.84461005e+00+0.00000000e+00j
  6.48565153e+00+0.00000000e+00j  4.76985006e+00+0.00000000e+00j
  2.29191245e+00+0.00000000e+00j -6.81526526e-16+0.00000000e+00j
 -6.42118503e-16+0.00000000e+00j  4.29724489e-16+9.45535876e-17j
  4.29724489e-16-9.45535876e-17j -3.28763449e-16+4.33664065e-17j
 -3.28763449e-16-4.33664065e-17j  2.17009264e-16+0.00000000e+00j
  6.43869148e-17+1.67206387e-16j  6.43869148e-17-1.67206387e-16j
 -1.09805219e-16+6.04382266e-17j -1.09805219e-16-6.04382266e-17j
  3.92464737e-17+1.00963864e-16j  3.92464737e-17-1.00963864e-16j
 -5.80328893e-17+0.00000000e+00j  5.36459580e-17+0.00000000e+00j]
[ 1.22380048e+01  7.84461005e+00  6.48565153e+00  4.76985006e+00
  2.29191245e+00  1.52720258e-15  1.07114466e-15  9.70505819e-16
  5.61981940e-16  2.64120453e-16  1.59308480e-16  2.36491362e-17
 -8.08517355e-18 -8.35733212e-17 -1.77113463e-16 -2.74879306e-16
 -4.04059405e-16 -5.57872966e-16 -7.65544943e-16 -1.07493658e-15]


In [22]:
# Beware: some eigenvalues are numerically 0, but slightly negative!
# Hence we only keep the ones that are not numerically zero. Comparing to 0
# with np.isclose(..., rtol=...) would not be a relative test (the relative
# term is multiplied by |0|), so the threshold is taken relative to the
# largest eigenvalue in magnitude (relative precision 1e-12).
zero_threshold = 1e-12 * np.abs(eig_val).max()
nnz_eig = np.abs(eig_val) > zero_threshold

# positivity of non-zero eigenvalues:
eig_nnz_val = eig_val[nnz_eig]
print("The non-zero eigenvalues are:" +  str(eig_nnz_val))
print('Are all eigenvalues positive? That is ' + str(np.all(eig_nnz_val > 0)))

The non-zero eigenvalues are:[ 2.29191245  4.76985006  6.48565153  7.84461005 12.23800485]
Are all eigenvalues positive? That is True


In [23]:
# rank computation: count the non-zero eigenvalues, then cross-check the
# count against NumPy's own rank estimate.
rk = str(nnz_eig.sum())
numpy_rank = np.linalg.matrix_rank(G)
print('The rank is : ' + rk)
print('The rank is also: ' + str(numpy_rank))

The rank is : 5
The rank is also: 5


# 2)  Introduction: Pandas, Matplotlib, etc.

## Question 1 :
Chargement de la base de données. Détecter et dénombrer le nombre de lignes ayant des valeurs manquantes.

In [24]:
# download part if needed.
url = u'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/'
filename = 'household_power_consumption'
zipfilename = filename + '.zip'
Location = url + zipfilename

# Download the archive only when it is not already on disk
# (Python 3: urlretrieve lives in the urllib.request submodule).
if not path.isfile(zipfilename):
    urllib.request.urlretrieve(Location, zipfilename)

# unzip part: the context manager closes the archive once extracted, and
# the name `archive` avoids shadowing the builtin `zip`.
with zipfile.ZipFile(zipfilename) as archive:
    archive.extractall()

In [25]:
# Detect and count lines with missing values.
# Both '?' and the empty string are read as NaN.
na_values = ['?', '']
fields = ['Date', 'Time', 'Global_active_power']
# All the column names available in the file:
# Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_metering_1;Sub_metering_2;Sub_metering_3

df = pd.read_csv(
    filename + '.txt',
    sep=';',
    nrows=200000,
    na_values=na_values,
    usecols=fields,
)

all_present = np.all(pd.notnull(df['Global_active_power']))
print("Are all values correct number? " + str(all_present))

# check example mentioned on the webpage:
print(df.iloc[190500])

# Boolean table of the missing cells; a row is incomplete as soon as one
# of its cells is missing.
missing_mask = df.isnull()
null_data = df[missing_mask.any(axis=1)]

print("There are " + str(missing_mask.sum().sum()) + ' total missing values')
print("There are " + str(null_data.shape[0]) + ' lines with missing values')


Are all values correct number? False
Date                   28/4/2007
Time                    00:24:00
Global_active_power          NaN
Name: 190500, dtype: object
There are 3732 total missing values
There are 3732 lines with missing values


## Question 2 :
Supprimer toutes les lignes avec des valeurs manquantes.

In [26]:
# Keep only the rows without any missing value.
df = df.dropna(axis=0)
# Number of rows left after the removal:
remaining_rows = df.shape[0]
print('The size without the NaNs is now ' + str(remaining_rows))


The size without the NaNs is now 196268


## Question 3 :
 Utiliser ```to_datetime``` et ```set_index``` pour créer un Data Frame (on prendra garde au format des dates internationales qui diffère du format français).


In [27]:
# Dates in the file are day-first (e.g. 28/4/2007) and times are HH:MM:SS.
# An explicit format is unambiguous and fast; it replaces
# `infer_datetime_format`, which recent pandas versions deprecate.
time_improved = pd.to_datetime(df['Date'] + ' ' + df['Time'],
                               format='%d/%m/%Y %H:%M:%S')

# Build the time series without modifying df in place, so that this cell
# can be re-run without failing on the already deleted columns.
ts = (df.drop(['Date', 'Time'], axis=1)
        .assign(DateTime=time_improved)
        .set_index(['DateTime']))

In [28]:
# Show only the first rows: displaying the whole frame is slow and unreadable.
ts.head(10)

Unnamed: 0_level_0,Global_active_power
DateTime,Unnamed: 1_level_1
2006-12-16 17:24:00,4.216
2006-12-16 17:25:00,5.360
2006-12-16 17:26:00,5.374
2006-12-16 17:27:00,5.388
2006-12-16 17:28:00,3.666
2006-12-16 17:29:00,3.520
2006-12-16 17:30:00,3.702
2006-12-16 17:31:00,3.700
2006-12-16 17:32:00,3.668
2006-12-16 17:33:00,3.662


## Question 4 :
Afficher le graphique des moyennes journalières entre le 1er janvier et le 30 avril 2007. Proposer une cause expliquant la consommation fin février et début avril. On pourra utiliser en plus de ```matplotlib``` le package ```seaborn``` pour améliorer le rendu visuel.

In [29]:
# Period of interest, written as ISO dates: unlike '04/30/2007', they cannot
# be misread as day-first or month-first.
day_ini = '2007-01-01'
day_end = '2007-04-30'
small_ts = ts.loc[day_ini:day_end]

# Minute-level consumption over the period.
fig2 = plt.figure(figsize=(6,4))
plt.plot(small_ts['Global_active_power'])
plt.xticks(rotation = 60)
plt.ylabel('kW')
plt.title("Minute consumption between January and April 2007")
plt.show()
# NOTE: this is too dense to visualize correctly

# grouping by day: one mean value per calendar date.
day_ts = small_ts.groupby(small_ts.index.date).mean()
# BEWARE: use day_ts.tail() to check the difference with old command:
# day_ts = small_ts.groupby(pd.TimeGrouper(freq='1D')).sum()
fig3 = plt.figure(figsize=(6,4))
plt.plot(day_ts['Global_active_power'])
plt.xticks(rotation = 60)
plt.ylabel('kW')
plt.title("Day averaged consumption between January and April 2007")
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Les dates avec des creux semblent coïncider avec les vacances scolaires 2007, cf. http://cache.media.education.gouv.fr/file/83/6/5836.pdf

## Question 5 : 
Charger les données avec ```pandas```, et ne garder que les colonnes ```DATE``` et ```TG```.
Diviser par 10 la colonne TG pour obtenir des températures en degrés Celsius.
Traiter les éléments de température aberrants comme des ```NaN```.

In [30]:
# The column headers are written with their leading spaces, as passed to usecols.
colnames = ['    DATE', '   TG']
my_loc = '../datasets/ECAD_2016-09-11.txt' #change according to your settings 
# Lines starting with '#' are skipped; missing measurements are coded as -9999.
temp_df = pd.read_csv(
    my_loc,
    sep=',',
    comment='#',
    usecols=colnames,
    na_values="-9999",
)
temp_df.columns = ['Date', 'Temperature']

# TG must be divided by 10 to obtain degrees Celsius.
temp_df['Temperature'] = temp_df['Temperature'].div(10.)

## Question 6 :
Créer un DataFrame **pandas** des températures journalières entre le 1er janvier et le 30 avril 2007. Afficher sur un même graphique ces températures et la séries *Global_active_power*.

In [31]:
# Dates are stored as YYYYMMDD: parse them, index the frame by date and keep
# the same period as for the power consumption.
temp_df['Date'] = pd.to_datetime(temp_df['Date'], format='%Y%m%d')
temp_ts = temp_df.set_index('Date')
small_temp_ts = temp_ts.loc[day_ini:day_end]

In [32]:
fig, ax1 = plt.subplots()
temp_color, power_color = color_blind_list[0], color_blind_list[1]

# Left axis: daily temperature.
ax1.plot(small_temp_ts, '-', color=temp_color, label='Temperature')
ax1.set_xlabel('Date')  # the x axis holds calendar dates, not seconds
# Make the y-axis label and tick labels match the line color.
ax1.set_ylabel('Temperature (°C)', color=temp_color)
ax1.tick_params(axis='y', labelcolor=temp_color)
plt.xticks(rotation=45)

# Right axis (sharing the same x axis): daily mean of the active power.
ax2 = ax1.twinx()
ax2.plot(day_ts, '-', color=power_color, label='Global active power')
ax2.set_ylabel('Global active power (kW)', color=power_color)
ax2.tick_params(axis='y', labelcolor=power_color)

# A single legend gathering the lines of both axes (two separate legends
# can end up drawn on top of each other).
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2)
plt.show()

<IPython.core.display.Javascript object>

In [33]:
# Quick look at the first rows of the temperatures selected for the plot.
small_temp_ts.head()

Unnamed: 0_level_0,Temperature
Date,Unnamed: 1_level_1
2007-01-01,9.6
2007-01-02,6.0
2007-01-03,6.1
2007-01-04,9.2
2007-01-05,10.0


## Question 7 :
On considère maintenant le jeu de données ```20080421_20160927-PA13_auto.csv```.
Proposer une visualisation de la pollution pour l'ozone sur la période d'étude.

In [34]:
my_file ='../datasets/20080421_20160927-PA13_auto.csv'

# The first two lines of the file are skipped and the columns are named
# explicitly; "n/d" marks a missing measurement.
polution_df = pd.read_csv(my_file, sep=';',
                          na_values="n/d",
                          converters={'heure': str},
                          skiprows=2,
                          names=["date", "heure", "NO2", "O3"],
                         )

# Hour "24" is not a valid clock hour. It is treated here as midnight of the
# *following* day, by adding the hour to the date as a duration; mapping it
# to 00:00 of the same day would place the last measurement of a day before
# the first one.
# NOTE(review): this presumes the hours are numbered 1 to 24 in the file;
# confirm against the dataset description.
time_improved = (pd.to_datetime(polution_df['date'], format='%d/%m/%Y')
                 + pd.to_timedelta(polution_df['heure'].astype(int), unit='h'))


polution_df['DateTime'] = time_improved
del polution_df['heure']
del polution_df['date']

# Time-indexed, chronologically sorted version used for the plots.
polution_ts = polution_df.set_index(['DateTime'])
polution_ts = polution_ts.sort_index()
polution_df.head()


Unnamed: 0,NO2,O3,DateTime
0,13.0,74.0,2008-04-21 01:00:00
1,11.0,73.0,2008-04-21 02:00:00
2,13.0,64.0,2008-04-21 03:00:00
3,23.0,46.0,2008-04-21 04:00:00
4,47.0,24.0,2008-04-21 05:00:00


In [35]:
# Hourly ozone concentration over the whole period. The curve needs a label,
# otherwise plt.legend() has nothing to display.
fig1 = plt.figure(figsize=(8,3))
plt.plot(polution_ts['O3'], label='O3')
plt.legend()
plt.ylabel('Concentration ($\\mu g/m^3$)')
plt.title('Polution over the years in Paris')
plt.show()

<IPython.core.display.Javascript object>

## Question 8 :
 Proposer une visualisation de la pollution la plus critique par année pour l'ozone et pour le
dioxyde d'azote.

In [36]:
def _yearly_stat(ts, how):
    """Aggregate a time-indexed frame per calendar year.

    `how` is the name of the aggregation ('mean', 'max', ...). The result
    has a 'year' column holding a timestamp (January 1st of each year)
    next to the aggregated columns.
    """
    yearly = ts.groupby(ts.index.year).agg(how)
    yearly.index.name = 'year'
    yearly = yearly.reset_index()
    yearly['year'] = pd.to_datetime(yearly['year'], format='%Y')
    return yearly


year_ts_mean = _yearly_stat(polution_ts, 'mean')
final_year_ts_mean = year_ts_mean.set_index(['year'])

year_ts_max = _yearly_stat(polution_ts, 'max')
final_year_ts_max = year_ts_max.set_index(['year'])

# One figure per statistic; the curve is labelled so that the legend is not empty.
for yearly, title in ((year_ts_mean, 'Average polution'),
                      (year_ts_max, 'Peak polution')):
    fig3 = plt.figure(figsize=(8,3))
    plt.plot(yearly['year'], yearly['O3'], label='O3')
    plt.title(title)
    plt.ylabel('Concentration ($\\mu g/m^3$)')
    plt.legend()
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
# Yearly mean and yearly peak of NO2, one figure each; the curve is labelled
# so that the legend is not empty.
for yearly, title in ((year_ts_mean, 'Average polution'),
                      (year_ts_max, 'Peak polution')):
    fig3 = plt.figure(figsize=(8,3))
    plt.plot(yearly['year'], yearly['NO2'], label='NO2')
    plt.title(title)
    plt.ylabel('Concentration ($\\mu g/m^3$)')
    plt.legend()
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Question 9 :
Donner une représentation par mois de la pollution. Quel est le mois le plus pollué pour l'ozone, pour le dioxyde d'azote?

In [38]:
# study month-wide: mean concentration per calendar month, all years pooled.
month_ts = polution_ts.groupby(polution_ts.index.month).mean()
month_ts.index.name = 'month'
month_ts = month_ts.reset_index()

# After reset_index the rows are numbered from 0, which is what the tick
# positions below refer to. Each curve is labelled so that the legend can
# tell the two pollutants apart.
fig3 = plt.figure()
plt.plot(month_ts['O3'], label='O3')
plt.plot(month_ts['NO2'], label='NO2')
plt.xticks(range(0, 12), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=45)
plt.ylabel('Concentration ($\\mu g/m^3$)')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

Conclusion: pour l'ozone le mois le pire est le mois de mai et pour le dioxyde d'azote le mois de janvier.