# Air quality analysis in Barcelona

## 1. Data import and cleaning

In [2]:
## import libraries

import pandas as pd 
from sqlalchemy import engine
import pymysql


In [3]:
raw_data = pd.read_csv('air_quality_Nov2017.csv')

In [4]:
raw_data.head()

Unnamed: 0,Station,Air Quality,Longitude,Latitude,O3 Hour,O3 Quality,O3 Value,NO2 Hour,NO2 Quality,NO2 Value,PM10 Hour,PM10 Quality,PM10 Value,Generated,Date Time
0,Barcelona - Sants,Good,2.1331,41.3788,,,,0h,Good,84.0,,,,01/11/2018 0:00,1541027104
1,Barcelona - Eixample,Moderate,2.1538,41.3853,0h,Good,1.0,0h,Moderate,113.0,0h,Good,36.0,01/11/2018 0:00,1541027104
2,Barcelona - Gràcia,Good,2.1534,41.3987,0h,Good,10.0,0h,Good,73.0,,,,01/11/2018 0:00,1541027104
3,Barcelona - Ciutadella,Good,2.1874,41.3864,0h,Good,2.0,0h,Good,86.0,,,,01/11/2018 0:00,1541027104
4,Barcelona - Vall Hebron,Good,2.148,41.4261,0h,Good,7.0,0h,Good,69.0,,,,01/11/2018 0:00,1541027104


We remove the columns that are not needed for our analysis. In this case, the columns that are needed are: 
- Station
- Air Quality 
- O3 Quality
- 03 Value
- NO2 Quality
- NO2 Value
- PM10 Quality
- PM10 Value 
- Generated 
- Date Time 

In [5]:
print(raw_data.columns.tolist())

['Station', 'Air Quality', 'Longitude', 'Latitude', 'O3 Hour', 'O3 Quality', 'O3 Value', 'NO2 Hour', 'NO2 Quality', 'NO2 Value', 'PM10 Hour', 'PM10 Quality', 'PM10 Value', 'Generated', 'Date Time']


In [6]:
raw_data_selected = raw_data.copy()

raw_data_selected = raw_data_selected[['Station', 'Air Quality', 'O3 Quality', 'O3 Value', 'NO2 Quality', 'NO2 Value', 'PM10 Quality', 'PM10 Value', 'Generated']]

raw_data_selected.head()

Unnamed: 0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Generated
0,Barcelona - Sants,Good,,,Good,84.0,,,01/11/2018 0:00
1,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0,01/11/2018 0:00
2,Barcelona - Gràcia,Good,Good,10.0,Good,73.0,,,01/11/2018 0:00
3,Barcelona - Ciutadella,Good,Good,2.0,Good,86.0,,,01/11/2018 0:00
4,Barcelona - Vall Hebron,Good,Good,7.0,Good,69.0,,,01/11/2018 0:00


### Analyzing the type of each attribute of the dataset

In [7]:

raw_data_selected.dtypes

Station          object
Air Quality      object
O3 Quality       object
O3 Value        float64
NO2 Quality      object
NO2 Value       float64
PM10 Quality     object
PM10 Value      float64
Generated        object
dtype: object

### Changing type of date to DateTime

In [8]:
raw_data_selected['date_corrected'] = pd.to_datetime(raw_data_selected["Generated"])

raw_data_selected.set_index("date_corrected", inplace=True) # Inplace reassign dataset to indexed dataset. Is as if dataset_NEW=dataset_OLD

raw_data_selected = raw_data_selected.drop(["Generated"], axis=1)

In [9]:
raw_data_selected.index

DatetimeIndex(['2018-01-11 00:00:00', '2018-01-11 00:00:00',
               '2018-01-11 00:00:00', '2018-01-11 00:00:00',
               '2018-01-11 00:00:00', '2018-01-11 00:00:00',
               '2018-01-11 00:00:00', '2018-01-11 00:00:00',
               '2018-01-11 01:00:00', '2018-01-11 01:00:00',
               ...
               '2018-11-30 22:00:00', '2018-11-30 22:00:00',
               '2018-11-30 23:00:00', '2018-11-30 23:00:00',
               '2018-11-30 23:00:00', '2018-11-30 23:00:00',
               '2018-11-30 23:00:00', '2018-11-30 23:00:00',
               '2018-11-30 23:00:00', '2018-11-30 23:00:00'],
              dtype='datetime64[ns]', name='date_corrected', length=5744, freq=None)

In [10]:
raw_data_selected

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Barcelona - Sants,Good,,,Good,84.0,,
2018-01-11 00:00:00,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0
2018-01-11 00:00:00,Barcelona - Gràcia,Good,Good,10.0,Good,73.0,,
2018-01-11 00:00:00,Barcelona - Ciutadella,Good,Good,2.0,Good,86.0,,
2018-01-11 00:00:00,Barcelona - Vall Hebron,Good,Good,7.0,Good,69.0,,
2018-01-11 00:00:00,Barcelona - Palau Reial,Good,Good,11.0,Good,57.0,Good,23.0
2018-01-11 00:00:00,Barcelona - Poblenou,Good,,,Good,86.0,Good,32.0
2018-01-11 00:00:00,Barcelona - Observ Fabra,Good,Good,58.0,Good,3.0,Good,25.0
2018-01-11 01:00:00,Barcelona - Sants,Good,,,Good,62.0,,
2018-01-11 01:00:00,Barcelona - Eixample,Good,Good,6.0,Good,80.0,Good,35.0


### General description in Barcelona

In [11]:
raw_data_selected.describe()

Unnamed: 0,O3 Value,NO2 Value,PM10 Value
count,4101.0,5460.0,3647.0
mean,34.082907,35.740293,16.590074
std,22.960687,22.357262,8.065424
min,1.0,1.0,2.0
25%,14.0,17.0,10.0
50%,34.0,33.0,15.0
75%,52.0,52.0,22.0
max,100.0,117.0,44.0


From this table we can say that we have missing values in the general table. However, a general description of Barcelona can be done. The average values for November 2018 in Barcelona are: 
- O3: 34 ug/cm3
- NO2: 35.74 ug/cm3
- PM10: 16.59 ug/cm3

In average terms, Barcelona had a very low qualitative name for O3, very low for NO2 and low for PM10. But, what is the situation in Barcelona? 

### Where has the maximum value of pollution happened in Barcelona? 

In [12]:
# For O3

raw_data_selected[raw_data_selected['O3 Value']==raw_data_selected['O3 Value'].max()]

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-11-29 22:00:00,Barcelona - Observ Fabra,Good,Good,100.0,Good,4.0,Good,13.0


In [13]:
# For NO2

raw_data_selected[raw_data_selected['NO2 Value']==raw_data_selected['NO2 Value'].max()]

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-11-22 11:00:00,Barcelona - Gràcia,Moderate,Good,3.0,Moderate,117.0,Good,12.0


In [14]:
# For PM10
raw_data_selected[raw_data_selected['NO2 Value']==raw_data_selected['NO2 Value'].max()]

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-11-22 11:00:00,Barcelona - Gràcia,Moderate,Good,3.0,Moderate,117.0,Good,12.0


### Missing values

There are several ways to deal with missing values, in this case NaN values.
First of all, we need to know how many NaN values we have in our dataset. 

In [15]:
raw_data_selected.isna().sum()

Station            0
Air Quality        0
O3 Quality      1476
O3 Value        1643
NO2 Quality       55
NO2 Value        284
PM10 Quality    2022
PM10 Value      2097
dtype: int64

#### Method 1: dropping all the NaN values of the entire dataset. 

In this case we have removed all the rows that have a "NaN" value there. If we perform an analysis of what we have done, we can see the total amount of data we have lost 

In [16]:
before_rows = raw_data_selected.shape[0]
print(f"Number of rows before dropping all the NaN values: {before_rows}")

Number of rows before dropping all the NaN values: 5744


In [17]:
clean_air_quality =  raw_data_selected.copy()
clean_air_quality = raw_data_selected.dropna()
clean_air_quality.head()

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0
2018-01-11 00:00:00,Barcelona - Palau Reial,Good,Good,11.0,Good,57.0,Good,23.0
2018-01-11 00:00:00,Barcelona - Observ Fabra,Good,Good,58.0,Good,3.0,Good,25.0
2018-01-11 01:00:00,Barcelona - Eixample,Good,Good,6.0,Good,80.0,Good,35.0
2018-01-11 01:00:00,Barcelona - Palau Reial,Good,Good,27.0,Good,38.0,Good,24.0


In [18]:
after_rows = clean_air_quality.shape[0]
print (f"Number of rows after dropping all the NaN values: {after_rows}")

Number of rows after dropping all the NaN values: 2853


In [19]:
data_lost_perc = ((before_rows - after_rows)/before_rows)*100
print(f"The percentage of data that is NaN and has been removed is {data_lost_perc}")


The percentage of data that is NaN and has been removed is 50.330779944289695


If we do it like this, we remove half of the data. 

#### Method 2: Instead of NaN removal, values interpolation 

#### Method 3: Instead of NaN removal, choose the last value of each station 

## Database labeling 

In [20]:
labels = {"Barcelona - Ciutadella":1, "Barcelona - Eixample":2, "Barcelona - Gràcia":3, "Barcelona - Palau Reial":4, 
         "Barcelona - Poblenou": 5, "Barcelona - Sants": 6, "Barcelona - Vall Hebron": 7, 
         "Barcelona - Observ Fabra":8  }

In [21]:
raw_data_selected.head()

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11,Barcelona - Sants,Good,,,Good,84.0,,
2018-01-11,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0
2018-01-11,Barcelona - Gràcia,Good,Good,10.0,Good,73.0,,
2018-01-11,Barcelona - Ciutadella,Good,Good,2.0,Good,86.0,,
2018-01-11,Barcelona - Vall Hebron,Good,Good,7.0,Good,69.0,,


In [22]:
station_ID=[labels[i] for i in raw_data_selected["Station"]]

raw_data_selected["Station_ID"] = station_ID

raw_data_selected.head()

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-11,Barcelona - Sants,Good,,,Good,84.0,,,6
2018-01-11,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0,2
2018-01-11,Barcelona - Gràcia,Good,Good,10.0,Good,73.0,,,3
2018-01-11,Barcelona - Ciutadella,Good,Good,2.0,Good,86.0,,,1
2018-01-11,Barcelona - Vall Hebron,Good,Good,7.0,Good,69.0,,,7


In [23]:
raw_data_selected = raw_data_selected.drop("Station", axis = 1)

In [24]:
raw_data_selected.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11,Good,,,Good,84.0,,,6
2018-01-11,Moderate,Good,1.0,Moderate,113.0,Good,36.0,2
2018-01-11,Good,Good,10.0,Good,73.0,,,3
2018-01-11,Good,Good,2.0,Good,86.0,,,1
2018-01-11,Good,Good,7.0,Good,69.0,,,7


In [25]:
news_columns = ["Station_ID", 'Air Quality', 'O3 Quality', 'O3 Value', 'NO2 Quality', 'NO2 Value', 'PM10 Quality', 'PM10 Value']

In [26]:
Barcelona_air_quality = raw_data_selected.reindex(columns=news_columns)

Barcelona_air_quality.head()

Unnamed: 0_level_0,Station_ID,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11,6,Good,,,Good,84.0,,
2018-01-11,2,Moderate,Good,1.0,Moderate,113.0,Good,36.0
2018-01-11,3,Good,Good,10.0,Good,73.0,,
2018-01-11,1,Good,Good,2.0,Good,86.0,,
2018-01-11,7,Good,Good,7.0,Good,69.0,,


### Secondary table for Station ID

In [27]:
station_info = pd.read_csv('air_stations_Nov2017.csv')
station_info

Unnamed: 0,Station,Longitude,Latitude,Ubication,District Name,Neighborhood Name
0,Barcelona - Ciutadella,2.1874,41.3864,Parc de la Ciutadella,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera"
1,Barcelona - Eixample,2.1538,41.3853,Av. Roma - c/ Comte Urgell,Eixample,la Nova Esquerra de l'Eixample
2,Barcelona - Gràcia,2.1534,41.3987,Plaça Gal·la Placídia (Via Augusta - Travesser...,Gracia,la Vila de Gracia
3,Barcelona - Palau Reial,2.1151,41.3875,c/ John Maynard Keynes - c/ de Jordi Girona,Les Corts,Pedralbes
4,Barcelona - Poblenou,2.2045,41.4039,Plaça Josep Trueta (Pujades - Lope de Vega),Sant Marti,el Poblenou
5,Barcelona - Sants,2.1331,41.3788,Jardins de Can Mantega (Joan Güell - Violant d...,Sants-Montjuic,Sants
6,Barcelona - Vall Hebron,2.148,41.4261,Parc de la Vall d’Hebron - c/ Martí Codolar - ...,Horta-Guinardo,la Vall d'Hebron
7,Barcelona – Observ Fabra,2.1211,41.4176,"Ctra Observatori Fabra, 27",Sarrià-Sant Gervasi,"Vallvidrera, el Tibidabo i les Planes"


In [28]:
station_ID_num= [1,2,3,4,5,6,7,8]

station_info['station_ID']= station_ID_num
station_info

Unnamed: 0,Station,Longitude,Latitude,Ubication,District Name,Neighborhood Name,station_ID
0,Barcelona - Ciutadella,2.1874,41.3864,Parc de la Ciutadella,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1
1,Barcelona - Eixample,2.1538,41.3853,Av. Roma - c/ Comte Urgell,Eixample,la Nova Esquerra de l'Eixample,2
2,Barcelona - Gràcia,2.1534,41.3987,Plaça Gal·la Placídia (Via Augusta - Travesser...,Gracia,la Vila de Gracia,3
3,Barcelona - Palau Reial,2.1151,41.3875,c/ John Maynard Keynes - c/ de Jordi Girona,Les Corts,Pedralbes,4
4,Barcelona - Poblenou,2.2045,41.4039,Plaça Josep Trueta (Pujades - Lope de Vega),Sant Marti,el Poblenou,5
5,Barcelona - Sants,2.1331,41.3788,Jardins de Can Mantega (Joan Güell - Violant d...,Sants-Montjuic,Sants,6
6,Barcelona - Vall Hebron,2.148,41.4261,Parc de la Vall d’Hebron - c/ Martí Codolar - ...,Horta-Guinardo,la Vall d'Hebron,7
7,Barcelona – Observ Fabra,2.1211,41.4176,"Ctra Observatori Fabra, 27",Sarrià-Sant Gervasi,"Vallvidrera, el Tibidabo i les Planes",8


In [29]:
station_info = station_info.set_index("station_ID")
station_info

Unnamed: 0_level_0,Station,Longitude,Latitude,Ubication,District Name,Neighborhood Name
station_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Barcelona - Ciutadella,2.1874,41.3864,Parc de la Ciutadella,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera"
2,Barcelona - Eixample,2.1538,41.3853,Av. Roma - c/ Comte Urgell,Eixample,la Nova Esquerra de l'Eixample
3,Barcelona - Gràcia,2.1534,41.3987,Plaça Gal·la Placídia (Via Augusta - Travesser...,Gracia,la Vila de Gracia
4,Barcelona - Palau Reial,2.1151,41.3875,c/ John Maynard Keynes - c/ de Jordi Girona,Les Corts,Pedralbes
5,Barcelona - Poblenou,2.2045,41.4039,Plaça Josep Trueta (Pujades - Lope de Vega),Sant Marti,el Poblenou
6,Barcelona - Sants,2.1331,41.3788,Jardins de Can Mantega (Joan Güell - Violant d...,Sants-Montjuic,Sants
7,Barcelona - Vall Hebron,2.148,41.4261,Parc de la Vall d’Hebron - c/ Martí Codolar - ...,Horta-Guinardo,la Vall d'Hebron
8,Barcelona – Observ Fabra,2.1211,41.4176,"Ctra Observatori Fabra, 27",Sarrià-Sant Gervasi,"Vallvidrera, el Tibidabo i les Planes"


## 2. Database connection

In [30]:
## Import libraries
from sqlalchemy import create_engine

In [31]:
# Workbench Databench 
driver = 'mysql+pymysql'
user = 'owner'
password = 'ir0n-h4ck'
ip = '104.197.101.244'
database = 'Pollution'

In [32]:
# connection_string to connect to Workbench Database 
connection_string = f'{driver}://{user}:{password}@{ip}/{database}'

In [33]:
# Engine creation 
engine = create_engine(connection_string)

ModuleNotFoundError: No module named 'pymysql'

In [None]:
#uploading station_info dataset into mySQL Database 
station_info.to_sql('station_info', engine)

In [None]:
#uploading raw_data_selected dataset into mySQL Database 
raw_data_selected.to_sql("raw_data_selected", engine)

## Data splitting per station

Stations we have in Barcelona: 
1. Barcelona - Ciutadella 
2. Barcelona - Eixample
3. Barcelona - Gràcia 
4. Barcelona - Palau Reial
5. Barcelona - Poblenou
6. Barcelona - Sants
7. Barcelona - Vall Hebron
8. Barcelona - Vallvidrera, el Tibidabo i les Planes

#### BCN- Ciutadella 

In [34]:
BCN_Ciutadella = clean_air_quality[clean_air_quality["Station"]=="Barcelona - Ciutadella"]

BCN_Ciutadella.head()

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


#### BCN-Eixample

In [35]:
#BCN_Sants = clean_air_quality[clean_air_quality["Station"] == "Barcelona - Sants"]

#BCN_Sants

#BCN_Eixample = 

#clean_air_quality.head()

BCN_Eixample = clean_air_quality[clean_air_quality["Station"]=="Barcelona - Eixample"]

BCN_Eixample.head()

Unnamed: 0_level_0,Station,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Barcelona - Eixample,Moderate,Good,1.0,Moderate,113.0,Good,36.0
2018-01-11 01:00:00,Barcelona - Eixample,Good,Good,6.0,Good,80.0,Good,35.0
2018-01-11 02:00:00,Barcelona - Eixample,Good,Good,14.0,Good,64.0,Good,34.0
2018-01-11 03:00:00,Barcelona - Eixample,Good,Good,32.0,Good,44.0,Good,35.0
2018-01-11 04:00:00,Barcelona - Eixample,Good,Good,43.0,Good,34.0,Good,34.0


In order to solve the problem and not lose any information when we remove NaN values, we will work with the raw_data_selected

### Station_1 : Barcelona - Ciutadella 

In [36]:
BCN_Ciutadella = raw_data_selected[raw_data_selected["Station_ID"]==1]

BCN_Ciutadella.head()


Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Good,Good,2.0,Good,86.0,,,1
2018-01-11 01:00:00,Good,Good,4.0,Good,78.0,,,1
2018-01-11 02:00:00,Good,Good,24.0,Good,56.0,,,1
2018-01-11 03:00:00,Good,Good,45.0,Good,29.0,,,1
2018-01-11 04:00:00,Good,Good,61.0,Good,19.0,,,1


In [37]:
BCN_Ciutadella.shape

(718, 8)

In [38]:
BCN_Ciutadella.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 8 columns):
Air Quality     718 non-null object
O3 Quality      714 non-null object
O3 Value        701 non-null float64
NO2 Quality     714 non-null object
NO2 Value       701 non-null float64
PM10 Quality    0 non-null object
PM10 Value      0 non-null float64
Station_ID      718 non-null int64
dtypes: float64(3), int64(1), object(4)
memory usage: 50.5+ KB


In [39]:
BCN_Ciutadella_cleaned = BCN_Ciutadella.drop(["PM10 Quality", "PM10 Value"], axis=1)

BCN_Ciutadella_cleaned.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-11 00:00:00,Good,Good,2.0,Good,86.0,1
2018-01-11 01:00:00,Good,Good,4.0,Good,78.0,1
2018-01-11 02:00:00,Good,Good,24.0,Good,56.0,1
2018-01-11 03:00:00,Good,Good,45.0,Good,29.0,1
2018-01-11 04:00:00,Good,Good,61.0,Good,19.0,1


In [40]:
BCN_Ciutadella_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 6 columns):
Air Quality    718 non-null object
O3 Quality     714 non-null object
O3 Value       701 non-null float64
NO2 Quality    714 non-null object
NO2 Value      701 non-null float64
Station_ID     718 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 39.3+ KB


In this case, if we remove the entire rows where NaN are located, we only lose approximately 2,7% of the data. 

In [41]:
BCN_Ciutadella_cleaned = BCN_Ciutadella_cleaned.dropna()

In [42]:
BCN_Ciutadella_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 701 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 6 columns):
Air Quality    701 non-null object
O3 Quality     701 non-null object
O3 Value       701 non-null float64
NO2 Quality    701 non-null object
NO2 Value      701 non-null float64
Station_ID     701 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 38.3+ KB


In [43]:
BCN_Ciutadella_cleaned.describe()

Unnamed: 0,O3 Value,NO2 Value,Station_ID
count,701.0,701.0,701.0
mean,25.128388,39.85592,1.0
std,19.75234,18.106212,0.0
min,1.0,5.0,1.0
25%,6.0,25.0,1.0
50%,22.0,40.0,1.0
75%,41.0,53.0,1.0
max,85.0,88.0,1.0


### Station_2 : Barcelona - Eixample 

In [44]:
BCN_Eixample = raw_data_selected[raw_data_selected["Station_ID"]==2]

BCN_Eixample.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Moderate,Good,1.0,Moderate,113.0,Good,36.0,2
2018-01-11 01:00:00,Good,Good,6.0,Good,80.0,Good,35.0,2
2018-01-11 02:00:00,Good,Good,14.0,Good,64.0,Good,34.0,2
2018-01-11 03:00:00,Good,Good,32.0,Good,44.0,Good,35.0,2
2018-01-11 04:00:00,Good,Good,43.0,Good,34.0,Good,34.0,2


In [46]:
BCN_Eixample.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 8 columns):
Air Quality     718 non-null object
O3 Quality      711 non-null object
O3 Value        645 non-null float64
NO2 Quality     711 non-null object
NO2 Value       644 non-null float64
PM10 Quality    718 non-null object
PM10 Value      718 non-null float64
Station_ID      718 non-null int64
dtypes: float64(3), int64(1), object(4)
memory usage: 50.5+ KB


In [50]:
BCN_Eixample_cleaned = BCN_Eixample.dropna()
BCN_Eixample_cleaned.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Moderate,Good,1.0,Moderate,113.0,Good,36.0,2
2018-01-11 01:00:00,Good,Good,6.0,Good,80.0,Good,35.0,2
2018-01-11 02:00:00,Good,Good,14.0,Good,64.0,Good,34.0,2
2018-01-11 03:00:00,Good,Good,32.0,Good,44.0,Good,35.0,2
2018-01-11 04:00:00,Good,Good,43.0,Good,34.0,Good,34.0,2


In [51]:
BCN_Eixample_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 644 entries, 2018-01-11 00:00:00 to 2018-11-30 21:00:00
Data columns (total 8 columns):
Air Quality     644 non-null object
O3 Quality      644 non-null object
O3 Value        644 non-null float64
NO2 Quality     644 non-null object
NO2 Value       644 non-null float64
PM10 Quality    644 non-null object
PM10 Value      644 non-null float64
Station_ID      644 non-null int64
dtypes: float64(3), int64(1), object(4)
memory usage: 45.3+ KB


In [52]:
BCN_Eixample_cleaned.describe()

Unnamed: 0,O3 Value,NO2 Value,PM10 Value,Station_ID
count,644.0,644.0,644.0,644.0
mean,17.015528,56.240683,23.02795,2.0
std,13.911062,20.171647,8.534943,0.0
min,1.0,17.0,11.0,2.0
25%,4.0,41.0,16.0,2.0
50%,14.0,54.0,22.0,2.0
75%,27.25,71.0,29.0,2.0
max,61.0,113.0,44.0,2.0


### Station_3 : Barcelona - Gracia

In [60]:
BCN_Gracia

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Good,Good,10.0,Good,73.0,,,3
2018-01-11 01:00:00,Good,Good,21.0,Good,59.0,,,3
2018-01-11 02:00:00,Good,Good,32.0,Good,41.0,,,3
2018-01-11 03:00:00,Good,Good,48.0,Good,27.0,,,3
2018-01-11 04:00:00,Good,Good,58.0,Good,18.0,,,3
2018-01-11 05:00:00,Good,Good,58.0,Good,16.0,,,3
2018-01-11 06:00:00,Good,Good,51.0,Good,19.0,,,3
2018-01-11 07:00:00,Good,Good,45.0,Good,19.0,,,3
2018-01-11 08:00:00,Good,Good,39.0,Good,21.0,,,3
2018-01-11 09:00:00,Good,Good,17.0,Good,43.0,,,3


In [53]:
BCN_Gracia = raw_data_selected[raw_data_selected["Station_ID"]==3]

BCN_Gracia.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-11 00:00:00,Good,Good,10.0,Good,73.0,,,3
2018-01-11 01:00:00,Good,Good,21.0,Good,59.0,,,3
2018-01-11 02:00:00,Good,Good,32.0,Good,41.0,,,3
2018-01-11 03:00:00,Good,Good,48.0,Good,27.0,,,3
2018-01-11 04:00:00,Good,Good,58.0,Good,18.0,,,3


In [54]:
BCN_Gracia.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 8 columns):
Air Quality     718 non-null object
O3 Quality      707 non-null object
O3 Value        644 non-null float64
NO2 Quality     707 non-null object
NO2 Value       644 non-null float64
PM10 Quality    211 non-null object
PM10 Value      211 non-null float64
Station_ID      718 non-null int64
dtypes: float64(3), int64(1), object(4)
memory usage: 50.5+ KB


In this case, if we remove the entire columns for 'PM10', as otherwise we loose more than 70% of the data.

In [55]:
BCN_Gracia_cleaned = BCN_Gracia.drop(["PM10 Quality", "PM10 Value"], axis=1)

BCN_Gracia_cleaned.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-11 00:00:00,Good,Good,10.0,Good,73.0,3
2018-01-11 01:00:00,Good,Good,21.0,Good,59.0,3
2018-01-11 02:00:00,Good,Good,32.0,Good,41.0,3
2018-01-11 03:00:00,Good,Good,48.0,Good,27.0,3
2018-01-11 04:00:00,Good,Good,58.0,Good,18.0,3


In [56]:
BCN_Gracia_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2018-01-11 00:00:00 to 2018-11-30 23:00:00
Data columns (total 6 columns):
Air Quality    718 non-null object
O3 Quality     707 non-null object
O3 Value       644 non-null float64
NO2 Quality    707 non-null object
NO2 Value      644 non-null float64
Station_ID     718 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 39.3+ KB


In [57]:
BCN_Gracia_cleaned = BCN_Gracia.dropna()
BCN_Gracia_cleaned.head()

Unnamed: 0_level_0,Air Quality,O3 Quality,O3 Value,NO2 Quality,NO2 Value,PM10 Quality,PM10 Value,Station_ID
date_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-11-22 05:00:00,Good,Good,15.0,Good,50.0,Good,7.0,3
2018-11-22 06:00:00,Good,Good,6.0,Good,53.0,Good,7.0,3
2018-11-22 07:00:00,Good,Good,1.0,Good,55.0,Good,8.0,3
2018-11-22 08:00:00,Good,Good,1.0,Good,74.0,Good,8.0,3
2018-11-22 09:00:00,Moderate,Good,2.0,Moderate,99.0,Good,9.0,3


In [58]:
BCN_Gracia_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 203 entries, 2018-11-22 05:00:00 to 2018-11-30 23:00:00
Data columns (total 8 columns):
Air Quality     203 non-null object
O3 Quality      203 non-null object
O3 Value        203 non-null float64
NO2 Quality     203 non-null object
NO2 Value       203 non-null float64
PM10 Quality    203 non-null object
PM10 Value      203 non-null float64
Station_ID      203 non-null int64
dtypes: float64(3), int64(1), object(4)
memory usage: 14.3+ KB


In [61]:
BCN_Gracia_cleaned.describe()

Unnamed: 0,O3 Value,NO2 Value,PM10 Value,Station_ID
count,203.0,203.0,203.0,203.0
mean,17.147783,52.339901,16.807882,3.0
std,13.120079,23.572465,6.035172,0.0
min,1.0,11.0,7.0,3.0
25%,4.0,34.0,12.0,3.0
50%,15.0,49.0,17.0,3.0
75%,27.0,68.5,21.0,3.0
max,49.0,117.0,31.0,3.0


In [None]:
Barcelona - Ciutadella
Barcelona - Eixample
Barcelona - Gràcia
Barcelona - Palau Reial
Barcelona - Poblenou
Barcelona - Sants
Barcelona - Vall Hebron
Barcelona - Vallvidrera, el Tibidabo i les Planes