# <h1> Particulate Matter 2.5 in DMV area during COVID-19 Pandemic</h1>
# Capston Project



### Goal: To investigate Particulate Matter 2.5 concentrations and contributing factors during the COVID-19 pandemic and predict the trends over various spans of time proceeding the pandemic.

### 1. Step one: Defining the question

       -   What is the PM2.5 in DC Metro areas?
       -   What are contribtion factors? 
       -   What is the trend during the Pandemic  


In [310]:
import pandas as pd
import numpy as np
import hvplot.pandas 
import bokeh.plotting
import holoviews as hv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn import svm
from sklearn.metrics import accuracy_score

## Uploading data 

In [311]:
dfc=pd.read_csv('co_daily_3yr.csv')
dfc.head(5)

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Parameter Name,Sample Duration,Pollutant Standard,Date Local,Units of Measure,...,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Local Site Name,State Name,County Name,City Name,CBSA Name,Date of Last Change
0,24,5,3001,39.310833,-76.474444,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/1/2019,Parts per million,...,0.2415,0.248,1,,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
1,24,5,3001,39.310833,-76.474444,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/8/2019,Parts per million,...,0.29481,0.617,7,,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
2,24,5,3001,39.310833,-76.474444,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/9/2019,Parts per million,...,0.15487,0.195,0,,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
3,24,5,3001,39.310833,-76.474444,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/10/2019,Parts per million,...,0.146875,0.174,7,,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
4,24,5,3001,39.310833,-76.474444,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/11/2019,Parts per million,...,0.28525,1.263,23,,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020


### Data Pre-processing 
#### Identify and  Handling Missing value 
#### Data Formatting
#### Data Normalization 
#### Data Binning

In [312]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9026 entries, 0 to 9025
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   State Code           9026 non-null   int64  
 1   County Code          9026 non-null   int64  
 2   Site Num             9026 non-null   int64  
 3   Latitude             9026 non-null   float64
 4   Longitude            9026 non-null   float64
 5   Parameter Name       9026 non-null   object 
 6   Sample Duration      9026 non-null   object 
 7   Pollutant Standard   9026 non-null   object 
 8   Date Local           9026 non-null   object 
 9   Units of Measure     9026 non-null   object 
 10  Observation Count    9026 non-null   int64  
 11  Observation Percent  9026 non-null   int64  
 12  Arithmetic Mean      9026 non-null   float64
 13  1st Max Value        9026 non-null   float64
 14  1st Max Hour         9026 non-null   int64  
 15  AQI                  4514 non-null   f

In [313]:
dfc.shape

(9026, 22)

In [314]:
# Check the missing value
dfc.isnull()

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Parameter Name,Sample Duration,Pollutant Standard,Date Local,Units of Measure,...,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Local Site Name,State Name,County Name,City Name,CBSA Name,Date of Last Change
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9021,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9022,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9023,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9024,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [315]:
dfc.columns

Index(['State Code', 'County Code', 'Site Num', 'Latitude', 'Longitude',
       'Parameter Name', 'Sample Duration', 'Pollutant Standard', 'Date Local',
       'Units of Measure', 'Observation Count', 'Observation Percent',
       'Arithmetic Mean', '1st Max Value', '1st Max Hour', 'AQI',
       'Local Site Name', 'State Name', 'County Name', 'City Name',
       'CBSA Name', 'Date of Last Change'],
      dtype='object')

In [316]:
dfc.count()

State Code             9026
County Code            9026
Site Num               9026
Latitude               9026
Longitude              9026
Parameter Name         9026
Sample Duration        9026
Pollutant Standard     9026
Date Local             9026
Units of Measure       9026
Observation Count      9026
Observation Percent    9026
Arithmetic Mean        9026
1st Max Value          9026
1st Max Hour           9026
AQI                    4514
Local Site Name        9026
State Name             9026
County Name            9026
City Name              9026
CBSA Name              8400
Date of Last Change    9026
dtype: int64

In [317]:
dfc.dropna()

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Parameter Name,Sample Duration,Pollutant Standard,Date Local,Units of Measure,...,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Local Site Name,State Name,County Name,City Name,CBSA Name,Date of Last Change
348,24,5,3001,39.310833,-76.474444,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,1/8/2019,Parts per million,...,0.293750,0.4,13,5.0,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
349,24,5,3001,39.310833,-76.474444,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,1/9/2019,Parts per million,...,0.204167,0.3,0,3.0,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
350,24,5,3001,39.310833,-76.474444,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,1/10/2019,Parts per million,...,0.150000,0.2,11,2.0,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
351,24,5,3001,39.310833,-76.474444,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,1/11/2019,Parts per million,...,0.195833,0.5,23,6.0,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
352,24,5,3001,39.310833,-76.474444,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,1/12/2019,Parts per million,...,0.450000,0.8,1,9.0,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9021,11,1,51,38.894770,-76.953426,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,12/27/2019,Parts per million,...,0.458333,1.1,23,13.0,Near Road,District Of Columbia,District of Columbia,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",2/7/2020
9022,11,1,51,38.894770,-76.953426,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,12/28/2019,Parts per million,...,0.979167,1.4,3,16.0,Near Road,District Of Columbia,District of Columbia,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",2/7/2020
9023,11,1,51,38.894770,-76.953426,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,12/29/2019,Parts per million,...,0.487500,1.0,0,11.0,Near Road,District Of Columbia,District of Columbia,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",2/7/2020
9024,11,1,51,38.894770,-76.953426,Carbon monoxide,8-HR RUN AVG END HOUR,CO 8-hour 1971,12/30/2019,Parts per million,...,0.283333,0.3,0,3.0,Near Road,District Of Columbia,District of Columbia,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",2/7/2020


In [318]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9026 entries, 0 to 9025
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   State Code           9026 non-null   int64  
 1   County Code          9026 non-null   int64  
 2   Site Num             9026 non-null   int64  
 3   Latitude             9026 non-null   float64
 4   Longitude            9026 non-null   float64
 5   Parameter Name       9026 non-null   object 
 6   Sample Duration      9026 non-null   object 
 7   Pollutant Standard   9026 non-null   object 
 8   Date Local           9026 non-null   object 
 9   Units of Measure     9026 non-null   object 
 10  Observation Count    9026 non-null   int64  
 11  Observation Percent  9026 non-null   int64  
 12  Arithmetic Mean      9026 non-null   float64
 13  1st Max Value        9026 non-null   float64
 14  1st Max Hour         9026 non-null   int64  
 15  AQI                  4514 non-null   f

In [319]:
# Data Exploration 

In [320]:
dfc.hvplot()

In [321]:
# redefining dataframe

In [322]:
# Editing Column Names

In [323]:
dfc = dfc.rename(columns={'Parameter Name':'Parameter_Name','Sample Duration':'Sample_Duration','Pollutant Standard':'Pollutant_Standard',
                   'Date Local':'Date_Local','1st Max Value':'1st_Max_Value','Observation Count':'Observation_Count', '1st Max Hour':'1st_ Max_ Hour', 'Local_Site_Name':'Local_Site_Name', 'State Name':'State_Name', 'County Name':'County_Name','City Name':'City_Name',
                        'CBSA Name':'CBSA_Name','Date of Last Change':'Date_ of_Last_Change'})

In [324]:
dfc.columns

Index(['State Code', 'County Code', 'Site Num', 'Latitude', 'Longitude',
       'Parameter_Name', 'Sample_Duration', 'Pollutant_Standard', 'Date_Local',
       'Units of Measure', 'Observation_Count', 'Observation Percent',
       'Arithmetic Mean', '1st_Max_Value', '1st_ Max_ Hour', 'AQI',
       'Local Site Name', 'State_Name', 'County_Name', 'City_Name',
       'CBSA_Name', 'Date_ of_Last_Change'],
      dtype='object')

In [325]:
dfc.columns

Index(['State Code', 'County Code', 'Site Num', 'Latitude', 'Longitude',
       'Parameter_Name', 'Sample_Duration', 'Pollutant_Standard', 'Date_Local',
       'Units of Measure', 'Observation_Count', 'Observation Percent',
       'Arithmetic Mean', '1st_Max_Value', '1st_ Max_ Hour', 'AQI',
       'Local Site Name', 'State_Name', 'County_Name', 'City_Name',
       'CBSA_Name', 'Date_ of_Last_Change'],
      dtype='object')

In [326]:
dfc1 = dfc.groupby('State_Name').Pollutant_Standard

In [327]:
dfc1.head(5)

0       CO 1-hour 1971
1       CO 1-hour 1971
2       CO 1-hour 1971
3       CO 1-hour 1971
4       CO 1-hour 1971
2643    CO 1-hour 1971
2644    CO 1-hour 1971
2645    CO 1-hour 1971
2646    CO 1-hour 1971
2647    CO 1-hour 1971
7568    CO 1-hour 1971
7569    CO 1-hour 1971
7570    CO 1-hour 1971
7571    CO 1-hour 1971
7572    CO 1-hour 1971
Name: Pollutant_Standard, dtype: object

In [328]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)

In [329]:
# Extract categorial value 

In [330]:
data=dfc[['County_Name','City_Name']]
data.head()

Unnamed: 0,County_Name,City_Name
0,Baltimore,Essex
1,Baltimore,Essex
2,Baltimore,Essex
3,Baltimore,Essex
4,Baltimore,Essex


In [331]:
categorical= [var for var in dfc.columns if dfc[var].dtypes=='O']              

In [332]:
numerical = [var for var in dfc.columns if dfc[var].dtypes!= 'O'] 

In [333]:
print('categorical variables-{}'.format (len(categorical)))

categorical variables-11


In [334]:
print(' numerical-{} '.format (len(numerical)))

 numerical-11 


In [335]:
numerical

['State Code',
 'County Code',
 'Site Num',
 'Latitude',
 'Longitude',
 'Observation_Count',
 'Observation Percent',
 'Arithmetic Mean',
 '1st_Max_Value',
 '1st_ Max_ Hour',
 'AQI']

In [336]:
categorical

['Parameter_Name',
 'Sample_Duration',
 'Pollutant_Standard',
 'Date_Local',
 'Units of Measure',
 'Local Site Name',
 'State_Name',
 'County_Name',
 'City_Name',
 'CBSA_Name',
 'Date_ of_Last_Change']

In [337]:
dfc[categorical].head(4)

Unnamed: 0,Parameter_Name,Sample_Duration,Pollutant_Standard,Date_Local,Units of Measure,Local Site Name,State_Name,County_Name,City_Name,CBSA_Name,Date_ of_Last_Change
0,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/1/2019,Parts per million,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
1,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/8/2019,Parts per million,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
2,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/9/2019,Parts per million,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020
3,Carbon monoxide,1 HOUR,CO 1-hour 1971,1/10/2019,Parts per million,Essex,Maryland,Baltimore,Essex,"Baltimore-Columbia-Towson, MD",10/2/2020


In [338]:
dfc.hvplot()

In [339]:
dfc[numerical].head(4)

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Observation_Count,Observation Percent,Arithmetic Mean,1st_Max_Value,1st_ Max_ Hour,AQI
0,24,5,3001,39.310833,-76.474444,2,8,0.2415,0.248,1,
1,24,5,3001,39.310833,-76.474444,21,88,0.29481,0.617,7,
2,24,5,3001,39.310833,-76.474444,23,96,0.15487,0.195,0,
3,24,5,3001,39.310833,-76.474444,24,100,0.146875,0.174,7,


In [340]:
dfc.hvplot()

In [341]:
data=dfc[['State_Name','City_Name','Latitude', 'Longitude','Observation Percent']]
data.head()

Unnamed: 0,State_Name,City_Name,Latitude,Longitude,Observation Percent
0,Maryland,Essex,39.310833,-76.474444,8
1,Maryland,Essex,39.310833,-76.474444,88
2,Maryland,Essex,39.310833,-76.474444,96
3,Maryland,Essex,39.310833,-76.474444,100
4,Maryland,Essex,39.310833,-76.474444,100


In [342]:
data.hvplot()

In [343]:
y = dfc['Observation_Count'] 

In [344]:
x = dfc[['State_Name','City_Name','Latitude', 'Longitude']]
x

Unnamed: 0,State_Name,City_Name,Latitude,Longitude
0,Maryland,Essex,39.310833,-76.474444
1,Maryland,Essex,39.310833,-76.474444
2,Maryland,Essex,39.310833,-76.474444
3,Maryland,Essex,39.310833,-76.474444
4,Maryland,Essex,39.310833,-76.474444
...,...,...,...,...
9021,District Of Columbia,Washington,38.894770,-76.953426
9022,District Of Columbia,Washington,38.894770,-76.953426
9023,District Of Columbia,Washington,38.894770,-76.953426
9024,District Of Columbia,Washington,38.894770,-76.953426


In [345]:
x.shape

(9026, 4)

In [346]:
y.shape

(9026,)

In [347]:
plt.show()

In [353]:
x_train,x_test ,Y_train,Y_test=train_test_split(x, y ,test_size=0.3, random_state=3) 

In [354]:
x

Unnamed: 0,State_Name,City_Name,Latitude,Longitude
0,Maryland,Essex,39.310833,-76.474444
1,Maryland,Essex,39.310833,-76.474444
2,Maryland,Essex,39.310833,-76.474444
3,Maryland,Essex,39.310833,-76.474444
4,Maryland,Essex,39.310833,-76.474444
...,...,...,...,...
9021,District Of Columbia,Washington,38.894770,-76.953426
9022,District Of Columbia,Washington,38.894770,-76.953426
9023,District Of Columbia,Washington,38.894770,-76.953426
9024,District Of Columbia,Washington,38.894770,-76.953426


In [359]:
x_train

Unnamed: 0,State_Name,City_Name,Latitude,Longitude
3506,Virginia,Springfield,38.768350,-77.183470
2392,Maryland,Beltsville,39.055277,-76.878333
8907,District Of Columbia,Washington,38.894770,-76.953426
8677,District Of Columbia,Washington,38.894770,-76.953426
2753,Virginia,Arlington,38.857700,-77.059220
...,...,...,...,...
2707,Virginia,Arlington,38.857700,-77.059220
8981,District Of Columbia,Washington,38.894770,-76.953426
6400,Virginia,Norfolk,36.855550,-76.301350
1688,Maryland,North Laurel,39.143130,-76.846110


In [None]:
# the concentration of CO in DC Metro Areas 

In [361]:
y.hvplot()