**General Goal:**
To understand the relationships between atmospheric parameters and pollution levels, as well as to identify potential spatial and temporal trends in pollution data.

**Hypothesis:**
There is a correlation between the concentrations of different pollutants (e.g., SO2, CO, NO2, O3) and certain atmospheric variables (e.g., cloud cover, aerosol indices, solar angles). Additionally, there might be discernible seasonal and regional patterns in pollution levels.

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 

In [2]:
# Load the training data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [4]:
# Time frame of the data.
# The first week is looked up inside the earliest year and the last week inside
# the latest year. Taking min/max of `week_no` and `year` independently could
# report a (week, year) pair that does not exist in the data, e.g. when the
# final year stops before the highest week number seen in earlier years.
first_year = df.year.min()
last_year = df.year.max()
first_week = df.loc[df.year == first_year, 'week_no'].min()
last_week = df.loc[df.year == last_year, 'week_no'].max()

print('-----------------------------------------------------------------')
print(f'This data is from week {first_week} of year {first_year} to the week {last_week} of year {last_year}')
print('-----------------------------------------------------------------')

-----------------------------------------------------------------
This data is from week 0 of year 2019 to the week 52 of year 2021
-----------------------------------------------------------------


In [5]:
# Display the column labels of the loaded frame.
df.columns

Index(['ID_LAT_LON_YEAR_WEEK', 'latitude', 'longitude', 'year', 'week_no',
       'SulphurDioxide_SO2_column_number_density',
       'SulphurDioxide_SO2_column_number_density_amf',
       'SulphurDioxide_SO2_slant_column_number_density',
       'SulphurDioxide_cloud_fraction', 'SulphurDioxide_sensor_azimuth_angle',
       'SulphurDioxide_sensor_zenith_angle',
       'SulphurDioxide_solar_azimuth_angle',
       'SulphurDioxide_solar_zenith_angle',
       'SulphurDioxide_SO2_column_number_density_15km',
       'CarbonMonoxide_CO_column_number_density',
       'CarbonMonoxide_H2O_column_number_density',
       'CarbonMonoxide_cloud_height', 'CarbonMonoxide_sensor_altitude',
       'CarbonMonoxide_sensor_azimuth_angle',
       'CarbonMonoxide_sensor_zenith_angle',
       'CarbonMonoxide_solar_azimuth_angle',
       'CarbonMonoxide_solar_zenith_angle',
       'NitrogenDioxide_NO2_column_number_density',
       'NitrogenDioxide_tropospheric_NO2_column_number_density',
       'NitrogenDioxide

In [6]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(79023, 76)

In [7]:
# Summary statistics of the numeric columns, transposed so that each
# column of the frame becomes one row of the summary table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,79023.0,-1.891072,0.694522,-3.299000,-2.451000,-1.882000,-1.303000,-0.510000
longitude,79023.0,29.880155,0.810375,28.228000,29.262000,29.883000,30.471000,31.532000
year,79023.0,2020.000000,0.816502,2019.000000,2019.000000,2020.000000,2021.000000,2021.000000
week_no,79023.0,26.000000,15.297155,0.000000,13.000000,26.000000,39.000000,52.000000
SulphurDioxide_SO2_column_number_density,64414.0,0.000048,0.000272,-0.000996,-0.000096,0.000024,0.000153,0.004191
...,...,...,...,...,...,...,...,...
Cloud_sensor_azimuth_angle,78539.0,-10.784832,30.374462,-102.739731,-30.309170,-12.673914,9.402202,78.223037
Cloud_sensor_zenith_angle,78539.0,40.436976,6.428216,2.998873,35.829907,41.119630,44.446272,65.951248
Cloud_solar_azimuth_angle,78539.0,-86.800583,37.837269,-153.464211,-125.991158,-84.644352,-48.132701,-22.653170
Cloud_solar_zenith_angle,78539.0,27.925981,4.403835,10.818288,24.686763,28.333630,31.499883,42.060436


In [8]:
# Number of unique stations: every row gets a "<latitude>_<longitude>" string
# key, and the distinct keys are counted.
lat_text = df['latitude'].astype(str)
lon_text = df['longitude'].astype(str)
combined_locations = lat_text + '_' + lon_text

divider = '-------------------------'
print(divider)
# dropna=False keeps the count equal to the number of distinct values of the key.
print('Number of stations : ', combined_locations.nunique(dropna=False))
print(divider)

-------------------------
Number of stations :  497
-------------------------


In [12]:
# Count how many rows share each "<latitude>_<longitude>" key.
combined_locations.value_counts()

-0.51_29.29      159
-2.257_30.243    159
-2.301_29.899    159
-2.3_29.2        159
-2.293_29.507    159
                ... 
-1.486_29.314    159
-1.482_30.618    159
-1.45_29.35      159
-1.444_30.856    159
-3.299_30.301    159
Length: 497, dtype: int64

#### That is, each of the 497 locations has 159 weeks of data, which gives a total of 159 × 497 = 79,023 rows.

#### -----------------------------------------------------------------------

In [16]:
# Report the dimensions of the training frame as a (rows, columns) tuple.
row_count, column_count = df.shape
print('No. of rows and columns in train dataset :', (row_count, column_count))

No. of rows and columns in train dataset : (79023, 76)
