# Exploratory Data Analysis

In [61]:
pwd

'c:\\Users\\wware\\Desktop\\UWA Bootcamp\\Challenges\\project_1\\Statistics'

## Data Preparation and Dependencies Setup

In [62]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import os
from scipy.stats import stats

# Study data files
seasonal_pollution_path = os.path.join("..","Resources", "Data", "seasonal_combined_df1_ww.csv")
monthly_pollution_path = os.path.join("..","Resources", "Data", "monthly_combined_df1_ww.csv")

# Read the data and the results

seasonal_pollution_df = pd.read_csv(seasonal_pollution_path)
monthly_pollution_df = pd.read_csv(monthly_pollution_path)

# Create output directory
stats_output_dir_season = 'Stats_Outputs/Season'
if not os.path.exists(stats_output_dir_season):
    os.makedirs(stats_output_dir_season)

stats_output_dir_month = 'Stats_Outputs/Month'
if not os.path.exists(stats_output_dir_month):
    os.makedirs(stats_output_dir_month)


## Seasonal EDA

In [63]:
seasonal_pollution_df.head()


Unnamed: 0,city,Hemisphere,Season,Year,Season_Year,co,no,no2,o3,so2,...,pm10,nh3,name,latitude,longitude,country,population,is_capital,is_rural,country_full
0,Aasiaat,Northern Hemisphere,Summer,2022,Summer_2022,179.718204,0.022079,0.313528,52.637019,0.220574,...,1.410569,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
1,Aasiaat,Northern Hemisphere,Autumn,2022,Autumn_2022,187.498562,0.001525,0.180261,70.555879,0.09342,...,1.397143,1.4e-05,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
2,Aasiaat,Northern Hemisphere,Winter,2022,Winter_2022,210.086533,0.0,0.104167,87.908289,0.120997,...,0.63058,0.000134,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
3,Aasiaat,Northern Hemisphere,Spring,2022,Spring_2022,227.892261,0.01885,0.165229,77.791464,0.286301,...,0.755046,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
4,Aasiaat,Northern Hemisphere,Summer,2023,Summer_2023,221.827736,0.021082,0.337785,48.390688,0.218066,...,1.421771,0.000983,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat


In [64]:
dtypes = seasonal_pollution_df.dtypes
dtypes

city             object
Hemisphere       object
Season           object
Year              int64
Season_Year      object
co              float64
no              float64
no2             float64
o3              float64
so2             float64
pm2_5           float64
pm10            float64
nh3             float64
name             object
latitude        float64
longitude       float64
country          object
population      float64
is_capital         bool
is_rural           bool
country_full     object
dtype: object

In [65]:
seasonal_pollution_df.describe()

Unnamed: 0,Year,co,no,no2,o3,so2,pm2_5,pm10,nh3,latitude,longitude,population
count,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3583.0
mean,2022.801496,280.663335,0.553074,3.996631,50.799707,2.29071,10.914334,23.683782,1.625098,18.546498,11.149274,163521.8
std,0.74833,162.31328,1.773417,6.70826,21.975908,6.806494,20.658813,55.929085,4.038999,33.077678,90.903671,593643.1
min,2022.0,136.984537,0.0,0.005763,-1.666304,0.012097,0.5,-6.560791,0.0,-54.8072,-178.158,10.0
25%,2022.0,209.020861,0.022329,0.505223,34.793403,0.240355,1.938362,2.495681,0.071611,-8.0,-71.7353,11470.0
50%,2023.0,234.757592,0.072113,1.324841,49.938478,0.608842,4.147857,6.664513,0.343797,22.9125,16.5189,25085.0
75%,2023.0,277.27023,0.255401,4.159782,66.840304,1.658761,9.333122,15.911112,1.320414,46.1944,88.1166,108860.0
max,2024.0,1800.54689,27.97381,55.99244,162.001292,98.496577,244.71869,633.222727,57.549971,71.6269,177.517,7674439.0


In [66]:
# Check for missing values
print(seasonal_pollution_df.isnull().sum())

city              0
Hemisphere        0
Season            0
Year              0
Season_Year       0
co                0
no                0
no2               0
o3                0
so2               0
pm2_5             0
pm10              0
nh3               0
name              0
latitude          0
longitude         0
country          20
population      160
is_capital        0
is_rural          0
country_full      0
dtype: int64


In [67]:
# Checking the number of mice.
number_of_cities=seasonal_pollution_df['city'].nunique()
number_of_cities

373

In [68]:
# Create a new column by combining 'city' and 'Season_Year' with an underscore
seasonal_pollution_df['city_season'] = seasonal_pollution_df['city'] + '_' + seasonal_pollution_df['Season_Year']

# Check the first few rows to ensure the new column is correct
print(seasonal_pollution_df[['city_season']].head())



           city_season
0  Aasiaat_Summer_2022
1  Aasiaat_Autumn_2022
2  Aasiaat_Winter_2022
3  Aasiaat_Spring_2022
4  Aasiaat_Summer_2023


In [69]:
entry_count = len(seasonal_pollution_df['city_season'])
entry_count

3743

In [70]:
# Get the duplicate city
duplicates=seasonal_pollution_df[seasonal_pollution_df.duplicated(subset=["city", "Season_Year"], keep=False)]
duplicates['city'].unique()

array(['Ciudad Lazaro Cardenas', 'Las Palmas'], dtype=object)

In [71]:
duplicated_rows=seasonal_pollution_df[seasonal_pollution_df.duplicated(subset=['city', 'Season_Year'], keep=False)]

print('Duplicated entries: ', len(duplicated_rows))
print(duplicated_rows.head())


Duplicated entries:  40
                       city           Hemisphere  Season  Year  Season_Year  \
680  Ciudad Lazaro Cardenas  Northern Hemisphere  Summer  2022  Summer_2022   
681  Ciudad Lazaro Cardenas  Northern Hemisphere  Summer  2022  Summer_2022   
682  Ciudad Lazaro Cardenas  Northern Hemisphere  Autumn  2022  Autumn_2022   
683  Ciudad Lazaro Cardenas  Northern Hemisphere  Autumn  2022  Autumn_2022   
684  Ciudad Lazaro Cardenas  Northern Hemisphere  Winter  2022  Winter_2022   

             co        no        no2         o3        so2  ...       nh3  \
680  537.329685  1.598352  10.707319  35.977694  17.737759  ...  0.215560   
681  537.329685  1.598352  10.707319  35.977694  17.737759  ...  0.215560   
682  546.952225  1.675875  11.705678  37.859734  19.092701  ...  0.287976   
683  546.952225  1.675875  11.705678  37.859734  19.092701  ...  0.287976   
684  514.248884  0.852381  13.757188  54.377932  18.822187  ...  0.328869   

                       name  latitude 

In [72]:
# Dropping duplicates, keeping the first entry
seasonal_pollution_clean_df = seasonal_pollution_df.drop_duplicates(subset=['city', 'Season_Year'], keep='first')
seasonal_pollution_clean_df.head()

Unnamed: 0,city,Hemisphere,Season,Year,Season_Year,co,no,no2,o3,so2,...,nh3,name,latitude,longitude,country,population,is_capital,is_rural,country_full,city_season
0,Aasiaat,Northern Hemisphere,Summer,2022,Summer_2022,179.718204,0.022079,0.313528,52.637019,0.220574,...,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat,Aasiaat_Summer_2022
1,Aasiaat,Northern Hemisphere,Autumn,2022,Autumn_2022,187.498562,0.001525,0.180261,70.555879,0.09342,...,1.4e-05,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat,Aasiaat_Autumn_2022
2,Aasiaat,Northern Hemisphere,Winter,2022,Winter_2022,210.086533,0.0,0.104167,87.908289,0.120997,...,0.000134,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat,Aasiaat_Winter_2022
3,Aasiaat,Northern Hemisphere,Spring,2022,Spring_2022,227.892261,0.01885,0.165229,77.791464,0.286301,...,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat,Aasiaat_Spring_2022
4,Aasiaat,Northern Hemisphere,Summer,2023,Summer_2023,221.827736,0.021082,0.337785,48.390688,0.218066,...,0.000983,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat,Aasiaat_Summer_2023


In [73]:
# Checking the number of mice.
number_of_cities_no_dups=seasonal_pollution_clean_df['city'].nunique()
number_of_cities_no_dups

373

In [74]:
entry_count = len(seasonal_pollution_clean_df['city_season'])
entry_count

3723

In [84]:
# List of columns to plot
pollutants = ['co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3']

# Create a histogram for each column
for pollutant in pollutants:
    plt.figure()
    seasonal_pollution_df[pollutant].plot(kind='hist', bins=20)
    plt.xlabel(pollutant)
    plt.ylabel('Frequency')
    plt.title(f'Histogram of global {pollutant}')
    plt.savefig(f'{stats_output_dir_season}/{pollutant}_histogram_global.png')
    plt.close()


In [85]:
seasons = seasonal_pollution_clean_df['Season'].unique()


In [86]:
# Optional: Define a color map for different seasons for visual clarity
colors = ['blue', 'green', 'red', 'purple']  # One color for each season
season_color = {season: color for season, color in zip(seasons, colors)}

for pollutant in pollutants:
    plt.figure(figsize=(10, 6))  # Create a figure for each pollutant

    for season in seasons:
        season_data = seasonal_pollution_df[seasonal_pollution_df['Season'] == season][pollutant].dropna()
        plt.hist(season_data, bins=20, histtype='step', linewidth=1.5, label=season, color=season_color[season])

    plt.title(f'Global Histogram of {pollutant.capitalize()} by Season')
    plt.xlabel(f'{pollutant.capitalize()} Concentration')
    plt.ylabel('Frequency')
    plt.legend(title='Season')
    plt.grid(True)  # Optional: Turn on grid for better readability

    # Save the plot in the output directory
    plt.savefig(f'{stats_output_dir_season}/{pollutant}_global_histogram_by_season.png')
    plt.close()  # Close the plot to free up memory

In [78]:
pollutants = ['co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3']
for pollutant in pollutants:
    seasonal_pollution_clean_df[pollutant] = pd.to_numeric(seasonal_pollution_clean_df[pollutant], errors='coerce')

mean_pollution = seasonal_pollution_clean_df.groupby(['Season'])[pollutants].mean()
max_pollution = seasonal_pollution_clean_df.groupby(['Season'])[pollutants].max()
min_pollution = seasonal_pollution_clean_df.groupby(['Season'])[pollutants].min()
std_pollution = seasonal_pollution_clean_df.groupby(['Season'])[pollutants].std()

# Concatenate the statistics DataFrames along columns
stats_by_season = pd.concat([
    mean_pollution.add_suffix('_Mean'),
    max_pollution.add_suffix('_Max'),
    min_pollution.add_suffix('_Min'),
    std_pollution.add_suffix('_Std')
], axis=1)

# Transpose the DataFrame
transposed_stats_by_season = stats_by_season.T

# Sort the DataFrame by the index directly if 'Season' is the index
sorted_transposed_stats_by_season = transposed_stats_by_season.sort_index(ascending=True)

# Print the sorted DataFrame
print(sorted_transposed_stats_by_season)

Season           Autumn       Spring       Summer       Winter
co_Max      1772.905386  1584.519647  1188.859524  1800.546890
co_Mean      278.781852   276.025633   245.072885   315.990026
co_Min       138.984889   165.574386   136.984537   172.297032
co_Std       165.874353   130.497130   116.196638   208.158210
nh3_Max       26.433581    57.549971    16.232056    35.038651
nh3_Mean       1.280894     2.177450     0.978113     1.952975
nh3_Min        0.000000     0.000000     0.000000     0.000000
nh3_Std        3.056192     5.376312     2.029203     4.432253
no2_Max       43.326181    44.558658    34.676793    55.992440
no2_Mean       3.899885     3.583766     2.372448     5.840046
no2_Min        0.007333     0.007569     0.005763     0.007536
no2_Std        6.429576     5.663821     4.155236     8.911006
no_Max        13.639890    11.231832    16.145472    27.973810
no_Mean        0.562781     0.404419     0.299881     0.910550
no_Min         0.000000     0.000014     0.000000     0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seasonal_pollution_clean_df[pollutant] = pd.to_numeric(seasonal_pollution_clean_df[pollutant], errors='coerce')


In [79]:
# IQR plots

# Output file path
txt_file_path = os.path.join(stats_output_dir_season, 'IQR.txt')

# Open the output file and write to it
with open(txt_file_path, 'w') as txt_file:
    # Loop over seasons
    for season in seasonal_pollution_clean_df['Season'].unique():
        txt_file.write(f"Season: {season}\n")
        # Filter the data for the current season
        season_data = seasonal_pollution_clean_df[seasonal_pollution_clean_df['Season'] == season]

        # Calculate the IQR for each pollutant for the current season
        for pollutant in pollutants:
            if pollutant in season_data.columns:
                clean_data = season_data[pollutant].dropna()
                iqr = stats(clean_data)
                txt_file.write(f"{pollutant.capitalize()} IQR: {iqr:.2f}\n")
        txt_file.write("\n")  # Add a new line after each season
    



TypeError: 'module' object is not callable

In [None]:
import plotly.express as px

# Assuming 'mean_pollution' is already computed and properly structured
fig = px.bar(mean_pollution.reset_index(), x='Season', y='co', color='city', barmode='group',
             title="Average CO Levels by Season and City")
fig.show()

In [None]:
print(mean_pollution.head())
print(mean_pollution.dtypes)

                                   co        no       no2         o3  \
Season city                                                            
Autumn Aasiaat             222.287983  0.001397  0.151068  74.615141   
       Abeokuta            993.953183  0.280750  6.633039  27.463079   
       Acapulco De Juarez  354.906511  0.705208  8.333388  44.469840   
       Acarau              266.098994  0.038611  0.988350  32.149035   
       Achacachi           191.997063  0.038569  0.889280  18.728054   

                                so2      pm2_5       pm10       nh3  
Season city                                                          
Autumn Aasiaat             0.084125   0.569967   1.316548  0.002326  
       Abeokuta            4.852088  37.339746  70.525028  4.712871  
       Acapulco De Juarez  1.822704   7.880881   9.530073  0.915050  
       Acarau              0.244534   2.222610   3.482340  0.153132  
       Achacachi           0.423257   2.864980   3.974421  0.646089  
co   

In [None]:
# Select the top 10 cities based on CO levels, replace 'co' with the correct column name if different
top_cities = mean_pollution.nlargest(10, 'co').index

# Print the top cities to verify
print("Top cities based on CO levels:", top_cities)

Top cities based on CO levels: MultiIndex([('Winter',          'Mandi Bahauddin'),
            ('Autumn',                 'Makassar'),
            ('Winter',                 'Makassar'),
            ('Winter',                 'Sargodha'),
            ('Winter',                   'Rampur'),
            ('Spring',                 'Makassar'),
            ('Winter', 'San Pablo De Las Salinas'),
            ('Autumn',                 'Kingston'),
            ('Autumn',                   'Rampur'),
            ('Winter',                'Balurghat')],
           names=['Season', 'city'])


In [None]:
# Filter your main DataFrame to include only the top cities
filtered_data = seasonal_pollution_df[seasona['city'].isin(top_cities)]

# Assuming your DataFrame has a 'Season' column, and you want to visualize CO levels
import matplotlib.pyplot as plt

# Calculate seasonal means for the filtered top cities
seasonal_means = filtered_data.groupby(['Season', 'city'])['co'].mean().unstack()

# Plotting
seasonal_means.plot(kind='bar', figsize=(14, 7))
plt.title('Average CO Levels by Season for Top 10 Cities')
plt.xlabel('Season')
plt.ylabel('Average CO Level')
plt.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()  # Adjust layout to make room for the legend
plt.show()



NameError: name 'df' is not defined

## Monthly EDA

In [None]:
monthly_pollution_df.head()

Unnamed: 0,city,Hemisphere,Month,Year,Month_Year,co,no,no2,o3,so2,...,pm10,nh3,name,latitude,longitude,country,population,is_capital,is_rural,country_full
0,Aasiaat,Northern Hemisphere,4,2022,04_2022,252.09,0.014762,0.100952,81.232381,0.32,...,1.207143,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
1,Aasiaat,Northern Hemisphere,5,2022,05_2022,227.209261,0.018965,0.167043,77.694341,0.285349,...,0.742285,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
2,Aasiaat,Northern Hemisphere,6,2022,06_2022,191.287153,0.025917,0.236306,59.547097,0.188167,...,1.123806,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
3,Aasiaat,Northern Hemisphere,7,2022,07_2022,169.996394,0.026724,0.390618,50.260647,0.26852,...,1.830647,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat
4,Aasiaat,Northern Hemisphere,8,2022,08_2022,177.617043,0.014019,0.316142,48.172903,0.207083,...,1.295108,0.0,Aasiaat,68.7097,-52.8694,GL,3134.0,False,True,Kalaallit Nunaat


In [None]:
dtypes = monthly_pollution_df.dtypes
dtypes

city             object
Hemisphere       object
Month             int64
Year              int64
Month_Year       object
co              float64
no              float64
no2             float64
o3              float64
so2             float64
pm2_5           float64
pm10            float64
nh3             float64
name             object
latitude        float64
longitude       float64
country          object
population      float64
is_capital         bool
is_rural           bool
country_full     object
dtype: object