Running application

In [None]:
# Colab-only setup: attach Google Drive so the project folder under MyDrive is
# reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory; the relative path `app.py`
# used by the launch command below is resolved against it.
%cd '/content/drive/MyDrive/data assignment/merging'

# List the folder contents as a quick check of what is there.
%ls

# Install Streamlit into the runtime (no version is pinned here).
!pip install streamlit

# Print whatever ipv4.icanhazip.com returns (-q silences wget's progress
# output, "-O -" writes the response to stdout).
# NOTE(review): presumably this is the runtime's public IPv4 address, needed
# as the localtunnel access password — confirm.
!wget -q -O - ipv4.icanhazip.com


# Start the Streamlit app in the background (&) and, in the same shell, open a
# localtunnel to port 8501.
# NOTE(review): 8501 is presumably the port Streamlit serves on — confirm.
# This command is not expected to return while the tunnel is up, so cells
# placed after it would not run until it is interrupted — TODO confirm.
!streamlit run app.py & npx localtunnel --port 8501

Merging the station datasets used

In [None]:
import pandas as pd

# Station CSV paths mapped to the location-type label that is written into the
# new 'LocationType' column of every row read from that file.
files_info = {
    "/content/merging/PRSA_Data_Dingling_20130301-20170228.csv": "Rural",
    "/content/merging/PRSA_Data_Gucheng_20130301-20170228.csv": "Suburban",
    "/content/merging/PRSA_Data_Nongzhanguan_20130301-20170228.csv": "Industrial",
    "/content/merging/PRSA_Data_Tiantan_20130301-20170228.csv": "Urban"
}

# Read each station file and tag its rows with the station's location type.
dataframes = []
for path, location in files_info.items():
    df = pd.read_csv(path)
    df['LocationType'] = location
    dataframes.append(df)

# Stack the per-station frames into one table with a fresh 0..n-1 index.
merged_df = pd.concat(dataframes, ignore_index=True)

# Per-column missing-value counts for the merged table. This must be taken from
# merged_df: once the loop has finished, `df` only holds the last station read.
missing = merged_df.isnull().sum()

# Persist the merged table (without the index column) for the later cells.
merged_df.to_csv("/content/merged_PRSA_data.csv", index=False)

print("Files successfully merged and saved as 'merged_PRSA_data.csv'")


Displaying the data and its total missing values

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from pathlib import Path

# Where the merged CSV may live, in order of preference. The merge step writes
# it to /content, so that copy is used when present; the Google Drive copy is
# the fallback, so a notebook that only has the Drive file keeps working.
merged_csv_candidates = [
    Path('/content/merged_PRSA_data.csv'),
    Path('/content/drive/MyDrive/data assignment/merging/merged_PRSA_data.csv'),
]
# If neither file exists, fall through to the Drive path so read_csv reports
# the missing file.
merged_csv_path = next(
    (candidate for candidate in merged_csv_candidates if candidate.exists()),
    merged_csv_candidates[-1],
)

# Load the merged dataset
df = pd.read_csv(merged_csv_path)

# Per-column count of missing values
missing = df.isnull().sum()

# Basic overview: size, schema, first rows and the missing-value counts
print("Number of rows and columns:\n", df.shape)
print("Column names and data types:\n", df.dtypes)
print("Sample of dataset:\n", df.head())
print("Missing values:\n", missing)

# Visualise where the missing values sit in the table
msno.matrix(df)
plt.show()


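Hybrid clean-up of data: drop rows with missing pollutant or wind-direction values, then median-fill missing weather values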

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno  # null-pattern visualisation

# Start from the merged station data
df = pd.read_csv('/content/merged_PRSA_data.csv')
print("Initial shape:", df.shape)

# Two column groups, treated differently below: a gap in a critical column
# costs the whole row, a gap in a weather column is imputed.
critical_columns = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'wd']
weather_columns = ['RAIN', 'TEMP', 'DEWP', 'PRES', 'WSPM']

# Discard every row that lacks a value in any critical column
df = df.dropna(subset=critical_columns)

# Impute each weather column that exists in the frame with its own median,
# computed on the rows that survived the drop above.
present_weather_columns = [name for name in weather_columns if name in df.columns]
for name in present_weather_columns:
    df[name] = df[name].fillna(df[name].median())

# Report the remaining per-column null counts
print("\nMissing values after cleaning:")
print(df.isnull().sum())

# Show the null pattern of the cleaned frame
msno.matrix(df)
plt.show()

# Written relative to the current working directory, without the index column
df.to_csv('merged_cleaned_PRSA_data.csv', index=False)

print("\nData fully cleaned and saved")

EDA and correlation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned data (path is relative to the current working directory)
df = pd.read_csv('merged_cleaned_PRSA_data.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Each analysed column mapped to the axis label used when it is plotted.
# Holding both in one mapping keeps the column list and the label list from
# drifting out of step.
axis_labels = {
    'PM2.5': "PM2.5 Concentration (μg/m³)",
    'PM10': "PM10 Concentration (μg/m³)",
    'SO2': "SO₂ Concentration (μg/m³)",
    'NO2': "NO₂ Concentration (μg/m³)",
    'CO': "CO Concentration (mg/m³)",
    'O3': "O₃ Concentration (μg/m³)",
    'TEMP': "Temperature (°C)",
    'PRES': "Pressure (hPa)",
    'DEWP': "Dew Point (°C)",
    'RAIN': "Rainfall (mm)",
    'WSPM': "Wind Speed (m/s)",
}

# Same names and ordering as before, for the plotting code further down.
relevant_columns = list(axis_labels)
x_labels = list(axis_labels.values())


#  Summary Statistics
print("\nSummary Statistics:\n")
print(df[relevant_columns].describe().round(2))

from scipy.stats import skew

# Distribution of every analysed column, one histogram panel per column
print("\nHistograms:\n")
histogram_style = dict(bins=30, figsize=(15, 10), color='skyblue', edgecolor='black')
df[relevant_columns].hist(**histogram_style)
plt.suptitle('Histograms of Relevant Variables', fontsize=18)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave the top 5% of the figure for the suptitle
plt.show()

# Skewness per column, computed on the non-null values only
for name, values in df[relevant_columns].items():
    print(f"Skewness for {name}: {skew(values.dropna()):.2f}")

print("\n Insight: The distribution of pollutant levels (PM2.5, PM10, CO, NO2) is right-skewed, suggesting that extreme pollution events occur less frequently but have a significant impact.")

# Boxplots for outlier spotting, drawn as rows of `plots_per_row` panels
print("\nBoxplots:\n")
plots_per_row = 3

for start in range(0, len(relevant_columns), plots_per_row):
    row_columns = relevant_columns[start:start + plots_per_row]
    row_labels = x_labels[start:start + plots_per_row]

    # One figure per row of panels
    fig, axes = plt.subplots(1, plots_per_row, figsize=(18, 5))

    for ax, column, label in zip(axes, row_columns, row_labels):
        sns.boxplot(data=df, x=column, ax=ax)
        ax.set_title(f'Boxplot of {column}')
        ax.set_xlabel(label)

    # The final row may have fewer columns than panels; hide the leftovers
    for unused_ax in axes[len(row_columns):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

boxplot_insight = (
    "\n Insight: The boxplots revealed that variables such as PM2.5, PM10, SO2, NO2, CO, O3, Rain, and WSPM exhibited a large number of outliers, especially \n toward higher values, indicating right-skewed distributions and significant variability."
    "\n In contrast, meteorological variables such as Temperature (TEMP), Pressure (PRES), and Dew Point (DEWP) showed relatively symmetrical distributions with \n fewer outliers, suggesting more stable behavior across the dataset. This supports the previous histograms being right skewed"
)
print(boxplot_insight)
# Restrict to the analysed columns and compute their pairwise correlations
df_relevant = df[relevant_columns]
correlation_matrix = df_relevant.corr()

# Annotated heatmap of the correlation matrix
print("\nCorrelation Heatmap:\n")
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap of Relevant Variables', fontsize=16)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Take the quoted coefficients from the matrix itself so the printed numbers
# always match the heatmap above.
# NOTE(review): the qualitative wording ("strong positive", and the statement
# about temperature) is still fixed text — re-check it if the data changes.
pm25_pm10_corr = correlation_matrix.loc['PM2.5', 'PM10']
pm25_co_corr = correlation_matrix.loc['PM2.5', 'CO']
print(f"Insight: The heatmap reveals strong positive correlations between PM2.5 and PM10 ({pm25_pm10_corr:.2f}) and between PM2.5 and CO ({pm25_co_corr:.2f}), indicating that these pollutants \n often rise and fall together. Temperature shows a strong negative correlation with pollution levels.")

# Scatter plots with a fitted regression line for selected variable pairs
print("\nScatter Plots:\n")

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# (x, y) column pairs, one per panel, filled row by row. The pairs are the
# four relationships discussed in the insight text printed below, so the
# figure and the narrative describe the same panels.
scatter_pairs = [
    ('PM2.5', 'PM10'),
    ('PM2.5', 'CO'),
    ('CO', 'NO2'),
    ('SO2', 'PM2.5'),
]

# 2 rows x 2 cols
fig, axs = plt.subplots(2, 2, figsize=(16, 10))

for ax, (x_col, y_col) in zip(axs.flat, scatter_pairs):
    sns.regplot(x=x_col, y=y_col, data=df, ax=ax, scatter_kws={'s': 20}, line_kws={'color': 'red'})
    ax.set_title(f'{x_col} vs {y_col}')

# Tidy layout
plt.tight_layout()
plt.show()

print("Insight: The scatterplots provided insight into the relationships between key pollutant variables:"
"\n \n PM2.5 vs PM10: A strong positive linear relationship was observed, indicating that as PM10 levels increase, PM2.5 levels tend to increase similarly. This is expected, \n as both represent particulate matter of different sizes and often originate from similar sources."
"\n \n PM2.5 vs CO: A moderate positive correlation was noted, suggesting that higher concentrations of particulate matter are often associated with increased carbon monoxide \n levels, likely due to combustion-related pollution (e.g., traffic emissions)."
"\n \n CO vs NO2: Another moderate positive relationship was observed, implying that CO and NO2 may share common emission sources such as vehicle exhaust or industrial activities."
"\n \n SO2 vs PM2.5: The relationship appeared weaker and more scattered, indicating that sulfur dioxide levels are less directly related to PM2.5 concentrations in this dataset,\n possibly due to varying sources or atmospheric reactions affecting SO2 concentrations independently."
"\n \n Overall, the scatterplots highlight that several pollutants are interrelated, particularly particulate matter and gaseous pollutants associated with combustion activities. \n However, not all pollutants display strong linear relationships, reflecting the complex nature of air pollution dynamics.")