In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

# Step 1: Load the dataset
url = './AirQuality.csv'
air_quality = pd.read_csv(url)

# Step 2: Data Preprocessing
# Convert 'Time' to datetime format
air_quality['Time'] = pd.to_datetime(air_quality['Time'])

# Handle missing values by filling with the column mean
air_quality.fillna(air_quality.mean(), inplace=True)

# For demonstration, create a 'station' column with random categories
np.random.seed(0)
air_quality['station'] = np.random.choice(['Station A', 'Station B', 'Station C'], size=len(air_quality))

# Step 3: Basic Line Plot for PM2.5
plt.figure(figsize=(14, 7))
sns.lineplot(data=air_quality, x='Time', y='PM2.5')
plt.title('PM2.5 Concentration Over Time')
plt.xlabel('Time')
plt.ylabel('PM2.5 (µg/m³)')
plt.tight_layout()
plt.show()

# Step 4: Line Plot with Confidence Intervals (Standard Deviation)
plt.figure(figsize=(14, 7))
sns.lineplot(data=air_quality, x='Time', y='TEMP', ci=None)  # Remove ci='sd' which is not valid
plt.title('Temperature Over Time')
plt.xlabel('Time')
plt.ylabel('Temperature (°C)')
plt.tight_layout()
plt.show()

# Step 5: Smoothed Time Series Plot for PM2.5
air_quality['PM2.5_Smoothed'] = air_quality['PM2.5'].rolling(window=24).mean()

plt.figure(figsize=(14, 7))
sns.lineplot(data=air_quality, x='Time', y='PM2.5_Smoothed', label='Smoothed PM2.5')
sns.lineplot(data=air_quality, x='Time', y='PM2.5', alpha=0.5, label='Original PM2.5')
plt.title('Smoothed vs. Original PM2.5 Concentration Over Time')
plt.xlabel('Time')
plt.ylabel('PM2.5 (µg/m³)')
plt.legend()
plt.tight_layout()
plt.show()

# Step 6: Grouped Line Plot by Monitoring Stations
plt.figure(figsize=(14, 7))
sns.lineplot(data=air_quality, x='Time', y='PM2.5', hue='station', style='station')
plt.title('PM2.5 Concentration Over Time by Monitoring Station')
plt.xlabel('Time')
plt.ylabel('PM2.5 (µg/m³)')
plt.legend(title='Station')
plt.tight_layout()
plt.show()

# Step 7: Distribution of PM2.5 Concentrations by Station
plt.figure(figsize=(14, 7))
sns.boxplot(data=air_quality, x='station', y='PM2.5')
plt.title('PM2.5 Concentration Distribution by Station')
plt.xlabel('Station')
plt.ylabel('PM2.5 (µg/m³)')
plt.tight_layout()
plt.show()

# Step 8: Correlation Heatmap for Air Quality Metrics
correlation_data = air_quality[['PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']]
correlation_matrix = correlation_data.corr()

plt.figure(figsize=(14, 7))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Air Quality Metrics')
plt.tight_layout()
plt.show()

# Step 9: Time Series Decomposition of PM2.5
air_quality.set_index('Time', inplace=True)

# Handle missing data before decomposition
air_quality['PM2.5'] = air_quality['PM2.5'].interpolate()  # Interpolate missing data

result = seasonal_decompose(air_quality['PM2.5'], model='multiplicative', period=24)
result.plot()
plt.tight_layout()
plt.show()


  air_quality['Time'] = pd.to_datetime(air_quality['Time'])


TypeError: can only concatenate str (not "int") to str