In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import io

# --- Simulate the "City_Air_Quality.csv" file ---
# We create a mock CSV in memory to make this script runnable.
# It contains 14 days of hourly data.
dates = pd.date_range('2023-01-01', periods=14 * 24, freq='H')
data = {
    "Date": dates.date,
    "Time": dates.time,
    "PM2.5": np.abs(np.random.normal(30, 15, len(dates))) + np.sin(np.arange(len(dates)) * 0.1) * 10,
    "PM10": np.abs(np.random.normal(50, 20, len(dates))) + np.sin(np.arange(len(dates)) * 0.1) * 15,
    "CO": np.abs(np.random.normal(0.8, 0.3, len(dates))),
    "AQI": np.abs(np.random.normal(70, 25, len(dates))) + np.sin(np.arange(len(dates)) * 0.1) * 20
}
# Add some missing values
df_mock = pd.DataFrame(data)
df_mock.loc[df_mock.sample(frac=0.05).index, 'PM2.5'] = np.nan
df_mock.loc[df_mock.sample(frac=0.05).index, 'AQI'] = np.nan

csv_data = df_mock.to_csv(index=False)
# ----------------------------------------------------

# In a real case, you would use this line instead of the code above:
# file_path = "City_Air_Quality.csv"
# df = pd.read_csv(file_path)

# Task 1: Import the dataset (from the simulated CSV data)
csv_file = io.StringIO(csv_data)
df = pd.read_csv(csv_file)

# Task 2: Explore the dataset
print("--- 2. Initial Data Exploration ---")
print("Shape:", df.shape)
print("\nHead:\n", df.head())
print("\nInfo:")
df.info()

# --- Data Preparation ---
# Task 3: Identify variables and preprocess
print("\n--- 3. Preprocessing Data ---")
# Combine Date and Time into a single 'DateTime' index
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df.set_index('DateTime', inplace=True)

# Handle missing values (e.g., forward fill for time-series)
df.fillna(method='ffill', inplace=True)
print("Missing values handled.")
print(df.describe())

# Relevant variables are: df.index (time), 'PM2.5', 'PM10', 'CO', 'AQI'

# Task 4: Create line plot for overall AQI trend
print("\n--- 4. Creating AQI Trend Line Plot ---")
plt.figure(figsize=(12, 6))
plt.plot(df.index, df['AQI'], label='AQI', color='red', linewidth=2)
plt.title('Overall Air Quality Index (AQI) Trend Over Time', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('AQI Value', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('aqi_trend.png')
plt.close()
print("Saved 'aqi_trend.png'")

# Task 5: Plot individual pollutant trends (using subplots)
print("--- 5. Creating Individual Pollutant Line Plots ---")
pollutants = ['PM2.5', 'PM10', 'CO']
colors = ['blue', 'green', 'purple']
y_labels = ['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'CO (ppm)']

# Create a figure with 3 subplots, sharing the x-axis
fig, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

for ax, poll, color, label in zip(axes, pollutants, colors, y_labels):
    ax.plot(df.index, df[poll], label=poll, color=color)
    ax.set_ylabel(label, fontsize=10)
    ax.legend(loc='upper left')
    ax.grid(True, linestyle='--', alpha=0.5)

# Add common labels and title
fig.suptitle('Individual Pollutant Trends Over Time', fontsize=18)
axes[-1].set_xlabel('Date', fontsize=12) # Set x-label only on the bottom plot
plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout for suptitle
plt.savefig('pollutant_trends.png')
plt.close()
print("Saved 'pollutant_trends.png'")

# Task 6: Use bar plot to compare AQI values
print("--- 6. Creating Daily Average AQI Bar Plot ---")
# Aggregate data to daily averages for a clearer bar plot
daily_aqi = df['AQI'].resample('D').mean()

plt.figure(figsize=(12, 6))
plt.bar(daily_aqi.index, daily_aqi.values, color='teal', width=0.7)
plt.title('Average Daily AQI Comparison', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Average AQI Value', fontsize=12)

# Format x-axis to show dates clearly
ax = plt.gca()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('daily_aqi_bar.png')
plt.close()
print("Saved 'daily_aqi_bar.png'")

# Task 7: Create box plots for pollutant distributions
print("--- 7. Creating Pollutant Distribution Box Plots ---")
data_to_plot = [df['AQI'], df['PM2.5'], df['PM10'], df['CO']]
labels = ['AQI', 'PM2.5', 'PM10', 'CO']

# We use subplots because the scales are very different
fig, axes = plt.subplots(1, 4, figsize=(16, 6))

axes[0].boxplot(df['AQI'].dropna())
axes[0].set_title('AQI Distribution', fontsize=14)
axes[0].set_ylabel('AQI Value', fontsize=12)

axes[1].boxplot(df['PM2.5'].dropna())
axes[1].set_title('PM2.5 Distribution', fontsize=14)
axes[1].set_ylabel('PM2.5 (µg/m³)', fontsize=12)

axes[2].boxplot(df['PM10'].dropna())
axes[2].set_title('PM10 Distribution', fontsize=14)
axes[2].set_ylabel('PM10 (µg/m³)', fontsize=12)

axes[3].boxplot(df['CO'].dropna())
axes[3].set_title('CO Distribution', fontsize=14)
axes[3].set_ylabel('CO (ppm)', fontsize=12)

fig.suptitle('Distribution of AQI and Key Pollutants', fontsize=18)
plt.tight_layout(rect=[0, 0.03, 1, 0.93])
plt.savefig('pollutant_boxplots.png')
plt.close()
print("Saved 'pollutant_boxplots.png'")

# Task 8: Use scatter plot to explore relationships
print("--- 8. Creating AQI vs. PM2.5 Scatter Plot ---")
plt.figure(figsize=(10, 6))
# 'alpha' is used to show density in case of overplotting
plt.scatter(df['PM2.5'], df['AQI'], alpha=0.5, color='orange', s=15)
plt.title('Relationship between PM2.5 and AQI', fontsize=16)
plt.xlabel('PM2.5 (µg/m³)', fontsize=12)
plt.ylabel('AQI Value', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('scatter_pm25_aqi.png')
plt.close()
print("Saved 'scatter_pm25_aqi.png'")

# Task 9: Customization was performed in all plots above.
print("\n--- All visualizations saved as .png files in the current directory. ---")

  dates = pd.date_range('2023-01-01', periods=14 * 24, freq='H')
  df.fillna(method='ffill', inplace=True)


--- 2. Initial Data Exploration ---
Shape: (336, 6)

Head:
          Date      Time      PM2.5       PM10        CO        AQI
0  2023-01-01  00:00:00  43.105496  45.148547  0.650307  87.877301
1  2023-01-01  01:00:00  71.168322  54.116091  0.949365  60.505802
2  2023-01-01  02:00:00  40.878845  35.219266  0.541055  39.189635
3  2023-01-01  03:00:00  38.330137  50.866625  0.363974  94.374722
4  2023-01-01  04:00:00   4.279979  35.503883  1.122157  38.473859

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    336 non-null    object 
 1   Time    336 non-null    object 
 2   PM2.5   319 non-null    float64
 3   PM10    336 non-null    float64
 4   CO      336 non-null    float64
 5   AQI     319 non-null    float64
dtypes: float64(4), object(2)
memory usage: 15.9+ KB

--- 3. Preprocessing Data ---
Missing values handled.
            PM2.5      