In [1]:
import glob
from pathlib import Path

import pandas as pd

In [None]:
# Clean the raw daily PM2.5 exports under data/ into a single daily series:
# load every raw CSV, average readings that share a date, reindex to a daily
# frequency, fill the gaps by linear interpolation, and save the result.
RAW_CSV_PATTERN = "data/*.csv"
CLEANED_CSV_PATH = "data/pm2.5_cleaned.csv"

# Step 1: Find all raw CSV files.
# The cleaned export is written into the same folder the pattern matches, so it
# must be skipped here; otherwise a second run would read its own output back
# in, and that file's dates no longer match the %m/%d/%Y format parsed below.
# Sorting makes the load order independent of the filesystem's listing order.
file_list = sorted(
    file
    for file in glob.glob(RAW_CSV_PATTERN)
    if Path(file).name != Path(CLEANED_CSV_PATH).name
)
if not file_list:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No raw CSV files found matching '{RAW_CSV_PATTERN}'")

# Step 2: Read and combine all files
df_list = [pd.read_csv(file) for file in file_list]
raw_df = pd.concat(df_list, ignore_index=True)

# Steps 3-8: keep the essential columns, rename, parse dates, average duplicate
# dates, sort chronologically, and index by date at a daily frequency.
# asfreq("D") inserts a NaN row for every calendar day missing from the data.
df = (
    raw_df[["Date", "Daily Mean PM2.5 Concentration"]]
    .rename(columns={"Daily Mean PM2.5 Concentration": "PM2.5"})
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%m/%d/%Y"))
    .groupby("Date", as_index=False)
    .mean()
    .sort_values("Date")
    .set_index("Date")
    .asfreq("D")
)

# Step 9: Handle missing values using linear interpolation
df["PM2.5"] = df["PM2.5"].interpolate(method="linear")

# Step 10: Export the cleaned dataset
df.to_csv(CLEANED_CSV_PATH)

print("✅ Data cleaned and saved as 'pm2.5_cleaned.csv'")
print("Final dataset shape:", df.shape)
print(df.head())



✅ Data cleaned and saved as 'pm2.5_cleaned.csv'
Final dataset shape: (2557, 1)
            PM2.5
Date             
2018-01-01  63.35
2018-01-02  43.88
2018-01-03  16.25
2018-01-04  25.55
2018-01-05  18.50
