In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Load CSV file
df = pd.read_csv("Air_Quality.csv")
print("Original Data:\n", df.head())


# Data Cleaning
# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())
# Drop columns that contain no observed values at all: they have no mean or
# mode to impute from, would stay NaN in the "cleaned" output, and make
# MinMaxScaler emit all-NaN warnings. Skipped for a zero-row frame, where
# every column is vacuously "all missing".
if len(df) > 0:
    empty_cols = df.columns[df.isnull().all()]
    if len(empty_cols) > 0:
        print("\nDropping entirely empty columns:", list(empty_cols))
        df = df.drop(columns=empty_cols)
# Fill missing values: mean for numeric columns, most frequent value otherwise.
# "number" covers every integer/float width (not just int64/float64) and
# excludes booleans, which are imputed with their mode like other categories.
numeric_fill_cols = df.select_dtypes(include="number").columns
for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to fill; also avoids mode()[0] on needless columns
    if col in numeric_fill_cols:
        fill_value = df[col].mean()
    else:
        fill_value = df[col].mode()[0]
    # Assign back instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form operates on a temporary copy and stops working in pandas 3.
    df[col] = df[col].fillna(fill_value)
# Remove duplicate rows
df = df.drop_duplicates()
# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")


# Data Preprocessing
# Pick the numeric columns before encoding so the one-hot indicator columns
# created below are never rescaled, whatever dtype get_dummies gives them.
numeric_cols = df.select_dtypes(include="number").columns
# Encode categorical variables (One-hot encoding)
cat_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=cat_cols)
# Scale numeric columns to the [0, 1] range
scaler = MinMaxScaler()
# MinMaxScaler raises on zero features or zero samples, so only scale when
# there is something to scale.
if len(numeric_cols) > 0 and len(df) > 0:
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


# Save Cleaned Data
df.to_csv("cleaned_preprocessed_data.csv", index=False)
print("\n✅ Cleaned and preprocessed data saved to 'cleaned_preprocessed_data.csv'")
print("\nFinal Processed Data:\n", df.head())


Original Data:
    Unique ID  Indicator ID                    Name Measure Measure Info  \
0     336867           375  Nitrogen dioxide (NO2)    Mean          ppb   
1     336741           375  Nitrogen dioxide (NO2)    Mean          ppb   
2     550157           375  Nitrogen dioxide (NO2)    Mean          ppb   
3     412802           375  Nitrogen dioxide (NO2)    Mean          ppb   
4     412803           375  Nitrogen dioxide (NO2)    Mean          ppb   

  Geo Type Name  Geo Join ID                     Geo Place Name  \
0            CD          407      Flushing and Whitestone (CD7)   
1            CD          107              Upper West Side (CD7)   
2            CD          414  Rockaway and Broad Channel (CD14)   
3            CD          407      Flushing and Whitestone (CD7)   
4            CD          407      Flushing and Whitestone (CD7)   

           Time Period  Start_Date  Data Value  Message  
0       Winter 2014-15  12/01/2014       23.97      NaN  
1       Winter

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))



✅ Cleaned and preprocessed data saved to 'cleaned_preprocessed_data.csv'

Final Processed Data:
    unique_id  indicator_id  geo_join_id  data_value  message  \
0   0.276123      0.033784     0.000004    0.056440      NaN   
1   0.275954      0.033784     0.000001    0.064563      NaN   
2   0.561308      0.033784     0.000004    0.029550      NaN   
3   0.377654      0.033784     0.000004    0.053285      NaN   
4   0.377655      0.033784     0.000004    0.032964      NaN   

   name_Annual vehicle miles traveled  \
0                               False   
1                               False   
2                               False   
3                               False   
4                               False   

   name_Annual vehicle miles traveled (cars)  \
0                                      False   
1                                      False   
2                                      False   
3                                      False   
4                             