In [None]:
import pandas as pd


In [None]:
# Read the raw per-city, per-date readings from the working directory.
df = pd.read_csv(filepath_or_buffer="city_day.csv")


In [None]:
# Count missing values per column and list the worst-affected columns first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
Xylene,18109
PM10,11140
NH3,10328
Toluene,8041
Benzene,5623
AQI,4681
AQI_Bucket,4681
PM2.5,4598
NOx,4185
O3,4022


In [None]:
# Drop the Xylene and Toluene columns (18109 and 8041 missing values in the
# count above); errors="ignore" makes this a no-op for a column that is absent.
sparse_columns = ["Xylene", "Toluene"]
df_cleaned = df.drop(columns=sparse_columns, errors="ignore")

In [None]:
# Labels of every numeric column in the trimmed frame.
numeric_frame = df_cleaned.select_dtypes(include="number")
numeric_cols = numeric_frame.columns

In [None]:
def _fill_with_own_median(values):
    """Return ``values`` with missing entries replaced by the median of ``values``."""
    return values.fillna(values.median())


# Impute each numeric column city by city: a gap is filled with the median of
# that column within the same City group. A group with no observed values has
# a NaN median, so its gaps stay NaN.
for col in numeric_cols:
    df_cleaned[col] = df_cleaned.groupby("City")[col].transform(_fill_with_own_median)

In [None]:
# Inspect the frame after the per-city median fill (pandas shows a truncated
# head/tail view for a frame this long).
df_cleaned

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,58.37,107.96,0.92,18.22,17.15,,0.92,27.64,133.36,0.00,384.5,
1,Ahmedabad,2015-01-02,58.37,107.96,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,384.5,
2,Ahmedabad,2015-01-03,58.37,107.96,17.40,19.30,29.70,,17.40,29.07,30.70,6.80,384.5,
3,Ahmedabad,2015-01-04,58.37,107.96,1.70,18.48,17.97,,1.70,18.59,36.08,4.43,384.5,
4,Ahmedabad,2015-01-05,58.37,107.96,22.10,21.42,37.76,,22.10,39.33,39.31,7.01,384.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,54.0,Satisfactory


In [None]:
# Fill gaps in the text (object-dtype) columns with each column's most frequent
# value, falling back to "Unknown" when a column has no observed values at all.
categorical_cols = df_cleaned.select_dtypes(include="object").columns
for col in categorical_cols:
    mode_val = df_cleaned[col].mode()
    fill_value = mode_val.iloc[0] if not mode_val.empty else "Unknown"
    # Assign the filled column back instead of calling fillna(..., inplace=True)
    # on df_cleaned[col]: that chained form acts on an intermediate object,
    # raises a pandas FutureWarning, and stops updating df_cleaned in pandas 3.0.
    df_cleaned[col] = df_cleaned[col].fillna(fill_value)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(mode_val[0], inplace=True)


In [None]:
# Keep only the rows that have no missing value left in any column.
df_final = df_cleaned.dropna(axis=0, how="any")

In [None]:
# Write the fully populated rows to disk, leaving the index out of the file.
df_final.to_csv(path_or_buf="city_day_cleaned(1).csv", index=False)

In [None]:
# Row counts before and after cleaning, and the share of rows that survived.
original_rows, cleaned_rows = len(df.index), len(df_final.index)
retained_fraction = cleaned_rows / original_rows
data_retained = retained_fraction * 100  # expressed as a percentage

In [None]:
# Report the cleaning summary, one item per line.
print(
    f"Original rows: {original_rows}",
    f"Cleaned rows: {cleaned_rows}",
    f"Data retained: {data_retained:.2f}%",
    "✅ Cleaned dataset saved as 'city_day_cleaned(1).csv'",
    sep="\n",
)

Original rows: 29531
Cleaned rows: 22619
Data retained: 76.59%
✅ Cleaned dataset saved as 'city_day_cleaned(1).csv'


In [None]:
# Load the station table from the working directory.
# NOTE(review): this rebinds `df`, so the frame read from city_day.csv earlier
# is no longer reachable under that name from here on.
df = pd.read_csv(filepath_or_buffer="stations.csv")

In [None]:
# Print the schema summary, then show the first rows as the cell's result.
# info() prints its report and returns None, so the former tuple expression
# `df.info(), df.head()` rendered as a plain-text "(None, ...)" tuple.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   StationId    230 non-null    object
 1   StationName  230 non-null    object
 2   City         230 non-null    object
 3   State        230 non-null    object
 4   Status       133 non-null    object
dtypes: object(5)
memory usage: 9.1+ KB


(None,
   StationId                                     StationName  \
 0     AP001                  Secretariat, Amaravati - APPCB   
 1     AP002  Anand Kala Kshetram, Rajamahendravaram - APPCB   
 2     AP003                      Tirumala, Tirupati - APPCB   
 3     AP004                 PWD Grounds, Vijayawada - APPCB   
 4     AP005          GVM Corporation, Visakhapatnam - APPCB   
 
                 City           State  Status  
 0          Amaravati  Andhra Pradesh  Active  
 1  Rajamahendravaram  Andhra Pradesh     NaN  
 2           Tirupati  Andhra Pradesh     NaN  
 3         Vijayawada  Andhra Pradesh     NaN  
 4      Visakhapatnam  Andhra Pradesh  Active  )

In [None]:
# Missing-value count for each station column, largest first.
(
    df
    .isna()
    .sum()
    .sort_values(ascending=False)
)


Unnamed: 0,0
Status,97
StationId,0
StationName,0
City,0
State,0


In [None]:
# Label stations with no recorded status as "Unknown". Assign the result back
# rather than calling fillna(..., inplace=True) on df["Status"]: that chained
# form acts on an intermediate object, raises a pandas FutureWarning, and
# stops updating df in pandas 3.0.
df["Status"] = df["Status"].fillna("Unknown")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Status"].fillna("Unknown", inplace=True)


In [None]:
# Keep the first row for each StationId and discard later repeats, in place.
df.drop_duplicates(
    subset=["StationId"],
    keep="first",
    inplace=True,
)


In [None]:
# Trim stray whitespace from every text column, but title-case only the
# free-text labels. StationId and StationName hold codes and acronyms
# (e.g. "AP001", "APPCB") that str.title() would rewrite as "Ap001" / "Appcb",
# corrupting the identifier and the station names.
text_cols = ["StationId", "StationName", "City", "State", "Status"]
title_case_cols = ["City", "State", "Status"]
for col in text_cols:
    df[col] = df[col].astype(str).str.strip()
    if col in title_case_cols:
        df[col] = df[col].str.title()


In [None]:
# Sanity-check the normalised labels: all Status values, first ten States.
status_values = df["Status"].unique()
state_preview = df["State"].unique()[:10]
print(status_values)
print(state_preview)


['Active' 'Unknown' 'Inactive']
['Andhra Pradesh' 'Assam' 'Bihar' 'Chandigarh' 'Delhi' 'Gujarat' 'Haryana'
 'Jharkhand' 'Karnataka' 'Kerala']


In [None]:
# Number of non-null StationId entries per State, with State as a regular column.
station_counts = df.groupby("State")["StationId"].count()
stations_per_state = station_counts.reset_index()


In [None]:
# Write the cleaned station table to disk, leaving the index out of the file.
df.to_csv(path_or_buf="stations_cleaned(1).csv", index=False)


In [None]:
# Confirmation line for the reader; the file name is hard-coded in the text.
saved_message = "✅ Cleaned dataset saved as 'stations_cleaned(1).csv'"
print(saved_message)

✅ Cleaned dataset saved as 'stations_cleaned(1).csv'
