In [1]:
# install libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


In [4]:
# load dataset
df = pd.read_csv("../data/indian_weather_data.csv")
df.head()

Unnamed: 0,city,lat,lon,temperature,weather_code,sunrise,sunset,moonrise,moonset,co,...,wind_speed,wind_degree,wind_dir,pressure,precip,humidity,cloudcover,feelslike,uv_index,visibility
0,New Delhi,28.6,77.2,21,143,07:05 AM,05:26 PM,01:04 AM,01:06 PM,1411.85,...,4,34,NE,1017,0.0,53,50,21,0,1
1,Mumbai,18.975,72.826,30,122,07:03 AM,06:03 PM,01:20 AM,01:29 PM,644.85,...,18,300,WNW,1011,0.0,35,0,32,0,4
2,Kolkata,22.57,88.37,21,143,06:07 AM,04:54 PM,12:16 AM,12:23 PM,457.85,...,8,3,N,1014,0.0,73,0,21,0,3
3,Chennai,13.083,80.283,26,143,06:22 AM,05:44 PM,12:48 AM,01:00 PM,275.85,...,19,31,NNE,1012,0.0,65,25,28,0,5
4,Bengaluru,12.983,77.583,24,113,06:32 AM,05:55 PM,12:59 AM,01:11 PM,243.85,...,9,76,ENE,1015,0.0,25,0,24,0,10


In [5]:
# check columns and types
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   city          74 non-null     object 
 1   lat           74 non-null     float64
 2   lon           74 non-null     float64
 3   temperature   74 non-null     int64  
 4   weather_code  74 non-null     int64  
 5   sunrise       74 non-null     object 
 6   sunset        74 non-null     object 
 7   moonrise      74 non-null     object 
 8   moonset       74 non-null     object 
 9   co            74 non-null     float64
 10  no2           74 non-null     float64
 11  o3            74 non-null     int64  
 12  so2           74 non-null     float64
 13  pm2_5         74 non-null     float64
 14  pm10          74 non-null     float64
 15  wind_speed    74 non-null     int64  
 16  wind_degree   74 non-null     int64  
 17  wind_dir      74 non-null     object 
 18  pressure      74 non-null     in

Unnamed: 0,lat,lon,temperature,weather_code,co,no2,o3,so2,pm2_5,pm10,wind_speed,wind_degree,pressure,precip,humidity,cloudcover,feelslike,uv_index,visibility
count,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,24.303824,76.853392,22.689189,125.283784,526.498649,6.041892,169.256757,24.587838,48.952703,50.395946,7.297297,162.864865,1014.594595,0.002703,38.702703,24.297297,22.472973,0.0,7.432432
std,5.329883,6.824777,4.94885,19.601174,355.657039,5.530903,40.38651,15.250463,41.487027,42.537311,3.887915,130.955741,2.558461,0.02325,17.554401,36.581577,5.184865,0.0,3.314894
min,6.133,68.968,-1.0,113.0,132.85,0.85,39.0,1.95,5.75,5.85,4.0,2.0,1009.0,0.0,13.0,0.0,-6.0,0.0,1.0
25%,20.9415,73.1625,20.0,113.0,265.1,2.15,146.5,13.675,19.25,20.4,4.0,37.25,1013.0,0.0,26.25,0.0,20.0,0.0,5.0
50%,23.3085,75.8415,23.0,122.0,333.85,4.6,163.5,19.85,26.8,27.05,6.0,100.0,1015.0,0.0,35.0,0.0,23.0,0.0,10.0
75%,28.5925,77.658,26.0,137.75,775.1,8.3,198.75,31.15,76.0,80.25,9.0,305.5,1016.0,0.0,48.0,50.0,26.0,0.0,10.0
max,37.2,118.55,31.0,248.0,1591.85,25.55,264.0,76.65,138.25,141.95,19.0,357.0,1028.0,0.2,89.0,100.0,32.0,0.0,10.0


In [6]:
# standardize column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

In [7]:
df.replace(["N/A", "--", "", "None"], np.nan, inplace=True)
df.isna().sum()

city            0
lat             0
lon             0
temperature     0
weather_code    0
sunrise         0
sunset          0
moonrise        0
moonset         0
co              0
no2             0
o3              0
so2             0
pm2_5           0
pm10            0
wind_speed      0
wind_degree     0
wind_dir        0
pressure        0
precip          0
humidity        0
cloudcover      0
feelslike       0
uv_index        0
visibility      0
dtype: int64

In [10]:
# convert time columns
time_cols = ["sunrise", "sunset", "moonrise", "moonset"]
for col in time_cols:
    df[col] = pd.to_datetime(
        df[col],
        format="%H:%M",
        errors="coerce"
    )

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   city          74 non-null     object        
 1   lat           74 non-null     float64       
 2   lon           74 non-null     float64       
 3   temperature   74 non-null     int64         
 4   weather_code  74 non-null     int64         
 5   sunrise       74 non-null     datetime64[ns]
 6   sunset        74 non-null     datetime64[ns]
 7   moonrise      74 non-null     datetime64[ns]
 8   moonset       74 non-null     datetime64[ns]
 9   co            74 non-null     float64       
 10  no2           74 non-null     float64       
 11  o3            74 non-null     int64         
 12  so2           74 non-null     float64       
 13  pm2_5         74 non-null     float64       
 14  pm10          74 non-null     float64       
 15  wind_speed    74 non-null     int64       

In [13]:
# save cleaned dataset
df.to_csv("../data/weather_cleaned.csv", index=False)

In [14]:
# load clean dataset
df = pd.read_csv("../data/weather_cleaned.csv")
df.head()

Unnamed: 0,city,lat,lon,temperature,weather_code,sunrise,sunset,moonrise,moonset,co,...,wind_speed,wind_degree,wind_dir,pressure,precip,humidity,cloudcover,feelslike,uv_index,visibility
0,New Delhi,28.6,77.2,21,143,2025-12-25 07:05:00,2025-12-25 17:26:00,2025-12-25 01:04:00,2025-12-25 13:06:00,1411.85,...,4,34,NE,1017,0.0,53,50,21,0,1
1,Mumbai,18.975,72.826,30,122,2025-12-25 07:03:00,2025-12-25 18:03:00,2025-12-25 01:20:00,2025-12-25 13:29:00,644.85,...,18,300,WNW,1011,0.0,35,0,32,0,4
2,Kolkata,22.57,88.37,21,143,2025-12-25 06:07:00,2025-12-25 16:54:00,2025-12-25 00:16:00,2025-12-25 12:23:00,457.85,...,8,3,N,1014,0.0,73,0,21,0,3
3,Chennai,13.083,80.283,26,143,2025-12-25 06:22:00,2025-12-25 17:44:00,2025-12-25 00:48:00,2025-12-25 13:00:00,275.85,...,19,31,NNE,1012,0.0,65,25,28,0,5
4,Bengaluru,12.983,77.583,24,113,2025-12-25 06:32:00,2025-12-25 17:55:00,2025-12-25 00:59:00,2025-12-25 13:11:00,243.85,...,9,76,ENE,1015,0.0,25,0,24,0,10
