In [1]:
!pip install pandas



# Import and Explore the Data
- Our data are purchased from [OpenWeather](https://home.openweathermap.org/history_bulks/new)
- Upon exploring the data, we find that the variable "temp_max" refers to the "maximum temperature within a large city or a megalopolis" at a given point in time [see API documentation](https://openweathermap.org/history#fields)
- Since we are interested in the maximum temperature experienced in Delhi, we use this field, along with relative humidity for our analysis

- Note that we use Google Drive for storing our data. If you prefer to use another file system/storage media, modify the two code blocks that follow to suit your needs

In [2]:
from google.colab import drive # ignore if not using Google Drive
drive.mount('/content/drive') # ignore if not using Google Drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json

# Path to JSON file - modify this according to where your data is placed
file_path = '/content/drive/MyDrive/Socratus/OW_central_delhi_du_historical_weather.json'

# Open and load the JSON file
with open(file_path, 'r') as file:
    data = json.load(file)

In [4]:
# light exploration of the data
data[0]

{'dt': 283996800,
 'dt_iso': '1979-01-01 00:00:00 +0000 UTC',
 'timezone': 19800,
 'main': {'temp': 6.54,
  'temp_min': 5.76,
  'temp_max': 7.66,
  'feels_like': 5.14,
  'pressure': 1018,
  'humidity': 89,
  'dew_point': 4.86},
 'clouds': {'all': 0},
 'weather': [{'id': 800,
   'main': 'Clear',
   'description': 'sky is clear',
   'icon': '01n'}],
 'wind': {'speed': 2.02, 'deg': 306},
 'lon': 77.22,
 'city_name': 'Central Delhi (DU)',
 'lat': 28.69}

In [5]:
# light exploration of the data
data[-1]

{'dt': 1724972400,
 'dt_iso': '2024-08-29 23:00:00 +0000 UTC',
 'timezone': 19800,
 'main': {'temp': 26,
  'temp_min': 25.27,
  'temp_max': 27.12,
  'feels_like': 26,
  'pressure': 1005,
  'humidity': 94,
  'dew_point': 24.96},
 'clouds': {'all': 20},
 'weather': [{'id': 701,
   'main': 'Mist',
   'description': 'mist',
   'icon': '50n'}],
 'visibility': 2500,
 'wind': {'speed': 1.54, 'deg': 90},
 'lon': 77.22,
 'city_name': 'Central Delhi (DU)',
 'lat': 28.69}

In [6]:
from datetime import datetime, timedelta
import pandas as pd

# Load the data into a DataFrame and immediately normalize nested dictionaries
df = pd.json_normalize(data)

# Remove unnecessary columns
columns_to_keep = ['dt_iso', 'timezone', 'main.temp', 'main.temp_min', 'main.temp_max', 'main.humidity']
df = df[columns_to_keep]

# Convert dt_iso and timezone to local date and time
df['dt_iso'] = df['dt_iso'].str.replace(' UTC', '')  # Remove " UTC" from dt_iso
df['datetime'] = pd.to_datetime(df['dt_iso'], format='%Y-%m-%d %H:%M:%S %z')  # Convert to datetime
df['local_datetime'] = df['datetime'] + pd.to_timedelta(df['timezone'], unit='s')  # Adjust with timezone
df['local_date'] = df['local_datetime'].dt.date  # Extract local date
df['local_time'] = df['local_datetime'].dt.time  # Extract local time

# Drop intermediate columns that are no longer needed
df = df.drop(columns=['dt_iso', 'datetime', 'local_datetime', 'timezone'])

# Rename columns for clarity
df.rename(columns={'main.temp': 'temp', 'main.temp_min': 'temp_min', 'main.temp_max': 'temp_max', 'main.humidity': 'humidity'}, inplace=True)

# View the final DataFrame with the desired columns
print(df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dt_iso'] = df['dt_iso'].str.replace(' UTC', '')  # Remove " UTC" from dt_iso


    temp  temp_min  temp_max  humidity  local_date local_time
0   6.54      5.76      7.66        89  1979-01-01   05:30:00
1   6.24      5.86      6.71        89  1979-01-01   06:30:00
2   6.09      5.80      6.45        88  1979-01-01   07:30:00
3  10.47     10.26     10.89        70  1979-01-01   08:30:00
4  18.20     17.83     18.65        57  1979-01-01   09:30:00


In [7]:
# Check if all values in 'temp' are within the range defined by 'temp_min' and 'temp_max'
((df['temp'] >= df['temp_min']) & (df['temp'] <= df['temp_max'])).all()

True

# Compute Heat Index based on Temperature and Relative Humidity
- We use the Python package `metpy` for this task
- The `metpy` package [states](https://unidata.github.io/MetPy/latest/api/generated/metpy.calc.heat_index.html):
"The implementation uses the formula outlined in [Rothfusz1990](https://unidata.github.io/MetPy/latest/api/references.html#rothfusz1990), which is a multi-variable least-squares regression of the values obtained in [Steadman1979](https://unidata.github.io/MetPy/latest/api/references.html#steadman1979). Additional conditional corrections are applied to match what the National Weather Service operationally uses. See Figure 3 of [Anderson2013](https://unidata.github.io/MetPy/latest/api/references.html#anderson2013) for a depiction of this algorithm and further discussion."
- It is well known that Heat Index does not apply for temperatures lower than $26.67°C$ or $80°F$; this is mentioned in `metpy` documentation as well
- The US government considers heat indices over $126$ (using temperature in $°F$) or $52.4$ (using temperature in $°C$) to be "extremely dangerous"; in the absence of any Delhi or India specific guidance around heat indices, we use the US guidance as a yardstick

In [8]:
!pip install metpy
import metpy



In [9]:
from metpy.calc import heat_index
from metpy.units import units
import numpy as np

# Initialize heat index column with NaN
df['heat_index'] = np.nan

# Create a mask for temperatures >= 26.67°C (80°F)
mask = df['temp_max'] >= 26.67

# Apply heat index calculation only for temperatures >= 26.67°C
valid_temps = df.loc[mask, 'temp_max'].values * units.degC
valid_humidity = df.loc[mask, 'humidity'].values * units.percent
df.loc[mask, 'heat_index'] = heat_index(valid_temps, valid_humidity).magnitude

In [10]:
heat_index(35.55 * units.degC, 70 * units.percent)

0,1
Magnitude,[52.405304219220284]
Units,degree_Celsius


In [11]:
df.describe()

Unnamed: 0,temp,temp_min,temp_max,humidity,heat_index
count,400272.0,400272.0,400272.0,400272.0,204677.0
mean,24.886713,24.443189,25.387065,62.535024,35.894135
std,8.28993,8.279948,8.285214,22.4204,5.508186
min,1.93,1.1,2.1,2.0,25.840052
25%,18.57,18.1,19.1,45.0,31.558733
50%,26.43,26.05,26.96,65.0,35.393983
75%,30.9,30.39,31.34,82.0,39.727909
max,46.64,46.42,48.1,100.0,84.850946


## Maximum Temperature Over the Years

In [12]:
# Max Daily Temperature and Heat Index - Delhi
max_daily_df = df.groupby('local_date').agg({
    'temp_max': 'max',
    'heat_index': 'max'
}).reset_index()

# Rename columns
max_daily_df.columns = ['local_date', 'max_temp_max', 'max_heat_index']

# Ensure 'local_date' is in datetime format
max_daily_df['local_date'] = pd.to_datetime(max_daily_df['local_date'])

# Display the result
print(max_daily_df)

      local_date  max_temp_max  max_heat_index
0     1979-01-01         22.96             NaN
1     1979-01-02         22.96             NaN
2     1979-01-03         23.78             NaN
3     1979-01-04         22.80             NaN
4     1979-01-05         22.73             NaN
...          ...           ...             ...
16674 2024-08-26         34.05       46.617969
16675 2024-08-27         33.50       47.019186
16676 2024-08-28         32.58       43.820219
16677 2024-08-29         31.09       46.212219
16678 2024-08-30         27.75       34.115906

[16679 rows x 3 columns]


In [13]:
!pip install plotly
import plotly.graph_objects as go



In [14]:
max_daily_df_sorted = max_daily_df.sort_values('local_date', ascending=True) # Sort by date

# Create a Plotly figure
fig = go.Figure()

# Add daily maximum temperature line plot with red color
fig.add_trace(go.Scatter(
    x=max_daily_df_sorted['local_date'],
    y=max_daily_df_sorted['max_temp_max'],
    mode='lines',
    name='Max Temp',
    line=dict(color='orange'),  # Change line color to orange
    hovertemplate='Date: %{x}<br>Max Temp: %{y:.2f}°C<extra></extra>'
))

# Add a horizontal span for heat waves (45°C to 50°C)
fig.add_shape(
    type='rect',
    x0=min(max_daily_df_sorted['local_date']), x1=max(max_daily_df_sorted['local_date']),
    y0=45, y1=50,
    line=dict(color='firebrick', width=0),  # No border for the span
    fillcolor='rgba(255,0,0,0.2)'  # Red with transparency
)

# Add annotation for heat wave, aligned to left
fig.add_annotation(
    x=datetime.strptime('1979-01-01', "%Y-%m-%d"),  # Align to the left of the plot
    y=47,
    text="HEAT WAVE (IMD Definition)",
    showarrow=False,
    font=dict(color="firebrick", size=12),
    xanchor='left',
    align='left'
)

# Add a dashed black rectangle to highlight the past decade (2014-2024)
rect_start_dt = datetime.strptime('2014-01-01', "%Y-%m-%d")
rect_end_dt = max_daily_df_sorted['local_date'].iloc[-1]

fig.add_shape(
    type="rect",
    x0=rect_start_dt, x1=rect_end_dt,
    y0=44, y1=48.5,
    line=dict(color='black', width=2, dash='dash'),  # Change rect line color to black with dashed style
    fillcolor='rgba(0,0,0,0)'  # No fill color
)

# Add annotation for the rectangle (2014-2024)
fig.add_annotation(
    x=datetime.strptime('2022-01-01', "%Y-%m-%d"),
    y=47.5,
    text="2014-2024",
    showarrow=False,
    font=dict(color="black", size=8),
    align='center'
)

# Update the layout of the figure
fig.update_layout(
    title='Daily Maximum Temperature<br>(from Jan 1, 1980 - Aug 29, 2024 - Delhi NCT)',
    xaxis_title='Date',
    yaxis_title='Maximum Temperature (°C)',
    yaxis=dict(range=[9, 49]),  # Set y-axis range from 9°C to 49°C
    xaxis=dict(
        range=[max_daily_df_sorted['local_date'].min(), max_daily_df_sorted['local_date'].max()],  # Ensure the line starts right
        tickformat='%d-%b-%Y',  # Display day-month-year on the x-axis
        tickangle=-45  # Rotate x-axis labels by 45 degrees
    ),
    hovermode='x unified'
)

# Display the interactive plot
fig.show()

In [15]:
import plotly.io as pio

# Save the plot as an interactive HTML file
pio.write_html(fig, file='daily_temp_max.html', full_html=True)

## Identifying Heat Wave Days (IMD definition) and Extreme Danger Days (US Govt. guidelines)

In [16]:
# Filter the Data for Heat Wave Days
heat_wave_days = max_daily_df[max_daily_df['max_temp_max'] >= 45].copy()

# Extract Year from 'local_date'
heat_wave_days.loc[:, 'Year'] = heat_wave_days['local_date'].dt.year

# Group by Year and Aggregate
heat_wave_summary = heat_wave_days.groupby('Year').agg(
    Number_of_Heat_Wave_Days=('local_date', 'count'),
    Dates=('local_date', lambda dates: ', '.join(dates.dt.strftime('%d-%m')))
).reset_index()
heat_wave_summary

Unnamed: 0,Year,Number_of_Heat_Wave_Days,Dates
0,1984,2,"24-05, 25-05"
1,1987,1,07-06
2,1988,6,"13-05, 26-05, 27-05, 28-05, 29-05, 30-05"
3,1993,1,11-06
4,1994,3,"29-05, 30-05, 08-06"
5,1995,12,"31-05, 01-06, 02-06, 03-06, 04-06, 05-06, 06-0..."
6,1998,9,"22-05, 23-05, 24-05, 25-05, 26-05, 27-05, 28-0..."
7,1999,1,01-05
8,2002,8,"09-05, 10-05, 11-05, 12-05, 14-05, 17-05, 18-0..."
9,2003,3,"02-06, 03-06, 04-06"


In [17]:
print(f"Heat Wave Days in 2024: {heat_wave_summary['Dates'][heat_wave_summary['Year'] == 2024].item()}")

Heat Wave Days in 2024: 17-05, 19-05, 26-05, 27-05, 28-05, 29-05, 30-05, 31-05, 12-06, 13-06, 15-06, 16-06, 17-06


In [18]:
# Filter the Data for Extreme Danger Days
ext_danger_days = max_daily_df[max_daily_df['max_heat_index'] >= 52.4].copy()

# Extract Year from 'local_date'
ext_danger_days.loc[:, 'Year'] = ext_danger_days['local_date'].dt.year

# Group by Year and Aggregate
ext_danger_summary = ext_danger_days.groupby('Year').agg(
    Number_of_Extreme_Danger_Days=('local_date', 'count'),
    Dates=('local_date', lambda dates: ', '.join(dates.dt.strftime('%d-%m')))
).reset_index()
ext_danger_summary

Unnamed: 0,Year,Number_of_Extreme_Danger_Days,Dates
0,1979,4,"14-05, 28-06, 29-06, 09-07"
1,1980,1,19-06
2,1981,2,"22-06, 23-06"
3,1982,3,"07-06, 07-07, 18-07"
4,1984,3,"29-05, 15-06, 17-06"
5,1985,2,"06-06, 29-06"
6,1986,3,"19-06, 20-06, 23-06"
7,1987,2,"07-06, 09-08"
8,1988,1,05-07
9,1989,4,"24-05, 13-06, 09-07, 12-07"


In [19]:
print(f"Extreme Danger Days in 2024: {ext_danger_summary['Dates'][ext_danger_summary['Year'] == 2024].item()}")

Extreme Danger Days in 2024: 22-05, 23-06, 11-07, 19-07, 23-07, 25-07, 27-07, 28-07, 30-07, 31-07, 15-08, 16-08, 17-08, 19-08, 24-08


In [20]:
# Separate the comma-separated 'Dates' into (mathematical) sets for each Year
heat_wave_summary['Heat_Wave_Dates'] = heat_wave_summary['Dates'].apply(
    lambda x: set(x.split(', ')) if pd.notna(x) else set())
ext_danger_summary['Extreme_Danger_Dates'] = ext_danger_summary['Dates'].apply(
    lambda x: set(x.split(', ')) if pd.notna(x) else set())

# Merge the two DataFrames on 'Year'
merged_df = pd.merge(heat_wave_summary[['Year', 'Heat_Wave_Dates']],
                     ext_danger_summary[['Year', 'Extreme_Danger_Dates']],
                     on='Year', how='outer')

# Calculate the number of unique Heat Wave and Extreme Danger days
def calculate_days(row):
    heat_wave_days = row['Heat_Wave_Dates'] if \
      pd.notna(row['Heat_Wave_Dates']) else set()
    extreme_danger_days = row['Extreme_Danger_Dates'] if \
      pd.notna(row['Extreme_Danger_Dates']) else set()

    # Ensure any day which is both a Heat Wave day and an Extreme Danger day is
    # counted only as only as one of the two; we choose to call such a day a
    # Heat Wave day alone
    heat_wave_only_days = heat_wave_days
    extreme_danger_only_days = extreme_danger_days - heat_wave_days

    return len(heat_wave_only_days), len(extreme_danger_only_days)

# Apply the logic to calculate the number of Heat Wave and Extreme Danger days
merged_df[['Heat_Wave_Days', 'Extreme_Danger_Days']] = \
  merged_df.apply(calculate_days, axis=1, result_type='expand')

In [21]:
fig = go.Figure()

# Add Heat Wave Days trace
fig.add_trace(go.Bar(
    x=merged_df['Year'],
    y=merged_df['Heat_Wave_Days'],
    name='Heat Wave Days',
    marker_color='orange',
    hovertemplate='<b>Year: %{x}</b><br>Heat Wave Days: %{y}<extra></extra>'
))

# Add Extreme Danger Days trace
fig.add_trace(go.Bar(
    x=merged_df['Year'],
    y=merged_df['Extreme_Danger_Days'],
    name='Extreme Danger Days',
    marker_color='red',
    hovertemplate='<b>Year: %{x}</b><br>Extreme Danger Days: %{y}<extra></extra>',
    offsetgroup=0,
    base=merged_df['Heat_Wave_Days']
))

# Add dashed rectangle covering 2014-2024
fig.add_shape(
    type='rect',
    x0=2013.5, x1=2024.5,  # Set the start and end of the rectangle for years
    y0=0, y1=max(merged_df['Heat_Wave_Days'] + merged_df['Extreme_Danger_Days']) + 1,  # Cover the y-axis range
    line=dict(color='black', width=2, dash='dash'),  # Dashed line style
    fillcolor='rgba(0,0,0,0)'  # No fill color
)

# Add the text "2014-2024" in the middle of the rectangle
fig.add_annotation(
    x=2022.5,
    y=max(merged_df['Heat_Wave_Days'] + merged_df['Extreme_Danger_Days']),  # Position above the bars
    text="2014-2024",
    showarrow=False,
    font=dict(color="black", size=8),
    align='left'
)

# Update layout
fig.update_layout(
    barmode='stack',
    title='Annual Number of Heat Wave and Extreme Danger Days per Year (Stacked)<br>(from Jan 1, 1980 - Aug 29, 2024 - Delhi NCT)',
    xaxis_title='Year',
    yaxis_title='Number of Heat Wave and Extreme Danger Days',
    hovermode="x unified",
    legend=dict(
        x=0.02,  # Horizontal position (from left)
        y=0.98,  # Vertical position (from top)
        bgcolor='rgba(255, 255, 255, 0.6)',  # Semi-transparent background for readability
        bordercolor='Black',
        borderwidth=1
    )
)

# Display the interactive plot
fig.show()

In [22]:
# Save the plot as an interactive HTML file
pio.write_html(fig, file='num_hw_ed_days_per_year.html', full_html=True)