In [1]:
# ============================================================
# Weather Trend Forecasting: 01 Data Overview (Fixed)
# ============================================================

# Step 0: Fix module import path for 'src' folder
import sys
import os

# Resolve the project root as the parent of the current working directory,
# so the 'src' folder can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell would otherwise push a duplicate
# entry onto sys.path on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)
print("Project root added to sys.path:", project_root)

# Step 1: Imports
import pandas as pd
from IPython.display import display

# Step 2: Load Dataset
# Build the CSV location from project_root rather than a machine-specific
# absolute path, so the notebook runs from any checkout of the project
# (expects <project_root>/data/raw/GlobalWeatherRepository.csv).
csv_path = os.path.join(project_root, "data", "raw", "GlobalWeatherRepository.csv")

# Fail early with an explicit message if the file is absent; isfile() also
# rejects a directory that happens to exist at this path.
if not os.path.isfile(csv_path):
    raise FileNotFoundError(f"Dataset not found at {csv_path}.")

df = pd.read_csv(csv_path)

# Step 2a: Parse 'last_updated' as datetimes; values that cannot be parsed
# are coerced to NaT instead of raising.
df["last_updated"] = pd.to_datetime(df["last_updated"], errors="coerce")

# Count rows whose date is missing after conversion and report them, if any
num_missing_dates = df["last_updated"].isna().sum()
if num_missing_dates:
    print(f"Warning: {num_missing_dates} rows have invalid 'last_updated' dates.")

# Step 2b: Guarantee the columns read by the coverage summary are present
required_columns = ["location_name", "country"]
for col in required_columns:
    if col in df.columns:
        continue
    # Column is absent: warn and create it with a placeholder value
    print(f"Warning: '{col}' column missing. Filling with 'Unknown'.")
    df[col] = "Unknown"

# Step 3: Confirm the load and preview the first rows
print("Dataset loaded successfully!", f"Shape: {df.shape}\n", sep="\n")
display(df.head())

# Step 4: Per-column dtypes and non-null counts, then summary statistics
print("\n--- Dataset Info ---")
df.info()

print("\n--- Dataset Description ---")
display(df.describe())

# Step 5: Time span and geographic breadth covered by the dataset
print("\n--- Dataset Coverage ---")
last_updated = df["last_updated"]
coverage_lines = (
    ("Start date", last_updated.min()),
    ("End date", last_updated.max()),
    ("Number of locations", df["location_name"].nunique()),
    ("Number of countries", df["country"].nunique()),
)
for label, value in coverage_lines:
    print(f"{label}: {value}")

Project root added to sys.path: c:\Users\kumar\OneDrive\Desktop\Pm accelerator\weather-trend-forecast
Dataset loaded successfully!
Shape: (97629, 41)



Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,2024-05-16 13:15:00,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,04:50 AM,06:50 PM,12:12 PM,01:11 AM,Waxing Gibbous,55
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,2024-05-16 10:45:00,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,05:21 AM,07:54 PM,12:58 PM,02:14 AM,Waxing Gibbous,55
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,2024-05-16 09:45:00,23.0,73.4,Sunny,...,10.4,18.4,1,1,05:40 AM,07:50 PM,01:15 PM,02:14 AM,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,1715849100,2024-05-16 10:45:00,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,06:31 AM,09:11 PM,02:12 PM,03:31 AM,Waxing Gibbous,55
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,2024-05-16 09:45:00,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,06:12 AM,05:55 PM,01:17 PM,12:38 AM,Waxing Gibbous,55



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97629 entries, 0 to 97628
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   country                       97629 non-null  object        
 1   location_name                 97629 non-null  object        
 2   latitude                      97629 non-null  float64       
 3   longitude                     97629 non-null  float64       
 4   timezone                      97629 non-null  object        
 5   last_updated_epoch            97629 non-null  int64         
 6   last_updated                  97629 non-null  datetime64[ns]
 7   temperature_celsius           97629 non-null  float64       
 8   temperature_fahrenheit        97629 non-null  float64       
 9   condition_text                97629 non-null  object        
 10  wind_mph                      97629 non-null  float64       
 11  wind_k

Unnamed: 0,latitude,longitude,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,wind_mph,wind_kph,wind_degree,pressure_mb,...,gust_kph,air_quality_Carbon_Monoxide,air_quality_Ozone,air_quality_Nitrogen_dioxide,air_quality_Sulphur_dioxide,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,moon_illumination
count,97629.0,97629.0,97629.0,97629,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0,...,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0,97629.0
mean,19.146655,22.11187,1737585000.0,2025-01-23 00:38:03.965829888,22.786695,73.01778,8.239894,13.264101,170.879646,1013.971187,...,18.551144,517.11278,62.741021,15.813762,11.259342,26.146261,53.710302,1.760266,2.760399,49.698512
min,-41.3,-175.2,1715849000.0,2024-05-16 01:45:00,-24.9,-12.8,2.2,3.6,1.0,947.0,...,3.6,-9999.0,0.0,0.0,-9999.0,0.168,-1848.15,1.0,1.0,0.0
25%,3.75,-6.8361,1726743000.0,2024-09-19 16:30:00,18.1,64.6,4.0,6.5,83.0,1010.0,...,10.5,237.0,42.0,1.295,0.8,7.4,10.9,1.0,1.0,15.0
50%,17.25,23.3167,1737628000.0,2025-01-23 11:30:00,25.0,77.0,6.9,11.2,165.0,1013.0,...,15.9,327.45,60.0,4.995,2.405,15.17,22.1,1.0,2.0,49.0
75%,40.4,50.58,1748422000.0,2025-05-28 13:15:00,28.3,82.9,11.4,18.4,256.0,1018.0,...,24.4,507.4,79.0,17.945,9.065,29.97,46.065,2.0,3.0,84.0
max,64.15,179.22,1759219000.0,2025-09-30 21:00:00,49.2,120.6,1841.2,2963.2,360.0,3006.0,...,2970.4,38879.398,480.7,427.7,521.33,1614.1,6037.29,6.0,10.0,100.0
std,24.457074,65.819474,12547030.0,,8.891782,16.00502,7.876606,12.673783,102.722987,11.374869,...,14.675374,843.98154,32.248805,26.05942,40.768717,40.79936,164.028211,0.981358,2.566658,35.055113



--- Dataset Coverage ---
Start date: 2024-05-16 01:45:00
End date: 2025-09-30 21:00:00
Number of locations: 254
Number of countries: 211
