In [1]:
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# force_remount=True is passed on every run of this cell.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [3]:
# Raw weather export on the mounted shared drive.
file_path = "/content/drive/Shared drives/Hackathon/data/weather_data.csv"

# Read the whole CSV into the working DataFrame `df` used by the later cells.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207264 entries, 0 to 207263
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt                   207264 non-null  int64  
 1   dt_iso               207264 non-null  object 
 2   timezone             207264 non-null  int64  
 3   city_name            207264 non-null  object 
 4   lat                  207264 non-null  float64
 5   lon                  207264 non-null  float64
 6   temp                 207264 non-null  float64
 7   visibility           0 non-null       float64
 8   dew_point            207264 non-null  float64
 9   feels_like           207264 non-null  float64
 10  temp_min             207264 non-null  float64
 11  temp_max             207264 non-null  float64
 12  pressure             207264 non-null  int64  
 13  sea_level            0 non-null       float64
 14  grnd_level           0 non-null       float64
 15  humidity         

In [5]:
# Plain-text preview of the first five rows (head() defaults to n=5).
print(df.head(n=5))

          dt                         dt_iso  timezone        city_name  \
0  915148800  1999-01-01 00:00:00 +0000 UTC      3600  Custom location   
1  915152400  1999-01-01 01:00:00 +0000 UTC      3600  Custom location   
2  915156000  1999-01-01 02:00:00 +0000 UTC      3600  Custom location   
3  915159600  1999-01-01 03:00:00 +0000 UTC      3600  Custom location   
4  915163200  1999-01-01 04:00:00 +0000 UTC      3600  Custom location   

         lat       lon  temp  visibility  dew_point  feels_like  ...  \
0  48.873492  2.295104  8.33         NaN       3.39        5.28  ...   
1  48.873492  2.295104  8.08         NaN       3.54        5.18  ...   
2  48.873492  2.295104  8.08         NaN       4.11        5.38  ...   
3  48.873492  2.295104  7.31         NaN       3.73        4.42  ...   
4  48.873492  2.295104  6.91         NaN       3.53        4.00  ...   

   wind_gust  rain_1h  rain_3h  snow_1h  snow_3h  clouds_all  weather_id  \
0        NaN      NaN      NaN      NaN      N

In [7]:
# Check the first few values of dt_iso before filtering
print("First few values of dt_iso before filtering:")
print(df["dt_iso"].head())

# Check if there are any valid dates in 2018-2022.
# The upper bound is exclusive ("< 2023-01-01") on purpose: while dt_iso is
# still text, "2022-12-31 00:00:00 ..." sorts AFTER "2022-12-31", so the old
# "<= 2022-12-31" test dropped every row of the last day of the window.
in_window = (df["dt_iso"] >= "2018-01-01") & (df["dt_iso"] < "2023-01-01")
valid_dates = df[in_window]
print(f"\nNumber of rows within 2018-2022: {len(valid_dates)}")


First few values of dt_iso before filtering:
0    1999-01-01 00:00:00 +0000 UTC
1    1999-01-01 01:00:00 +0000 UTC
2    1999-01-01 02:00:00 +0000 UTC
3    1999-01-01 03:00:00 +0000 UTC
4    1999-01-01 04:00:00 +0000 UTC
Name: dt_iso, dtype: object

Number of rows within 2018-2022: 40704


In [8]:
# Quick diagnostics on the dt_iso column before parsing it.
dt_iso_col = df["dt_iso"]

# Storage dtype of the column
print("\nData type of dt_iso column:", dt_iso_col.dtype)

# Number of null entries
null_count = dt_iso_col.isnull().sum()
print("\nMissing values in dt_iso column:", null_count)

# Distinct 6-character endings, as a rough probe of the timestamp suffix
trailing_suffixes = dt_iso_col.astype(str).str[-6:].unique()
print("\nUnique formats in dt_iso:", trailing_suffixes)



Data type of dt_iso column: object

Missing values in dt_iso column: 0

Unique formats in dt_iso: ['00 UTC']


In [9]:
# Remove the trailing " +HHMM UTC" suffix (if present).
# astype(str) keeps this cell re-runnable: once dt_iso is datetime64 the .str
# accessor would raise AttributeError, whereas the string form simply has no
# suffix left to strip.
# NOTE(review): the offset digits are discarded, not applied -- fine while every
# row carries +0000; confirm that holds for the whole file.
dt_iso_text = df["dt_iso"].astype(str).str.replace(r"\s\+\d{4} UTC", "", regex=True)

# Convert to datetime format; unparseable values become NaT (errors="coerce")
df["dt_iso"] = pd.to_datetime(dt_iso_text, errors="coerce")

# Check if filtering works again.
# Exclusive upper bound: "<= 2022-12-31" on datetimes keeps only midnight of
# 31 Dec and silently drops the remaining hours of that day.
in_window = (df["dt_iso"] >= "2018-01-01") & (df["dt_iso"] < "2023-01-01")
df_filtered = df[in_window]
print(f"\nNumber of rows after fixing dt_iso: {len(df_filtered)}")



Number of rows after fixing dt_iso: 40704


In [10]:
### Step 0: Filter dataset for 2018-2022 ###
import pandas as pd

# Parse dt_iso only if it is not already datetime64. Converting an
# already-parsed column to str and back is a slow no-op on ~200k rows.
if not pd.api.types.is_datetime64_any_dtype(df["dt_iso"]):
    df["dt_iso"] = pd.to_datetime(df["dt_iso"].astype(str), format="mixed", errors="coerce")

# Define the start and end date for filtering (both calendar days inclusive)
start_date = "2018-01-01"
end_date = "2022-12-31"

# Compare against the day AFTER end_date with "<": a "<= end_date" test keeps
# only 00:00 of the last day and drops its other 23 hourly rows.
end_exclusive = str((pd.Timestamp(end_date) + pd.Timedelta(days=1)).date())

# Apply the filter. .copy() detaches the result from `df`, so later column
# assignments on df_filtered do not trigger SettingWithCopyWarning.
in_window = (df["dt_iso"] >= start_date) & (df["dt_iso"] < end_exclusive)
df_filtered = df.loc[in_window].copy()

# Print the first few rows of the filtered dataset
print(df_filtered.head())

                dt              dt_iso  timezone        city_name        lat  \
166560  1514764800 2018-01-01 00:00:00      3600  Custom location  48.873492   
166561  1514768400 2018-01-01 01:00:00      3600  Custom location  48.873492   
166562  1514772000 2018-01-01 02:00:00      3600  Custom location  48.873492   
166563  1514775600 2018-01-01 03:00:00      3600  Custom location  48.873492   
166564  1514779200 2018-01-01 04:00:00      3600  Custom location  48.873492   

             lon  temp  visibility  dew_point  feels_like  ...  wind_gust  \
166560  2.295104  8.94         NaN       4.56        5.21  ...        NaN   
166561  2.295104  8.07         NaN       3.72        4.09  ...        NaN   
166562  2.295104  7.76         NaN       3.42        3.81  ...        NaN   
166563  2.295104  7.57         NaN       3.43        3.63  ...        NaN   
166564  2.295104  6.86         NaN       3.30        2.88  ...        NaN   

        rain_1h  rain_3h  snow_1h  snow_3h  clouds_all  

In [13]:
### Clear out the useless column
# Inspect which distinct values snow_3h holds in the filtered window
snow_3h_column = df_filtered["snow_3h"]
unique_values = snow_3h_column.unique()
print(unique_values)

[nan]


In [None]:
# Columns removed from the working frame.
# NOTE(review): visibility and snow_3h are also entirely null in the filtered
# window (40704 missing each) but are kept here -- confirm that is intended.
columns_to_remove = ["timezone", "city_name", "lat", "lon", "sea_level", "grnd_level"]

# Drop the specified columns. errors="ignore" skips labels that are already
# gone, so re-running this cell after the first drop no longer raises KeyError.
df_filtered = df_filtered.drop(columns=columns_to_remove, errors="ignore")

# Print the first few rows to verify
print(df_filtered.head())

                dt              dt_iso  temp  visibility  dew_point  \
166560  1514764800 2018-01-01 00:00:00  8.94         NaN       4.56   
166561  1514768400 2018-01-01 01:00:00  8.07         NaN       3.72   
166562  1514772000 2018-01-01 02:00:00  7.76         NaN       3.42   
166563  1514775600 2018-01-01 03:00:00  7.57         NaN       3.43   
166564  1514779200 2018-01-01 04:00:00  6.86         NaN       3.30   

        feels_like  temp_min  temp_max  pressure  humidity  ...  wind_gust  \
166560        5.21      7.70      9.60      1007        74  ...        NaN   
166561        4.09      7.70      8.60      1008        74  ...        NaN   
166562        3.81      7.31      7.94      1009        74  ...        NaN   
166563        3.63      6.74      7.94      1010        75  ...        NaN   
166564        2.88      5.74      7.60      1010        78  ...        NaN   

        rain_1h  rain_3h  snow_1h  snow_3h  clouds_all  weather_id  \
166560      NaN      NaN      NaN 

In [None]:
# Lookup table: weather_icon code -> human-readable label.
# Codes ending in "d" / "n" are listed in pairs for each icon number.
weather_icon_map = {
    # clear / clouds
    "01d": "Clear sky (day)",
    "01n": "Clear sky (night)",
    "02d": "Few clouds (day)",
    "02n": "Few clouds (night)",
    "03d": "Scattered clouds",
    "03n": "Scattered clouds (night)",
    "04d": "Broken clouds",
    "04n": "Broken clouds (night)",
    # precipitation / storms
    "09d": "Shower rain",
    "09n": "Shower rain (night)",
    "10d": "Rain",
    "10n": "Rain (night)",
    "11d": "Thunderstorm",
    "11n": "Thunderstorm (night)",
    "13d": "Snow",
    "13n": "Snow (night)",
    # reduced visibility
    "50d": "Mist/Fog",
    "50n": "Mist/Fog (night)",
}

# Translate every icon code through the table; codes missing from the table
# come out as NaN in the new column.
icon_codes = df_filtered["weather_icon"]
df_filtered["weather_description_mapped"] = icon_codes.map(weather_icon_map)

# Show code and label side by side for a quick sanity check
preview_columns = ["weather_icon", "weather_description_mapped"]
print(df_filtered[preview_columns].head())

       weather_icon weather_description_mapped
166560          02n         Few clouds (night)
166561          01n          Clear sky (night)
166562          03n   Scattered clouds (night)
166563          04n      Broken clouds (night)
166564          04n      Broken clouds (night)


In [None]:
### Step 1: Check for missing values ###
# Boolean null mask -> per-column tally -> grand total over all columns.
null_mask = df_filtered.isna()
missing_counts = null_mask.sum(axis=0)
total_missing = missing_counts.sum()

print("Missing values per column before handling:\n", missing_counts)
print(f"Total missing values: {total_missing}")

Missing values per column before handling:
 dt                                0
dt_iso                            0
temp                              0
visibility                    40704
dew_point                         0
feels_like                        0
temp_min                          0
temp_max                          0
pressure                          0
humidity                          0
wind_speed                        0
wind_deg                          0
wind_gust                     40464
rain_1h                       35042
rain_3h                       40677
snow_1h                       40484
snow_3h                       40704
clouds_all                        0
weather_id                        0
weather_main                      0
weather_description               0
weather_icon                      0
weather_description_mapped        0
dtype: int64
Total missing values: 238075


In [None]:
### Step 2: Check for missing dates ###
# Convert date_col to datetime (no-op when the column is already datetime64)
date_col = "dt_iso"
df_filtered[date_col] = pd.to_datetime(df_filtered[date_col])

# Detect missing timestamps ###
# Generate the full expected range at HOURLY resolution. The previewed rows are
# one hour apart, while date_range defaults to daily frequency: the daily grid
# only checks one timestamp per day and cannot see gaps in the other hours.
date_range = pd.date_range(
    start=df_filtered[date_col].min(),
    end=df_filtered[date_col].max(),
    freq=pd.Timedelta(hours=1),
)

# Expected timestamps that never occur in the data, in chronological order
existing_dates = set(df_filtered[date_col])
missing_dates = sorted(set(date_range) - existing_dates)

# Print missing dates info
print(f"Total missing dates: {len(missing_dates)}")
if missing_dates:
    print(pd.DataFrame({"Missing Dates": missing_dates}).to_string(index=False))  # Print all missing dates

Total missing dates: 0


In [None]:
### Step 3: Check for negative values (TODO: check not implemented yet)

In [None]:
# DataFrame.info() prints the summary itself and returns None; calling it
# directly avoids the extra "None" line that print(...) would add.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 166560 to 207263
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   dt                          40704 non-null  int64         
 1   dt_iso                      40704 non-null  datetime64[ns]
 2   temp                        40704 non-null  float64       
 3   visibility                  0 non-null      float64       
 4   dew_point                   40704 non-null  float64       
 5   feels_like                  40704 non-null  float64       
 6   temp_min                    40704 non-null  float64       
 7   temp_max                    40704 non-null  float64       
 8   pressure                    40704 non-null  int64         
 9   humidity                    40704 non-null  int64         
 10  wind_speed                  40704 non-null  float64       
 11  wind_deg                    40704 non-null  int64    

In [None]:
# Show a heading followed by the first rows of the cleaned frame
print("Cleaned Dataset:", df_filtered.head(), sep="\n")

Cleaned Dataset:
                dt              dt_iso  temp  visibility  dew_point  \
166560  1514764800 2018-01-01 00:00:00  8.94         NaN       4.56   
166561  1514768400 2018-01-01 01:00:00  8.07         NaN       3.72   
166562  1514772000 2018-01-01 02:00:00  7.76         NaN       3.42   
166563  1514775600 2018-01-01 03:00:00  7.57         NaN       3.43   
166564  1514779200 2018-01-01 04:00:00  6.86         NaN       3.30   

        feels_like  temp_min  temp_max  pressure  humidity  ...  rain_1h  \
166560        5.21      7.70      9.60      1007        74  ...      NaN   
166561        4.09      7.70      8.60      1008        74  ...      NaN   
166562        3.81      7.31      7.94      1009        74  ...      NaN   
166563        3.63      6.74      7.94      1010        75  ...      NaN   
166564        2.88      5.74      7.60      1010        78  ...      NaN   

        rain_3h  snow_1h  snow_3h  clouds_all  weather_id  weather_main  \
166560      NaN      NaN

In [None]:
### Step 5: Export the cleaned dataset ###
import os

cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/weather_data.csv"

# to_csv does not create missing folders; make sure the target directory
# exists so a first run on a fresh drive does not fail with OSError.
os.makedirs(os.path.dirname(cleaned_file_path), exist_ok=True)

# index=False leaves the DataFrame index out of the written file
df_filtered.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/weather_data.csv
