In [32]:
import os
import pandas as pd

# Directory containing the CSV files
directory = 'draft-final-data'

# Dictionary to store the DataFrames
weather_stations = {}

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        
        # Store the DataFrame in the dictionary
        weather_stations[int(filename[:-4])] = df

# Display the keys of the dictionary to verify
print(weather_stations.keys())

dict_keys([72790024141, 72785524114, 72789094197, 72793024233, 72785794129, 72220804224, 72793494248, 72788594266, 72797624217, 72074924255, 72785024157, 72797094240, 72027294282, 72798594276, 72792424223, 74207124201, 72792894263, 72781024243, 72784624160, 72781524237, 72788324220, 72698824219, 72793894274, 74206024207, 72782724110, 72793724222, 72792594227, 72782594239, 72794504205, 72792394225, 72784524163, 72792024227, 72785694176, 72025400119])


In [33]:
# Pick one dataframe from the dictionary
sample_df = weather_stations[72790024141]

# Show the column names and their data types
print(sample_df.dtypes)

DATE                         object
STATION                       int64
DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation           object
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyVisibility             float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindGustSpeed          float64
DailyWindSpeed              float64
dtype: object


In [34]:
# Create a copy of the dictionary
weather_stations_trim = {key: df.copy() for key, df in weather_stations.items()}

# Drop the specified columns from the copied dictionary
for key in weather_stations_trim:
    weather_stations_trim[key] = weather_stations_trim[key].drop(columns=['STATION', 'DATE'])

# Verify the columns have been dropped
sample_df = weather_stations_trim[72790024141]
print(sample_df.dtypes)

DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation           object
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyVisibility             float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindGustSpeed          float64
DailyWindSpeed              float64
dtype: object


In [35]:
# Initialize sets to store unique values
unique_precipitation = set()
unique_weather_type = set()
unique_sky_conditions = set()
unique_wind_direction = set()

# Iterate through each dataframe in the dictionary
for key, df in weather_stations_trim.items():
    unique_precipitation.update(df['DailyPrecipitation'].unique())
    unique_weather_type.update(df['DailyPresentWeatherType'].unique())
    unique_sky_conditions.update(df['DailySkyConditions'].unique())
    unique_wind_direction.update(df['DailyWindDirection'].unique())

# Convert sets to lists for better readability
unique_precipitation = list(unique_precipitation)
unique_weather_type = list(unique_weather_type)
unique_sky_conditions = list(unique_sky_conditions)
unique_wind_direction = list(unique_wind_direction)

# Display the unique values
print("Unique DailyPrecipitation:", unique_precipitation)
print("Unique DailyPresentWeatherType:", unique_weather_type)
print("Unique DailySkyConditions:", unique_sky_conditions)
print("Unique DailyWindDirection:", unique_wind_direction)

Unique DailyPrecipitation: [0.135, 0.03125, 0.0381818181818181, 0.0525, '0.0', 0.0845454545454545, 0.0642857142857143, 0.0792857142857142, 0.105, 0.0385714285714285, 0.0392307692307692, 0.0194444444444444, 0.0474999999999999, 0.0325, 0.0111111111111111, 0.0933333333333333, 0.0225, 0.0592857142857142, 0.0185714285714285, '0.07', 0.0314285714285714, 0.075, 0.0833333333333333, '0.09', 0.0125, 0.065, 0.03375, 0.0633333333333333, 0.0699999999999999, 0.2, 0.0799999999999999, '0.04', 0.055, 0.0369230769230769, 0.057, 0.0553333333333333, 0.0616666666666666, 0.045, 0.0961538461538461, 0.0513333333333333, 0.047, 0.0433333333333333, 'T', '0.00', nan, 0.0333333333333333, 0.035, 0.0528571428571428, '0.06', 0.025, 0.023, 0.0319999999999999, 0.0379999999999999, 0.0233333333333333, 0.015, 0.04625, 0.0557142857142857, 0.0428571428571428, 0.0216666666666666, '0.16', '0.13', 0.0133333333333333, '0', 0.03625, 0.0328571428571428, 0.0457142857142857, 0.0357142857142857, 0.132, 0.0855555555555555, 0.0675, 0.

In [36]:
import numpy as np

def convert_to_radians(value):
    try:
        # Convert to int
        value_int = int(value)
        # Convert degrees to radians
        return np.deg2rad(value_int)
    except (ValueError, TypeError):
        # If conversion fails, return -1
        return -1

# Create a copy of the dictionary
ws_trim_stage_1 = {key: df.copy() for key, df in weather_stations_trim.items()}

# Apply the conversion to each dataframe in the copied dictionary
for key, df in ws_trim_stage_1.items():
    df['DailyWindDirection'] = df['DailyWindDirection'].apply(convert_to_radians)

# Verify the conversion
sample_df = ws_trim_stage_1[72790024141]
print(sample_df['DailyWindDirection'].head())

0    5.585054
1    6.283185
2    0.000000
3    6.283185
4    6.283185
Name: DailyWindDirection, dtype: float64


In [37]:
# Check for null values in the 'DailyPresentWeatherType' column for each dataframe
null_values = {key: df['DailyPresentWeatherType'].isnull().sum() for key, df in ws_trim_stage_1.items()}

# Display the results
display(null_values)

{72790024141: 0,
 72785524114: 0,
 72789094197: 0,
 72793024233: 0,
 72785794129: 0,
 72220804224: 0,
 72793494248: 0,
 72788594266: 0,
 72797624217: 0,
 72074924255: 0,
 72785024157: 0,
 72797094240: 0,
 72027294282: 0,
 72798594276: 0,
 72792424223: 0,
 74207124201: 0,
 72792894263: 1,
 72781024243: 0,
 72784624160: 0,
 72781524237: 0,
 72788324220: 0,
 72698824219: 7,
 72793894274: 0,
 74206024207: 0,
 72782724110: 0,
 72793724222: 0,
 72792594227: 0,
 72782594239: 0,
 72794504205: 0,
 72792394225: 0,
 72784524163: 0,
 72792024227: 0,
 72785694176: 0,
 72025400119: 0}

In [38]:
# Create a copy of ws_trim_stage_1
ws_trim_stage_2 = {key: df.copy() for key, df in ws_trim_stage_1.items()}

# Trim the number of entries to 658
for key in ws_trim_stage_2:
    ws_trim_stage_2[key] = ws_trim_stage_2[key].iloc[:658]

# Verify the trimming
sample_df = ws_trim_stage_2[72790024141]
print(sample_df.shape)

# Check for null values in each column for each dataframe in the trimmed dataframes
null_values_trimmed = {key: df.isnull().sum() for key, df in ws_trim_stage_2.items()}

# Display the results
for key, nulls in null_values_trimmed.items():
    print(f"Dataset {key}:")
    print(nulls)
    print()

(658, 16)
Dataset 72790024141:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
Dail

In [39]:
# Create a copy of ws_trim_stage_2
ws_trim_stage_3 = {key: df.copy() for key, df in ws_trim_stage_2.items()}

# Drop dataframes that contain one or more columns with all null values
ws_trim_stage_3 = {key: df for key, df in ws_trim_stage_3.items() if not df.isnull().all(axis=0).any()}

# Verify the result
# Dictionary to store the total null elements for each dataframe
null_elements_per_df = {}

for key, df in ws_trim_stage_3.items():
    print(f"Dataset {key}:", df.isnull().sum().sum())
    print(df.isnull().sum())
    print()


Dataset 72790024141: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
DailyWindS

In [40]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_3.items():
    # Filter rows with any null values
    null_entries = df[df.isnull().any(axis=1)]
    
    # Display the dataframe key and the null entries
    print(f"Dataset {key}:")
    display(null_entries)
    print()

Dataset 72790024141:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72785524114:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72789094197:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72793024233:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72785794129:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72793494248:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72788594266:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72797624217:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72074924255:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72785024157:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72797094240:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72798594276:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72792424223:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 74207124201:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72792894263:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72781024243:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72784624160:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72781524237:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed
596,,32.0,71.0,0.02,FG:2 |FG |,,,17.0,,CLR:00,,10.00,*,0.0,18.000000,3.333333
597,,32.0,71.0,0.02,FG:2 |FG |,,,17.0,,CLR:00,,10.00,*,0.0,18.000000,3.333333
598,,32.0,71.0,0.02,FG:2 |FG |,,,17.0,,CLR:00,,10.00,*,0.0,18.000000,3.333333
599,,32.0,71.0,0.02,FG:2 |FG |,,,17.0,,CLR:00,,10.00,*,0.0,18.000000,3.333333
600,,32.0,71.0,0.02,FG:2 |FG |,,,17.0,,CLR:00,,10.00,*,0.0,18.000000,3.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,,50.0,50.0,0.01,-SN:03 BR:1 |SN |,,,100.0,,OVC:08 2,,10.00,*,0.0,26.428571,1.355556
654,,34.0,34.0,0.01,-SN:03 BR:1 |SN |,,,100.0,,OVC:08 3,,10.00,*,-1.0,26.428571,4.423729
655,,33.0,33.0,0.01,BR:1 ||,,,100.0,,VV:09 2,,0.50,*,-1.0,26.428571,4.232558
656,,35.0,35.0,0.01,-RA:02 BR:1 |RA |RA,,,100.0,,OVC:08 4,,0.50,*,0.0,26.428571,3.531915



Dataset 72788324220:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72698824219:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72793894274:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 74206024207:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72782724110:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72793724222:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72792594227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72782594239:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72794504205:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72792394225:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72784524163:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72792024227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed



Dataset 72785694176:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyVisibility,DailyWetBulbTemperature,DailyWindDirection,DailyWindGustSpeed,DailyWindSpeed





In [41]:
# Create a copy of ws_trim_stage_3
ws_trim_stage_4 = {key: df.copy() for key, df in ws_trim_stage_3.items()}

# Remove any dataframe that contains null values
ws_trim_stage_4 = {key: df for key, df in ws_trim_stage_4.items() if not df.isnull().values.any()}

# Verify the result
for key, df in ws_trim_stage_4.items():
    print(f"Dataset {key} has no null values.")
    print(df.isnull().sum())
    print()

Dataset 72790024141 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyW

In [42]:
num_dataframes = len(ws_trim_stage_4)
print(f"There are {num_dataframes} dataframes in the dataset.")

total_entries = sum(len(df) for df in ws_trim_stage_4.values())
print(f"There are {total_entries} total entries in the dataset.")

There are 30 dataframes in the dataset.
There are 19740 total entries in the dataset.


In [43]:
# Create a copy of ws_trim_stage_4
ws_trim_stage_5 = {key: df.copy() for key, df in ws_trim_stage_4.items()}

# Split the 'DailyPresentWeatherType' column into three separate columns
for key, df in ws_trim_stage_5.items():
    df[['AU', 'AW', 'MW']] = df['DailyPresentWeatherType'].str.split('|', expand=True)

# Verify the result
sample_df = ws_trim_stage_5[72790024141]
print(sample_df[['DailyPresentWeatherType', 'AU', 'AW', 'MW']].head())

  DailyPresentWeatherType            AU   AW MW
0       -SN:03 BR:1 |SN |  -SN:03 BR:1   SN    
1       -SN:03 BR:1 |SN |  -SN:03 BR:1   SN    
2                 BR:1 ||         BR:1         
3            -SN:03 |SN |       -SN:03   SN    
4                 BR:1 ||         BR:1         


In [44]:
# Dictionary to store the unique value counts for each dataset
unique_counts = {}

# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_5.items():
    unique_counts[key] = {
        'AU': df['AU'].nunique(),
        'AW': df['AW'].nunique(),
        'MW': df['MW'].nunique()
    }

# Display the unique counts
for key, counts in unique_counts.items():
    print(f"Dataset {key}:")
    print(f"AU: {counts['AU']}, AW: {counts['AW']}, MW: {counts['MW']}")
    print()



Dataset 72790024141:
AU: 15, AW: 8, MW: 4

Dataset 72785524114:
AU: 22, AW: 13, MW: 7

Dataset 72789094197:
AU: 17, AW: 7, MW: 4

Dataset 72793024233:
AU: 18, AW: 10, MW: 7

Dataset 72785794129:
AU: 16, AW: 7, MW: 3

Dataset 72793494248:
AU: 12, AW: 7, MW: 5

Dataset 72788594266:
AU: 9, AW: 5, MW: 3

Dataset 72797624217:
AU: 12, AW: 6, MW: 4

Dataset 72074924255:
AU: 18, AW: 10, MW: 7

Dataset 72785024157:
AU: 28, AW: 19, MW: 10

Dataset 72797094240:
AU: 9, AW: 5, MW: 3

Dataset 72798594276:
AU: 10, AW: 6, MW: 3

Dataset 72792424223:
AU: 12, AW: 6, MW: 3

Dataset 74207124201:
AU: 16, AW: 8, MW: 5

Dataset 72792894263:
AU: 13, AW: 5, MW: 3

Dataset 72781024243:
AU: 15, AW: 8, MW: 5

Dataset 72784624160:
AU: 18, AW: 8, MW: 6

Dataset 72788324220:
AU: 13, AW: 7, MW: 3

Dataset 72698824219:
AU: 17, AW: 8, MW: 3

Dataset 72793894274:
AU: 12, AW: 6, MW: 3

Dataset 74206024207:
AU: 15, AW: 11, MW: 5

Dataset 72782724110:
AU: 17, AW: 9, MW: 5

Dataset 72793724222:
AU: 16, AW: 7, MW: 6

Dataset

In [45]:
# Create a copy of ws_trim_stage_5
ws_trim_stage_6 = {key: df.copy() for key, df in ws_trim_stage_5.items()}

# Function to perform one-hot encoding
def one_hot_encode(df, column):
    # Split the content by empty space
    split_data = df[column].str.split(expand=True)
    # Perform one-hot encoding
    one_hot_encoded = pd.get_dummies(split_data, prefix=column)
    # Drop the original column and concatenate the one-hot encoded columns
    df = df.drop(columns=[column])
    df = pd.concat([df, one_hot_encoded], axis=1)
    return df

# Apply the one-hot encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_6.items():
    df = one_hot_encode(df, 'AU')
    df = one_hot_encode(df, 'AW')
    df = one_hot_encode(df, 'MW')
    ws_trim_stage_6[key] = df

# Verify the result
sample_df = ws_trim_stage_6[72790024141]
print(sample_df.head())

   DailyAltimeterSetting  DailyDewPointTemperature  DailyDryBulbTemperature  \
0              30.094848                      26.0                28.545455   
1              29.956389                      26.0                30.166667   
2              29.946216                      29.0                29.918919   
3              29.991429                      27.0                33.250000   
4              29.757500                      28.0                36.333333   

  DailyPrecipitation DailyPresentWeatherType  DailyPressureChange  \
0                  0       -SN:03 BR:1 |SN |             -0.02125   
1                  0       -SN:03 BR:1 |SN |              0.02875   
2                  0                 BR:1 ||             -0.02125   
3                  0            -SN:03 |SN |              0.02125   
4                  0                 BR:1 ||              0.00250   

   DailyPressureTendency  DailyRelativeHumidity  DailySeaLevelPressure  \
0                  4.250            

In [46]:
# Collect all unique column names from all dataframes
all_columns = set()
for df in ws_trim_stage_6.values():
    all_columns.update(df.columns)

# Create a new dictionary to store the updated dataframes
ws_trim_stage_7 = {}

# Ensure each dataframe has all the collected columns, filling non-existent columns with False
for key, df in ws_trim_stage_6.items():
    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    # Reindex the dataframe to have all columns, filling missing columns with False
    df = df.reindex(columns=all_columns, fill_value=False)
    # Drop the 'DailyPresentWeatherType' column
    df = df.drop(columns=['DailyPresentWeatherType'])
    df = df.drop(columns=['DailySkyConditions'])
    # Move all columns that start with 'Daily' to the front
    daily_columns = [col for col in df.columns if col.startswith('Daily')]
    other_columns = [col for col in df.columns if not col.startswith('Daily')]
    df = df[daily_columns + other_columns]
    # Store the updated dataframe in the new dictionary
    ws_trim_stage_7[key] = df


# Verify the result
sample_df = ws_trim_stage_7[72790024141]
print(sample_df.head())

   DailyPressureTendency  DailyVisibility  DailyWindSpeed  \
0                  4.250             10.0             5.0   
1                  6.875             10.0             6.0   
2                  3.375             10.0             0.0   
3                  5.875             10.0            13.0   
4                  4.625             10.0            10.0   

   DailyWetBulbTemperature  DailyDryBulbTemperature  DailyStationPressure  \
0                27.515152                28.545455             28.750000   
1                28.611111                30.166667             28.616389   
2                28.918919                29.918919             28.606486   
3                31.000000                33.250000             28.651071   
4                33.541667                36.333333             28.426250   

   DailyAltimeterSetting  DailyWindDirection  DailyRelativeHumidity  \
0              30.094848            5.585054              87.757576   
1              29.956389    

In [47]:
# Iterate through each dataframe in the dictionary and print the number of columns
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key} has {df.shape[1]} columns.")

Dataset 72790024141 has 64 columns.
Dataset 72785524114 has 64 columns.
Dataset 72789094197 has 64 columns.
Dataset 72793024233 has 64 columns.
Dataset 72785794129 has 64 columns.
Dataset 72793494248 has 64 columns.
Dataset 72788594266 has 64 columns.
Dataset 72797624217 has 64 columns.
Dataset 72074924255 has 64 columns.
Dataset 72785024157 has 64 columns.
Dataset 72797094240 has 64 columns.
Dataset 72798594276 has 64 columns.
Dataset 72792424223 has 64 columns.
Dataset 74207124201 has 64 columns.
Dataset 72792894263 has 64 columns.
Dataset 72781024243 has 64 columns.
Dataset 72784624160 has 64 columns.
Dataset 72788324220 has 64 columns.
Dataset 72698824219 has 64 columns.
Dataset 72793894274 has 64 columns.
Dataset 74206024207 has 64 columns.
Dataset 72782724110 has 64 columns.
Dataset 72793724222 has 64 columns.
Dataset 72792594227 has 64 columns.
Dataset 72782594239 has 64 columns.
Dataset 72794504205 has 64 columns.
Dataset 72792394225 has 64 columns.
Dataset 72784524163 has 64 c

In [51]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key}:")
    
    # Count the number of null values in each dataframe
    null_count = df.isnull().sum().sum()
    print(f"Total null values: {null_count}")
    
    # Check if the dataframe contains a column whose type is not int, float, or bool
    valid_types = [np.dtype('int64'), np.dtype('float64'), np.dtype('bool')]
    invalid_types = df.dtypes[~df.dtypes.isin(valid_types)]
    if not invalid_types.empty:
        print("Columns with invalid types:")
        print(invalid_types)
    else:
        print("All columns have valid types (int, float, bool).")
    print()


Dataset 72790024141:
Total null values: 0
Columns with invalid types:
DailyPrecipitation    object
dtype: object

Dataset 72785524114:
Total null values: 0
Columns with invalid types:
DailyVisibility       object
DailyPrecipitation    object
dtype: object

Dataset 72789094197:
Total null values: 0
Columns with invalid types:
DailyPrecipitation    object
dtype: object

Dataset 72793024233:
Total null values: 0
Columns with invalid types:
DailyPrecipitation    object
dtype: object

Dataset 72785794129:
Total null values: 0
Columns with invalid types:
DailyPrecipitation    object
dtype: object

Dataset 72793494248:
Total null values: 0
Columns with invalid types:
DailyWetBulbTemperature     object
DailyDryBulbTemperature     object
DailyRelativeHumidity       object
DailyPrecipitation          object
DailyDewPointTemperature    object
dtype: object

Dataset 72788594266:
Total null values: 0
Columns with invalid types:
DailyPrecipitation    object
dtype: object

Dataset 72797624217:
Total 

In [None]:
def display_invalid_entries(dataframes):
    for key, df in dataframes.items():
        print(f"Dataset {key}:")
        
        # Select columns that are not of type int64, float64, or bool
        invalid_columns = df.select_dtypes(exclude=['int64', 'float64', 'bool']).columns
        
        # Display the entries with invalid types
        if not invalid_columns.empty:
            invalid_entries = df[invalid_columns]
            display(invalid_entries)
        else:
            print("All columns have valid types (int64, float64, bool).")
        print()

    # Call the function with ws_trim_stage_7
    display_invalid_entries(ws_trim_stage_7)

Dataset 72790024141:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72785524114:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,10,T
1,10,T
2,10,T
3,10,T
4,10,T
...,...,...
653,10.00,T
654,10.00,T
655,10.00,T
656,10.00,T



Dataset 72789094197:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72793024233:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72785794129:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72793494248:


Unnamed: 0,DailyWetBulbTemperature,DailyDryBulbTemperature,DailyRelativeHumidity,DailyPrecipitation,DailyDewPointTemperature
0,44.0,45.0,93.0,0,42.0
1,35.0,35.0,96.0,0,34.0
2,37.0,41.0,73.0,0,33.0
3,42.0,52.0,43.0,0,29.0
4,44.0,54.0,42.0,0,31.0
...,...,...,...,...,...
653,56,60,72,0.00,50
654,52,54,87,0.00,51
655,46,49,83,0.00,42
656,49,52,93,0.00,45



Dataset 72788594266:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,T



Dataset 72797624217:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72074924255:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,10,T
1,10,T
2,10,T
3,10,T
4,10,T
...,...,...
653,10.00,T
654,10.00,T
655,10.00,T
656,10.00,T



Dataset 72785024157:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,10,0
1,10,0
2,10,0
3,10,0
4,10,0
...,...,...
653,10.0,0.00
654,10.0,0.00
655,10.0,0.00
656,10.0,0.00



Dataset 72797094240:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0.01
3,0
4,T
...,...
653,0.00
654,0.00
655,0.00
656,0.02



Dataset 72798594276:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72792424223:


Unnamed: 0,DailyPrecipitation
0,T
1,T
2,T
3,T
4,T
...,...
653,0.01
654,0.02
655,0.01
656,T



Dataset 74207124201:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,10,T
1,10,T
2,10,T
3,10,T
4,10,0.01
...,...,...
653,10.00,T
654,10.00,T
655,10.00,0.02
656,10.00,0.01



Dataset 72792894263:


Unnamed: 0,DailyPrecipitation
0,0.01
1,0.01
2,0.01
3,0.01
4,0.01
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72781024243:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,T
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72784624160:


Unnamed: 0,DailyWetBulbTemperature,DailyDryBulbTemperature,DailyRelativeHumidity,DailyPrecipitation,DailyDewPointTemperature
0,31,31,96,0,30
1,29,30,96,0,28
2,29,30,92,0,28
3,29,29,96,0,28
4,31,31,96,0,30
...,...,...,...,...,...
653,59,71,58,0.00,49
654,53,55,87,0.00,51
655,43,48,68,0.00,38
656,45,43,47,0.00,33



Dataset 72788324220:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,0.25s,0
1,3,0
2,3,0
3,10,0
4,10,0
...,...,...
653,10.00,0.00
654,10.00,T
655,10.00,T
656,10.00,0.00



Dataset 72698824219:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72793894274:


Unnamed: 0,DailyWetBulbTemperature,DailyDryBulbTemperature,DailyPrecipitation
0,42.0,43,0
1,35.0,36,0
2,37.0,38,0
3,41.0,38,0
4,48.0,49,0
...,...,...,...
653,53,54,T
654,53,54,0.00
655,46,48,0.00
656,50,54,0.00



Dataset 74206024207:


Unnamed: 0,DailyVisibility,DailyPrecipitation
0,10,T
1,10,T
2,10,T
3,10,T
4,10,0.01
...,...,...
653,10.00,0.01
654,10.00,T
655,10.00,T
656,10.00,T



Dataset 72782724110:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72793724222:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,T
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72792594227:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0.01
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72782594239:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,T
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72794504205:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,0
3,0
4,0
...,...
653,0.00
654,0.01
655,0.00
656,T



Dataset 72792394225:


Unnamed: 0,DailyDryBulbTemperature,DailyStationPressure,DailyAltimeterSetting,DailyPrecipitation,DailyDewPointTemperature
0,41.0,30.08,30.1,0,37.0
1,51.0,29.61,29.63,0,36.0
2,51.0,29.75,29.77,0,39.0
3,44.0,29.87,29.89,0,41.0
4,51.0,29.34,29.36,0,46.0
...,...,...,...,...,...
653,54.0,30.04,30.06,0.00,54.0
654,55.0,29.98,30.0,0.00,47.0
655,46.0,29.9,29.92,0.00,45.0
656,51.0,30.11,30.13,0.02,48.0



Dataset 72784524163:


Unnamed: 0,DailyWindGustSpeed,DailyPrecipitation
0,24,0
1,24,0
2,24,0
3,24,0
4,24,0
...,...,...
653,26.0,0.00
654,26.0,0.00
655,16.0,0.00
656,23.0,0.00



Dataset 72792024227:


Unnamed: 0,DailyPrecipitation
0,0
1,0
2,T
3,0
4,0
...,...
653,0.00
654,0.00
655,0.00
656,0.00



Dataset 72785694176:


Unnamed: 0,DailyWindGustSpeed,DailyPrecipitation
0,20,0
1,20,0
2,20,0
3,20,0
4,20,0
...,...,...
653,20.0,0.00
654,20.0,0.00
655,20.0,0.00
656,20.0,0.00





In [54]:
# Create a copy of ws_trim_stage_7
ws_trim_stage_8 = {key: df.copy() for key, df in ws_trim_stage_7.items()}

# Function to encode 'DailyPrecipitation' values
def encode_precipitation(value):
    if value == 'T':
        return 0.05
    try:
        return float(value)
    except ValueError:
        return np.nan

# Apply the encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_8.items():
    df['DailyPrecipitation'] = df['DailyPrecipitation'].apply(encode_precipitation).astype('float64')

# Verify the result
sample_df = ws_trim_stage_8[72790024141]
print(sample_df['DailyPrecipitation'].dtype)

float64
