In [111]:
import os
import pandas as pd

# Directory containing the CSV files
directory = 'draft-final-data-2'

# Dictionary to store the DataFrames
weather_stations = {}

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        
        # Store the DataFrame in the dictionary
        weather_stations[int(filename[:-4])] = df

# Display the keys of the dictionary to verify
print(weather_stations.keys())

dict_keys([72790024141, 72785524114, 72789094197, 72793024233, 72785794129, 72788594266, 72797624217, 72785024157, 72797094240, 72798594276, 72792424223, 72792894263, 72781024243, 72781524237, 72788324220, 72698824219, 72793894274, 74206024207, 72782724110, 72793724222, 72792594227, 72782594239, 72794504205, 72792394225, 72784524163, 72792024227, 72785694176])


In [112]:
# Pick one dataframe from the dictionary
sample_df = weather_stations[72790024141]

# Show the column names and their data types
print(sample_df.dtypes)

DATE                         object
STATION                       int64
DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation          float64
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindSpeed              float64
dtype: object


In [113]:
# Create a copy of the dictionary
weather_stations_trim = {key: df.copy() for key, df in weather_stations.items()}

# Drop the specified columns from the copied dictionary
for key in weather_stations_trim:
    weather_stations_trim[key] = weather_stations_trim[key].drop(columns=['STATION', 'DATE'])

# Verify the columns have been dropped
sample_df = weather_stations_trim[72790024141]
print(sample_df.dtypes)

DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation          float64
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindSpeed              float64
dtype: object


In [114]:
# Initialize sets to store unique values
unique_precipitation = set()
unique_weather_type = set()
unique_sky_conditions = set()
unique_wind_direction = set()

# Iterate through each dataframe in the dictionary
for key, df in weather_stations_trim.items():
    unique_precipitation.update(df['DailyPrecipitation'].unique())
    unique_weather_type.update(df['DailyPresentWeatherType'].unique())
    unique_sky_conditions.update(df['DailySkyConditions'].unique())
    unique_wind_direction.update(df['DailyWindDirection'].unique())

# Convert sets to lists for better readability
unique_precipitation = list(unique_precipitation)
unique_weather_type = list(unique_weather_type)
unique_sky_conditions = list(unique_sky_conditions)
unique_wind_direction = list(unique_wind_direction)

# Display the unique values
print("Unique DailyPrecipitation:", unique_precipitation)
print("Unique DailyPresentWeatherType:", unique_weather_type)
print("Unique DailySkyConditions:", unique_sky_conditions)
print("Unique DailyWindDirection:", unique_wind_direction)

Unique DailyPrecipitation: [0.0, 0.015625, 0.03125, 0.00390625, 0.0378947368421052, 0.0094736842105263, 0.0189473684210526, 0.0086666666666666, 0.0625, 0.0117647058823529, 0.0093103448275862, 0.0235483870967741, 0.0201190476190476, 0.0039583333333333, 0.0282499999999999, 0.1133333333333333, 0.0195833333333333, 0.0035185185185185, 0.0134821428571428, 0.0255813953488371, 0.0202702702702702, 0.014875, 0.0391666666666666, 0.0305, 0.0157317073170731, 0.0079166666666666, 0.0358974358974358, 0.0285714285714285, 0.0010869565217391, 0.0179761904761904, 0.0144594594594594, 0.0073809523809523, 0.0187037037037037, 0.0352272727272727, 0.0366, 0.0204651162790697, 0.0086486486486486, 0.0188333333333333, 0.0118749999999999, 0.0307692307692307, 0.0583898305084745, 0.0180263157894736, 0.0313636363636363, 0.0391935483870967, 0.015735294117647, 0.007037037037037, 0.0357446808510638, 0.0331578947368421, 0.0274242424242424, 0.0521052631578947, 0.0222222222222222, 0.0071666666666666, 0.0007471264367816, 0.00

In [115]:
import numpy as np

def convert_to_radians(value):
    try:
        # Convert to int
        value_int = int(value)
        # Convert degrees to radians
        return np.deg2rad(value_int)
    except (ValueError, TypeError):
        # If conversion fails, return -1
        return -1

# Create a copy of the dictionary
ws_trim_stage_1 = {key: df.copy() for key, df in weather_stations_trim.items()}

# Apply the conversion to each dataframe in the copied dictionary
for key, df in ws_trim_stage_1.items():
    df['DailyWindDirection'] = df['DailyWindDirection'].apply(convert_to_radians)

# Verify the conversion
sample_df = ws_trim_stage_1[72790024141]
print(sample_df['DailyWindDirection'].head())

0    5.585054
1    6.283185
2    0.000000
3    6.283185
4    6.283185
Name: DailyWindDirection, dtype: float64


In [116]:
# Check for null values in the 'DailyPresentWeatherType' column for each dataframe
null_values = {key: df['DailyPresentWeatherType'].isnull().sum() for key, df in ws_trim_stage_1.items()}

# Display the results
display(null_values)

{72790024141: 0,
 72785524114: 0,
 72789094197: 0,
 72793024233: 0,
 72785794129: 0,
 72788594266: 0,
 72797624217: 0,
 72785024157: 0,
 72797094240: 0,
 72798594276: 0,
 72792424223: 0,
 72792894263: 0,
 72781024243: 0,
 72781524237: 0,
 72788324220: 0,
 72698824219: 0,
 72793894274: 0,
 74206024207: 0,
 72782724110: 0,
 72793724222: 0,
 72792594227: 0,
 72782594239: 0,
 72794504205: 0,
 72792394225: 0,
 72784524163: 0,
 72792024227: 0,
 72785694176: 0}

In [117]:
# Create a copy of ws_trim_stage_1
ws_trim_stage_2 = {key: df.copy() for key, df in ws_trim_stage_1.items()}

# Trim the number of entries to 658
for key in ws_trim_stage_2:
    ws_trim_stage_2[key] = ws_trim_stage_2[key].iloc[:658]

# Verify the trimming
sample_df = ws_trim_stage_2[72790024141]
print(sample_df.shape)

# Check for null values in each column for each dataframe in the trimmed dataframes
null_values_trimmed = {key: df.isnull().sum() for key, df in ws_trim_stage_2.items()}

# Display the results
for key, nulls in null_values_trimmed.items():
    print(f"Dataset {key}:")
    print(nulls)
    print()

(658, 14)
Dataset 72790024141:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197:
DailyAltimeterSetting       0
DailyDewPointTemperature    0

In [118]:
# Create a copy of ws_trim_stage_2
ws_trim_stage_3 = {key: df.copy() for key, df in ws_trim_stage_2.items()}

# Drop dataframes that contain one or more columns with all null values
ws_trim_stage_3 = {key: df for key, df in ws_trim_stage_3.items() if not df.isnull().all(axis=0).any()}

# Verify the result
# Dictionary to store the total null elements for each dataframe
null_elements_per_df = {}

for key, df in ws_trim_stage_3.items():
    print(f"Dataset {key}:", df.isnull().sum().sum())
    print(df.isnull().sum())
    print()


Dataset 72790024141: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
Dai

In [119]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_3.items():
    # Filter rows with any null values
    null_entries = df[df.isnull().any(axis=1)]
    
    # Display the dataframe key and the null entries
    print(f"Dataset {key}:")
    display(null_entries)
    print()

Dataset 72790024141:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785524114:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72789094197:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793024233:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785794129:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72788594266:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72797624217:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785024157:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72797094240:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72798594276:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792424223:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792894263:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72781024243:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72781524237:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72788324220:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72698824219:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793894274:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 74206024207:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72782724110:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793724222:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792594227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72782594239:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72794504205:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792394225:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72784524163:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792024227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785694176:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed





In [120]:
# Create a copy of ws_trim_stage_3
ws_trim_stage_4 = {key: df.copy() for key, df in ws_trim_stage_3.items()}

# Remove any dataframe that contains null values
ws_trim_stage_4 = {key: df for key, df in ws_trim_stage_4.items() if not df.isnull().values.any()}

# Verify the result
for key, df in ws_trim_stage_4.items():
    print(f"Dataset {key} has no null values.")
    print(df.isnull().sum())
    print()

Dataset 72790024141 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197 has no null values.
DailyAltimet

In [121]:
num_dataframes = len(ws_trim_stage_4)
print(f"There are {num_dataframes} dataframes in the dataset.")

total_entries = sum(len(df) for df in ws_trim_stage_4.values())
print(f"There are {total_entries} total entries in the dataset.")

There are 27 dataframes in the dataset.
There are 17766 total entries in the dataset.


In [122]:
# Create a copy of ws_trim_stage_4
ws_trim_stage_5 = {key: df.copy() for key, df in ws_trim_stage_4.items()}

# Split the 'DailyPresentWeatherType' column into three separate columns
for key, df in ws_trim_stage_5.items():
    df[['AU', 'AW', 'MW']] = df['DailyPresentWeatherType'].str.split('|', expand=True)

# Verify the result
sample_df = ws_trim_stage_5[72790024141]
print(sample_df[['DailyPresentWeatherType', 'AU', 'AW', 'MW']].head())

  DailyPresentWeatherType            AU   AW MW
0       -SN:03 BR:1 |SN |  -SN:03 BR:1   SN    
1       -SN:03 BR:1 |SN |  -SN:03 BR:1   SN    
2                 BR:1 ||         BR:1         
3            -SN:03 |SN |       -SN:03   SN    
4                 BR:1 ||         BR:1         


In [123]:
# Dictionary to store the unique value counts for each dataset
unique_counts = {}

# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_5.items():
    unique_counts[key] = {
        'AU': df['AU'].nunique(),
        'AW': df['AW'].nunique(),
        'MW': df['MW'].nunique()
    }

# Display the unique counts
for key, counts in unique_counts.items():
    print(f"Dataset {key}:")
    print(f"AU: {counts['AU']}, AW: {counts['AW']}, MW: {counts['MW']}")
    print()



Dataset 72790024141:
AU: 15, AW: 8, MW: 4

Dataset 72785524114:
AU: 22, AW: 13, MW: 7

Dataset 72789094197:
AU: 17, AW: 7, MW: 4

Dataset 72793024233:
AU: 18, AW: 10, MW: 7

Dataset 72785794129:
AU: 16, AW: 7, MW: 3

Dataset 72788594266:
AU: 9, AW: 5, MW: 3

Dataset 72797624217:
AU: 12, AW: 6, MW: 4

Dataset 72785024157:
AU: 28, AW: 19, MW: 10

Dataset 72797094240:
AU: 9, AW: 5, MW: 3

Dataset 72798594276:
AU: 10, AW: 6, MW: 3

Dataset 72792424223:
AU: 12, AW: 6, MW: 3

Dataset 72792894263:
AU: 13, AW: 5, MW: 3

Dataset 72781024243:
AU: 15, AW: 8, MW: 5

Dataset 72781524237:
AU: 21, AW: 10, MW: 4

Dataset 72788324220:
AU: 13, AW: 7, MW: 3

Dataset 72698824219:
AU: 17, AW: 8, MW: 3

Dataset 72793894274:
AU: 12, AW: 6, MW: 3

Dataset 74206024207:
AU: 15, AW: 11, MW: 5

Dataset 72782724110:
AU: 17, AW: 9, MW: 5

Dataset 72793724222:
AU: 16, AW: 7, MW: 6

Dataset 72792594227:
AU: 13, AW: 5, MW: 3

Dataset 72782594239:
AU: 14, AW: 7, MW: 4

Dataset 72794504205:
AU: 11, AW: 5, MW: 3

Dataset

In [124]:
# Create a copy of ws_trim_stage_5
ws_trim_stage_6 = {key: df.copy() for key, df in ws_trim_stage_5.items()}

# Function to perform one-hot encoding
def one_hot_encode(df, column):
    # Split the content by empty space
    split_data = df[column].str.split(expand=True)
    # Perform one-hot encoding
    one_hot_encoded = pd.get_dummies(split_data, prefix=column)
    # Drop the original column and concatenate the one-hot encoded columns
    df = df.drop(columns=[column])
    df = pd.concat([df, one_hot_encoded], axis=1)
    return df

# Apply the one-hot encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_6.items():
    df = one_hot_encode(df, 'AU')
    df = one_hot_encode(df, 'AW')
    df = one_hot_encode(df, 'MW')
    ws_trim_stage_6[key] = df

# Verify the result
sample_df = ws_trim_stage_6[72790024141]
print(sample_df.head())

   DailyAltimeterSetting  DailyDewPointTemperature  DailyDryBulbTemperature  \
0              30.094848                 25.363636                28.545455   
1              29.956389                 25.527778                30.166667   
2              29.946216                 27.189189                29.918919   
3              29.991429                 27.392857                33.250000   
4              29.757500                 29.125000                36.333333   

   DailyPrecipitation DailyPresentWeatherType  DailyPressureChange  \
0            0.000000       -SN:03 BR:1 |SN |             -0.02125   
1            0.001111       -SN:03 BR:1 |SN |              0.02875   
2            0.000000                 BR:1 ||             -0.02125   
3            0.000962            -SN:03 |SN |              0.02125   
4            0.000000                 BR:1 ||              0.00250   

   DailyPressureTendency  DailyRelativeHumidity  DailySeaLevelPressure  \
0                  4.250      

In [125]:
# Collect all unique column names from all dataframes
all_columns = set()
for df in ws_trim_stage_6.values():
    all_columns.update(df.columns)

# Create a new dictionary to store the updated dataframes
ws_trim_stage_7 = {}

# Ensure each dataframe has all the collected columns, filling non-existent columns with False
for key, df in ws_trim_stage_6.items():
    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    # Reindex the dataframe to have all columns, filling missing columns with False
    df = df.reindex(columns=all_columns, fill_value=False)
    # Drop the 'DailyPresentWeatherType' column
    df = df.drop(columns=['DailyPresentWeatherType'])
    df = df.drop(columns=['DailySkyConditions'])
    # Move all columns that start with 'Daily' to the front
    daily_columns = [col for col in df.columns if col.startswith('Daily')]
    other_columns = [col for col in df.columns if not col.startswith('Daily')]
    df = df[daily_columns + other_columns]
    # Store the updated dataframe in the new dictionary
    ws_trim_stage_7[key] = df


# Verify the result
sample_df = ws_trim_stage_7[72790024141]
print(sample_df.head())

   DailyWindDirection  DailyDewPointTemperature  DailyStationPressure  \
0            5.585054                 25.363636             28.750000   
1            6.283185                 25.527778             28.616389   
2            0.000000                 27.189189             28.606486   
3            6.283185                 27.392857             28.651071   
4            6.283185                 29.125000             28.426250   

   DailyWindSpeed  DailyPressureTendency  DailyAltimeterSetting  \
0        7.454545                  4.250              30.094848   
1        6.944444                  6.875              29.956389   
2        1.702703                  3.375              29.946216   
3       11.571429                  5.875              29.991429   
4       10.583333                  4.625              29.757500   

   DailyWetBulbTemperature  DailyDryBulbTemperature  DailySeaLevelPressure  \
0                27.515152                28.545455              30.141667   
1 

In [126]:
# Iterate through each dataframe in the dictionary and print the number of columns
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key} has {df.shape[1]} columns.")

Dataset 72790024141 has 59 columns.
Dataset 72785524114 has 59 columns.
Dataset 72789094197 has 59 columns.
Dataset 72793024233 has 59 columns.
Dataset 72785794129 has 59 columns.
Dataset 72788594266 has 59 columns.
Dataset 72797624217 has 59 columns.
Dataset 72785024157 has 59 columns.
Dataset 72797094240 has 59 columns.
Dataset 72798594276 has 59 columns.
Dataset 72792424223 has 59 columns.
Dataset 72792894263 has 59 columns.
Dataset 72781024243 has 59 columns.
Dataset 72781524237 has 59 columns.
Dataset 72788324220 has 59 columns.
Dataset 72698824219 has 59 columns.
Dataset 72793894274 has 59 columns.
Dataset 74206024207 has 59 columns.
Dataset 72782724110 has 59 columns.
Dataset 72793724222 has 59 columns.
Dataset 72792594227 has 59 columns.
Dataset 72782594239 has 59 columns.
Dataset 72794504205 has 59 columns.
Dataset 72792394225 has 59 columns.
Dataset 72784524163 has 59 columns.
Dataset 72792024227 has 59 columns.
Dataset 72785694176 has 59 columns.


In [127]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key}:")
    
    # Count the number of null values in each dataframe
    null_count = df.isnull().sum().sum()
    print(f"Total null values: {null_count}")
    
    # Check if the dataframe contains a column whose type is not int, float, or bool
    valid_types = [np.dtype('int64'), np.dtype('float64'), np.dtype('bool')]
    invalid_types = df.dtypes[~df.dtypes.isin(valid_types)]
    if not invalid_types.empty:
        print("Columns with invalid types:")
        print(invalid_types)
    else:
        print("All columns have valid types (int, float, bool).")
    print()


Dataset 72790024141:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785524114:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72789094197:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72793024233:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785794129:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72788594266:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72797624217:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785024157:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72797094240:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72798594276:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72792424223:
Total null values: 0
All columns have valid types (int, flo

In [128]:
def display_invalid_entries(dataframes):
    for key, df in dataframes.items():
        print(f"Dataset {key}:")
        
        # Select columns that are not of type int64, float64, or bool
        invalid_columns = df.select_dtypes(exclude=['int64', 'float64', 'bool']).columns
        
        # Display the entries with invalid types
        if not invalid_columns.empty:
            invalid_entries = df[invalid_columns]
            display(invalid_entries)
        else:
            print("All columns have valid types (int64, float64, bool).")
        print()

    # Call the function with ws_trim_stage_7
display_invalid_entries(ws_trim_stage_7)

Dataset 72790024141:
All columns have valid types (int64, float64, bool).

Dataset 72785524114:
All columns have valid types (int64, float64, bool).

Dataset 72789094197:
All columns have valid types (int64, float64, bool).

Dataset 72793024233:
All columns have valid types (int64, float64, bool).

Dataset 72785794129:
All columns have valid types (int64, float64, bool).

Dataset 72788594266:
All columns have valid types (int64, float64, bool).

Dataset 72797624217:
All columns have valid types (int64, float64, bool).

Dataset 72785024157:
All columns have valid types (int64, float64, bool).

Dataset 72797094240:
All columns have valid types (int64, float64, bool).

Dataset 72798594276:
All columns have valid types (int64, float64, bool).

Dataset 72792424223:
All columns have valid types (int64, float64, bool).

Dataset 72792894263:
All columns have valid types (int64, float64, bool).

Dataset 72781024243:
All columns have valid types (int64, float64, bool).

Dataset 72781524237:
All 

In [129]:
# Create a copy of ws_trim_stage_7
ws_trim_stage_8 = {key: df.copy() for key, df in ws_trim_stage_7.items()}

# Function to encode 'DailyPrecipitation' values
def encode_precipitation(value):
    if value == 'T':
        return 0.05
    try:
        return float(value)
    except ValueError:
        return np.nan

# Apply the encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_8.items():
    df['DailyPrecipitation'] = df['DailyPrecipitation'].apply(encode_precipitation).astype('float64')

# Verify the result
sample_df = ws_trim_stage_8[72790024141]
print(sample_df['DailyPrecipitation'].dtype)

float64


In [130]:
display_invalid_entries(ws_trim_stage_8)

Dataset 72790024141:
All columns have valid types (int64, float64, bool).

Dataset 72785524114:
All columns have valid types (int64, float64, bool).

Dataset 72789094197:
All columns have valid types (int64, float64, bool).

Dataset 72793024233:
All columns have valid types (int64, float64, bool).

Dataset 72785794129:
All columns have valid types (int64, float64, bool).

Dataset 72788594266:
All columns have valid types (int64, float64, bool).

Dataset 72797624217:
All columns have valid types (int64, float64, bool).

Dataset 72785024157:
All columns have valid types (int64, float64, bool).

Dataset 72797094240:
All columns have valid types (int64, float64, bool).

Dataset 72798594276:
All columns have valid types (int64, float64, bool).

Dataset 72792424223:
All columns have valid types (int64, float64, bool).

Dataset 72792894263:
All columns have valid types (int64, float64, bool).

Dataset 72781024243:
All columns have valid types (int64, float64, bool).

Dataset 72781524237:
All 

In [131]:
# Function to convert object columns to float64
def convert_object_to_float(df):
    for col in df.select_dtypes(include=['object']).columns:
        try:
            df[col] = df[col].astype('float64')
        except ValueError:
            print(f"Error converting column: {col}")
            print(f"Unique values in column {col}: {df[col].unique()}")
            # Print the values that cause the error
            for value in df[col].unique():
                try:
                    float(value)
                except ValueError:
                    print(f"Problematic value in column {col}: {value}")
            
    return df

# Apply the conversion to each dataframe in the dictionary
for key, df in ws_trim_stage_8.items():
    ws_trim_stage_8[key] = convert_object_to_float(df)

# Verify the result
sample_df = ws_trim_stage_8[72790024141]
print(sample_df.dtypes)

DailyWindDirection          float64
DailyDewPointTemperature    float64
DailyStationPressure        float64
DailyWindSpeed              float64
DailyPressureTendency       float64
DailyAltimeterSetting       float64
DailyWetBulbTemperature     float64
DailyDryBulbTemperature     float64
DailySeaLevelPressure       float64
DailyRelativeHumidity       float64
DailyPressureChange         float64
DailyPrecipitation          float64
AW_FG                          bool
AW_*                           bool
AU_FG:2                        bool
AU_FZ:8                        bool
AW_RA                          bool
AU_-DZ:01                      bool
AU_+RA:02                      bool
AU_VCFG:2                      bool
AU_+SN:03                      bool
AW_HAIL                        bool
AW_SQ                          bool
AU_-RA:02                      bool
MW_FG                          bool
AW_HZ                          bool
AU_FU:3                        bool
MW_SH                       

In [132]:
# Directory to save the processed CSV files
output_directory = 'processed-final-data'

# Create the directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Export each dataframe to a CSV file
for key, df in ws_trim_stage_8.items():
    output_filepath = os.path.join(output_directory, f'{key}.csv')
    df.to_csv(output_filepath, index=False)

print("Dataframes have been exported successfully.")

Dataframes have been exported successfully.
