In [1]:
import os
import pandas as pd

# Directory containing the CSV files
directory = 'draft-final-data-2'

# Dictionary to store the DataFrames
weather_stations = {}

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        
        # Store the DataFrame in the dictionary
        weather_stations[int(filename[:-4])] = df

# Display the keys of the dictionary to verify
print(weather_stations.keys())

dict_keys([72790024141, 72785524114, 72789094197, 72793024233, 72785794129, 72788594266, 72797624217, 72785024157, 72797094240, 72798594276, 72792424223, 72792894263, 72781024243, 72781524237, 72788324220, 72698824219, 72793894274, 74206024207, 72782724110, 72793724222, 72792594227, 72782594239, 72794504205, 72792394225, 72784524163, 72792024227, 72785694176])


In [2]:
# Pick one dataframe from the dictionary
sample_df = weather_stations[72790024141]

# Show the column names and their data types
print(sample_df.dtypes)

DATE                         object
STATION                       int64
DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation          float64
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindSpeed              float64
dtype: object


In [3]:
# Create a copy of the dictionary
weather_stations_trim = {key: df.copy() for key, df in weather_stations.items()}

# Drop the specified columns from the copied dictionary
for key in weather_stations_trim:
    weather_stations_trim[key] = weather_stations_trim[key].drop(columns=['STATION', 'DATE'])

# Verify the columns have been dropped
sample_df = weather_stations_trim[72790024141]
print(sample_df.dtypes)

DailyAltimeterSetting       float64
DailyDewPointTemperature    float64
DailyDryBulbTemperature     float64
DailyPrecipitation          float64
DailyPresentWeatherType      object
DailyPressureChange         float64
DailyPressureTendency       float64
DailyRelativeHumidity       float64
DailySeaLevelPressure       float64
DailySkyConditions           object
DailyStationPressure        float64
DailyWetBulbTemperature     float64
DailyWindDirection           object
DailyWindSpeed              float64
dtype: object


In [4]:
# Initialize sets to store unique values
unique_precipitation = set()
unique_weather_type = set()
unique_sky_conditions = set()
unique_wind_direction = set()

# Iterate through each dataframe in the dictionary
for key, df in weather_stations_trim.items():
    unique_precipitation.update(df['DailyPrecipitation'].unique())
    unique_weather_type.update(df['DailyPresentWeatherType'].unique())
    unique_sky_conditions.update(df['DailySkyConditions'].unique())
    unique_wind_direction.update(df['DailyWindDirection'].unique())

# Convert sets to lists for better readability
unique_precipitation = list(unique_precipitation)
unique_weather_type = list(unique_weather_type)
unique_sky_conditions = list(unique_sky_conditions)
unique_wind_direction = list(unique_wind_direction)

# Display the unique values
print("Unique DailyPrecipitation:", unique_precipitation)
print("Unique DailyPresentWeatherType:", unique_weather_type)
print("Unique DailySkyConditions:", unique_sky_conditions)
print("Unique DailyWindDirection:", unique_wind_direction)

Unique DailyPrecipitation: [0.0, 0.015625, 0.03125, 0.01953125, 0.01171875, 0.0078125, 0.00390625, 0.0378947368421052, 0.078125, 0.0625, 0.0546875, 0.0201190476190476, 0.0282499999999999, 0.0469354838709677, 0.0271111111111111, 0.0001219512195121, 0.0035185185185185, 0.0134821428571428, 0.0255813953488371, 0.0157317073170731, 0.0372413793103448, 0.0179761904761904, 0.0144594594594594, 0.0204651162790697, 0.0001136363636363, 0.015735294117647, 0.0188333333333333, 0.0626136363636363, 0.0583898305084745, 0.0313636363636363, 0.0014492753623188, 0.0258695652173913, 0.007037037037037, 0.0205128205128205, 0.0157575757575757, 0.0021739130434782, 0.0119827586206896, 0.0107, 0.0105555555555555, 0.0121153846153846, 0.0376666666666666, 0.0001515151515151, 0.0063207547169811, 9.80392156862745e-05, 0.0322368421052631, 0.014074074074074, 0.0056756756756756, 0.0132653061224489, 0.0132894736842105, 0.0001612903225806, 0.0280434782608695, 0.0306976744186046, 0.0157954545454545, 0.0470454545454545, 0.028

In [5]:
import numpy as np

def convert_to_radians(value):
    try:
        # Convert to int
        value_int = int(value)
        # Convert degrees to radians
        return np.deg2rad(value_int)
    except (ValueError, TypeError):
        # If conversion fails, return -1
        return -1

# Create a copy of the dictionary
ws_trim_stage_1 = {key: df.copy() for key, df in weather_stations_trim.items()}

# Apply the conversion to each dataframe in the copied dictionary
for key, df in ws_trim_stage_1.items():
    df['DailyWindDirection'] = df['DailyWindDirection'].apply(convert_to_radians)

# Verify the conversion
sample_df = ws_trim_stage_1[72790024141]
print(sample_df['DailyWindDirection'].head())

0    4.188790
1    0.000000
2    0.000000
3    3.839724
4    0.174533
Name: DailyWindDirection, dtype: float64


In [6]:
# Check for null values in the 'DailyPresentWeatherType' column for each dataframe
null_values = {key: df['DailyPresentWeatherType'].isnull().sum() for key, df in ws_trim_stage_1.items()}

# Display the results
display(null_values)

{72790024141: 0,
 72785524114: 0,
 72789094197: 0,
 72793024233: 0,
 72785794129: 0,
 72788594266: 0,
 72797624217: 0,
 72785024157: 0,
 72797094240: 0,
 72798594276: 0,
 72792424223: 0,
 72792894263: 0,
 72781024243: 0,
 72781524237: 0,
 72788324220: 0,
 72698824219: 0,
 72793894274: 0,
 74206024207: 0,
 72782724110: 0,
 72793724222: 0,
 72792594227: 0,
 72782594239: 0,
 72794504205: 0,
 72792394225: 0,
 72784524163: 0,
 72792024227: 0,
 72785694176: 0}

In [7]:
# Create a copy of ws_trim_stage_1
ws_trim_stage_2 = {key: df.copy() for key, df in ws_trim_stage_1.items()}

# # Trim the number of entries to 658
# for key in ws_trim_stage_2:
#     ws_trim_stage_2[key] = ws_trim_stage_2[key].iloc[:658]

# # Verify the trimming
# sample_df = ws_trim_stage_2[72790024141]
# print(sample_df.shape)

# Check for null values in each column for each dataframe in the trimmed dataframes
null_values_trimmed = {key: df.isnull().sum() for key, df in ws_trim_stage_2.items()}

# Display the results
for key, nulls in null_values_trimmed.items():
    print(f"Dataset {key}:")
    print(nulls)
    print()

Dataset 72790024141:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197:
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryB

In [8]:
# Create a copy of ws_trim_stage_2
ws_trim_stage_3 = {key: df.copy() for key, df in ws_trim_stage_2.items()}

# Drop dataframes that contain one or more columns with all null values
ws_trim_stage_3 = {key: df for key, df in ws_trim_stage_3.items() if not df.isnull().all(axis=0).any()}

# Verify the result
# Dictionary to store the total null elements for each dataframe
null_elements_per_df = {}

for key, df in ws_trim_stage_3.items():
    print(f"Dataset {key}:", df.isnull().sum().sum())
    print(df.isnull().sum())
    print()


Dataset 72790024141: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197: 0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
Dai

In [9]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_3.items():
    # Filter rows with any null values
    null_entries = df[df.isnull().any(axis=1)]
    
    # Display the dataframe key and the null entries
    print(f"Dataset {key}:")
    display(null_entries)
    print()

Dataset 72790024141:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785524114:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72789094197:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793024233:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785794129:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72788594266:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72797624217:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785024157:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72797094240:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72798594276:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792424223:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792894263:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72781024243:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72781524237:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72788324220:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72698824219:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793894274:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 74206024207:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72782724110:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72793724222:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792594227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72782594239:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72794504205:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792394225:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72784524163:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72792024227:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed



Dataset 72785694176:


Unnamed: 0,DailyAltimeterSetting,DailyDewPointTemperature,DailyDryBulbTemperature,DailyPrecipitation,DailyPresentWeatherType,DailyPressureChange,DailyPressureTendency,DailyRelativeHumidity,DailySeaLevelPressure,DailySkyConditions,DailyStationPressure,DailyWetBulbTemperature,DailyWindDirection,DailyWindSpeed





In [10]:
# Create a copy of ws_trim_stage_3
ws_trim_stage_4 = {key: df.copy() for key, df in ws_trim_stage_3.items()}

# Remove any dataframe that contains null values
ws_trim_stage_4 = {key: df for key, df in ws_trim_stage_4.items() if not df.isnull().values.any()}

# Verify the result
for key, df in ws_trim_stage_4.items():
    print(f"Dataset {key} has no null values.")
    print(df.isnull().sum())
    print()

Dataset 72790024141 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72785524114 has no null values.
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPrecipitation          0
DailyPresentWeatherType     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailySkyConditions          0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindSpeed              0
dtype: int64

Dataset 72789094197 has no null values.
DailyAltimet

In [11]:
num_dataframes = len(ws_trim_stage_4)
print(f"There are {num_dataframes} dataframes in the dataset.")

total_entries = sum(len(df) for df in ws_trim_stage_4.values())
print(f"There are {total_entries} total entries in the dataset.")

There are 27 dataframes in the dataset.
There are 47574 total entries in the dataset.


In [12]:
# Create a copy of ws_trim_stage_4
ws_trim_stage_5 = {key: df.copy() for key, df in ws_trim_stage_4.items()}

# Split the 'DailyPresentWeatherType' column into three separate columns
for key, df in ws_trim_stage_5.items():
    df[['AU', 'AW', 'MW']] = df['DailyPresentWeatherType'].str.split('|', expand=True)

# Verify the result
sample_df = ws_trim_stage_5[72790024141]
print(sample_df[['DailyPresentWeatherType', 'AU', 'AW', 'MW']].head())

  DailyPresentWeatherType       AU   AW  MW
0          -RA:02 |RA |RA  -RA:02   RA   RA
1          -RA:02 |RA |RA  -RA:02   RA   RA
2                 BR:1 ||    BR:1          
3                UP:09 ||   UP:09          
4                UP:09 ||   UP:09          


In [13]:
# Dictionary to store the unique value counts for each dataset
unique_counts = {}

# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_5.items():
    unique_counts[key] = {
        'AU': df['AU'].nunique(),
        'AW': df['AW'].nunique(),
        'MW': df['MW'].nunique()
    }

# Display the unique counts
for key, counts in unique_counts.items():
    print(f"Dataset {key}:")
    print(f"AU: {counts['AU']}, AW: {counts['AW']}, MW: {counts['MW']}")
    print()



Dataset 72790024141:
AU: 19, AW: 10, MW: 4

Dataset 72785524114:
AU: 33, AW: 17, MW: 9

Dataset 72789094197:
AU: 22, AW: 8, MW: 4

Dataset 72793024233:
AU: 27, AW: 15, MW: 8

Dataset 72785794129:
AU: 20, AW: 9, MW: 3

Dataset 72788594266:
AU: 12, AW: 5, MW: 4

Dataset 72797624217:
AU: 25, AW: 12, MW: 8

Dataset 72785024157:
AU: 39, AW: 23, MW: 14

Dataset 72797094240:
AU: 15, AW: 5, MW: 3

Dataset 72798594276:
AU: 17, AW: 7, MW: 4

Dataset 72792424223:
AU: 16, AW: 6, MW: 3

Dataset 72792894263:
AU: 15, AW: 5, MW: 3

Dataset 72781024243:
AU: 21, AW: 9, MW: 5

Dataset 72781524237:
AU: 24, AW: 10, MW: 4

Dataset 72788324220:
AU: 18, AW: 8, MW: 4

Dataset 72698824219:
AU: 20, AW: 9, MW: 4

Dataset 72793894274:
AU: 21, AW: 8, MW: 6

Dataset 74206024207:
AU: 24, AW: 15, MW: 9

Dataset 72782724110:
AU: 30, AW: 12, MW: 7

Dataset 72793724222:
AU: 24, AW: 11, MW: 8

Dataset 72792594227:
AU: 17, AW: 6, MW: 3

Dataset 72782594239:
AU: 20, AW: 7, MW: 4

Dataset 72794504205:
AU: 15, AW: 5, MW: 3

D

In [14]:
# Create a copy of ws_trim_stage_5
ws_trim_stage_6 = {key: df.copy() for key, df in ws_trim_stage_5.items()}

# Function to perform one-hot encoding
def one_hot_encode(df, column):
    # Split the content by empty space
    split_data = df[column].str.split(expand=True)
    # Perform one-hot encoding
    one_hot_encoded = pd.get_dummies(split_data, prefix=column)
    # Drop the original column and concatenate the one-hot encoded columns
    df = df.drop(columns=[column])
    df = pd.concat([df, one_hot_encoded], axis=1)
    return df

# Apply the one-hot encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_6.items():
    df = one_hot_encode(df, 'AU')
    df = one_hot_encode(df, 'AW')
    df = one_hot_encode(df, 'MW')
    ws_trim_stage_6[key] = df

# Verify the result
sample_df = ws_trim_stage_6[72790024141]
print(sample_df.head())

   DailyAltimeterSetting  DailyDewPointTemperature  DailyDryBulbTemperature  \
0              29.657600                 34.560000                45.200000   
1              29.996667                 30.000000                37.848485   
2              29.926038                 35.226415                37.320755   
3              30.093333                 26.833333                40.791667   
4              30.223750                 27.666667                34.750000   

   DailyPrecipitation DailyPresentWeatherType  DailyPressureChange  \
0            0.002917          -RA:02 |RA |RA            -0.023750   
1            0.003226          -RA:02 |RA |RA            -0.027500   
2            0.000000                 BR:1 ||             0.031429   
3            0.000000                UP:09 ||            -0.063750   
4            0.000625                UP:09 ||             0.007500   

   DailyPressureTendency  DailyRelativeHumidity  DailySeaLevelPressure  \
0               2.750000      

In [15]:
# Collect all unique column names from all dataframes
all_columns = set()
for df in ws_trim_stage_6.values():
    all_columns.update(df.columns)

# Create a new dictionary to store the updated dataframes
ws_trim_stage_7 = {}

# Ensure each dataframe has all the collected columns, filling non-existent columns with False
for key, df in ws_trim_stage_6.items():
    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    # Reindex the dataframe to have all columns, filling missing columns with False
    df = df.reindex(columns=all_columns, fill_value=False)
    # Drop the 'DailyPresentWeatherType' column
    df = df.drop(columns=['DailyPresentWeatherType'])
    df = df.drop(columns=['DailySkyConditions'])
    # Move all columns that start with 'Daily' to the front
    daily_columns = [col for col in df.columns if col.startswith('Daily')]
    other_columns = [col for col in df.columns if not col.startswith('Daily')]
    df = df[daily_columns + other_columns]
    # Store the updated dataframe in the new dictionary
    ws_trim_stage_7[key] = df


# Verify the result
sample_df = ws_trim_stage_7[72790024141]
print(sample_df.head())

   DailyDewPointTemperature  DailyPressureTendency  DailyWindSpeed  \
0                 34.560000               2.750000       11.600000   
1                 30.000000               3.750000        5.969697   
2                 35.226415               6.714286        3.849057   
3                 26.833333               2.375000       12.500000   
4                 27.666667               5.375000        7.875000   

   DailySeaLevelPressure  DailyStationPressure  DailyRelativeHumidity  \
0              29.674583             28.328800              67.640000   
1              30.004167             28.657273              76.000000   
2              29.929091             28.586604              93.075472   
3              30.115417             28.749167              59.000000   
4              30.257500             28.874167              76.458333   

   DailyPrecipitation  DailyWetBulbTemperature  DailyAltimeterSetting  \
0            0.002917                40.480000              29.6576

In [16]:
# Iterate through each dataframe in the dictionary and print the number of columns
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key} has {df.shape[1]} columns.")

Dataset 72790024141 has 70 columns.
Dataset 72785524114 has 70 columns.
Dataset 72789094197 has 70 columns.
Dataset 72793024233 has 70 columns.
Dataset 72785794129 has 70 columns.
Dataset 72788594266 has 70 columns.
Dataset 72797624217 has 70 columns.
Dataset 72785024157 has 70 columns.
Dataset 72797094240 has 70 columns.
Dataset 72798594276 has 70 columns.
Dataset 72792424223 has 70 columns.
Dataset 72792894263 has 70 columns.
Dataset 72781024243 has 70 columns.
Dataset 72781524237 has 70 columns.
Dataset 72788324220 has 70 columns.
Dataset 72698824219 has 70 columns.
Dataset 72793894274 has 70 columns.
Dataset 74206024207 has 70 columns.
Dataset 72782724110 has 70 columns.
Dataset 72793724222 has 70 columns.
Dataset 72792594227 has 70 columns.
Dataset 72782594239 has 70 columns.
Dataset 72794504205 has 70 columns.
Dataset 72792394225 has 70 columns.
Dataset 72784524163 has 70 columns.
Dataset 72792024227 has 70 columns.
Dataset 72785694176 has 70 columns.


In [17]:
# Iterate through each dataframe in the dictionary
for key, df in ws_trim_stage_7.items():
    print(f"Dataset {key}:")
    
    # Count the number of null values in each dataframe
    null_count = df.isnull().sum().sum()
    print(f"Total null values: {null_count}")
    
    # Check if the dataframe contains a column whose type is not int, float, or bool
    valid_types = [np.dtype('int64'), np.dtype('float64'), np.dtype('bool')]
    invalid_types = df.dtypes[~df.dtypes.isin(valid_types)]
    if not invalid_types.empty:
        print("Columns with invalid types:")
        print(invalid_types)
    else:
        print("All columns have valid types (int, float, bool).")
    print()


Dataset 72790024141:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785524114:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72789094197:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72793024233:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785794129:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72788594266:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72797624217:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72785024157:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72797094240:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72798594276:
Total null values: 0
All columns have valid types (int, float, bool).

Dataset 72792424223:
Total null values: 0
All columns have valid types (int, flo

In [18]:
def display_invalid_entries(dataframes):
    for key, df in dataframes.items():
        print(f"Dataset {key}:")
        
        # Select columns that are not of type int64, float64, or bool
        invalid_columns = df.select_dtypes(exclude=['int64', 'float64', 'bool']).columns
        
        # Display the entries with invalid types
        if not invalid_columns.empty:
            invalid_entries = df[invalid_columns]
            display(invalid_entries)
        else:
            print("All columns have valid types (int64, float64, bool).")
        print()

    # Call the function with ws_trim_stage_7
display_invalid_entries(ws_trim_stage_7)

Dataset 72790024141:
All columns have valid types (int64, float64, bool).

Dataset 72785524114:
All columns have valid types (int64, float64, bool).

Dataset 72789094197:
All columns have valid types (int64, float64, bool).

Dataset 72793024233:
All columns have valid types (int64, float64, bool).

Dataset 72785794129:
All columns have valid types (int64, float64, bool).

Dataset 72788594266:
All columns have valid types (int64, float64, bool).

Dataset 72797624217:
All columns have valid types (int64, float64, bool).

Dataset 72785024157:
All columns have valid types (int64, float64, bool).

Dataset 72797094240:
All columns have valid types (int64, float64, bool).

Dataset 72798594276:
All columns have valid types (int64, float64, bool).

Dataset 72792424223:
All columns have valid types (int64, float64, bool).

Dataset 72792894263:
All columns have valid types (int64, float64, bool).

Dataset 72781024243:
All columns have valid types (int64, float64, bool).

Dataset 72781524237:
All 

In [19]:
# Create a copy of ws_trim_stage_7
ws_trim_stage_8 = {key: df.copy() for key, df in ws_trim_stage_7.items()}

# Function to encode 'DailyPrecipitation' values
def encode_precipitation(value):
    if value == 'T':
        return 0.05
    try:
        return float(value)
    except ValueError:
        return np.nan

# Apply the encoding to each dataframe in the copied dictionary
for key, df in ws_trim_stage_8.items():
    df['DailyPrecipitation'] = df['DailyPrecipitation'].apply(encode_precipitation).astype('float64')

# Verify the result
sample_df = ws_trim_stage_8[72790024141]
print(sample_df['DailyPrecipitation'].dtype)

float64


In [20]:
display_invalid_entries(ws_trim_stage_8)

Dataset 72790024141:
All columns have valid types (int64, float64, bool).

Dataset 72785524114:
All columns have valid types (int64, float64, bool).

Dataset 72789094197:
All columns have valid types (int64, float64, bool).

Dataset 72793024233:
All columns have valid types (int64, float64, bool).

Dataset 72785794129:
All columns have valid types (int64, float64, bool).

Dataset 72788594266:
All columns have valid types (int64, float64, bool).

Dataset 72797624217:
All columns have valid types (int64, float64, bool).

Dataset 72785024157:
All columns have valid types (int64, float64, bool).

Dataset 72797094240:
All columns have valid types (int64, float64, bool).

Dataset 72798594276:
All columns have valid types (int64, float64, bool).

Dataset 72792424223:
All columns have valid types (int64, float64, bool).

Dataset 72792894263:
All columns have valid types (int64, float64, bool).

Dataset 72781024243:
All columns have valid types (int64, float64, bool).

Dataset 72781524237:
All 

In [21]:
# Function to convert object columns to float64
def convert_object_to_float(df):
    for col in df.select_dtypes(include=['object']).columns:
        try:
            df[col] = df[col].astype('float64')
        except ValueError:
            print(f"Error converting column: {col}")
            print(f"Unique values in column {col}: {df[col].unique()}")
            # Print the values that cause the error
            for value in df[col].unique():
                try:
                    float(value)
                except ValueError:
                    print(f"Problematic value in column {col}: {value}")
            
    return df

# Apply the conversion to each dataframe in the dictionary
for key, df in ws_trim_stage_8.items():
    ws_trim_stage_8[key] = convert_object_to_float(df)

# Verify the result
sample_df = ws_trim_stage_8[72790024141]
print(sample_df.dtypes)

DailyDewPointTemperature    float64
DailyPressureTendency       float64
DailyWindSpeed              float64
DailySeaLevelPressure       float64
DailyStationPressure        float64
                             ...   
MW_FG                          bool
AW_SHSN                        bool
AU_-DZ:01                      bool
AU_HZ:7                        bool
AU_GS:08                       bool
Length: 70, dtype: object


In [22]:
# Create a copy of ws_trim_stage_8
ws_trim_stage_9 = {key: df.copy() for key, df in ws_trim_stage_8.items()}

# Function to convert boolean columns to float64
def convert_bool_to_float(df):
    bool_columns = df.select_dtypes(include=['bool']).columns
    df[bool_columns] = df[bool_columns].astype('float64')
    return df

# Apply the conversion to each dataframe in the copied dictionary
for key, df in ws_trim_stage_9.items():
    ws_trim_stage_9[key] = convert_bool_to_float(df)

# Verify the result
sample_df = ws_trim_stage_9[72790024141]
print(sample_df.dtypes)

DailyDewPointTemperature    float64
DailyPressureTendency       float64
DailyWindSpeed              float64
DailySeaLevelPressure       float64
DailyStationPressure        float64
                             ...   
MW_FG                       float64
AW_SHSN                     float64
AU_-DZ:01                   float64
AU_HZ:7                     float64
AU_GS:08                    float64
Length: 70, dtype: object


In [23]:
# Directory to save the processed CSV files
output_directory = 'processed-final-data-2'

# Create the directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Export each dataframe to a CSV file
for key, df in ws_trim_stage_9.items():
    output_filepath = os.path.join(output_directory, f'{key}.csv')
    df.to_csv(output_filepath, index=False)

print("Dataframes have been exported successfully.")

Dataframes have been exported successfully.
