In [2]:
import pandas as pd
import io
import re


In [None]:
# Read the raw observations from disk into a DataFrame.
# The path is relative to the notebook's working directory.
csv_data = "raw_data.csv"
df = pd.read_csv(filepath_or_buffer=csv_data)

# Cleaning the 'hour' column to extract only HH:MM
def clean_hour(hour_str):
    """Return the leading ``HH:MM`` portion of a time string.

    Args:
        hour_str: Raw cell value from the 'hour' column. May be any type.

    Returns:
        The first five characters (two digits, a colon, two digits) when
        ``hour_str`` is a string that starts with that pattern. Any other
        value -- a non-string, or a string that does not start with
        two-digit ``HH:MM`` -- is returned unchanged.
    """
    # Non-string values (e.g. NaN from missing cells) pass straight through.
    if not isinstance(hour_str, str):
        return hour_str

    # re.match anchors at the start of the string, so only a leading time counts.
    leading_time = re.match(r'(\d{2}:\d{2})', hour_str)
    if leading_time is None:
        return hour_str
    return leading_time.group(1)

# Reduce each raw 'hour' string to its leading HH:MM part
df['hour'] = df['hour'].apply(clean_hour)

# Splitting date and time into separate columns
# Extract year, month, day from 'date'
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Extract hour and minute from 'hour'
# errors='coerce' turns values that do not parse as HH:MM into NaT, so the
# derived hour/minute for those rows are NaN rather than raising.
df['time'] = pd.to_datetime(df['hour'], format='%H:%M', errors='coerce')
df['hour_value'] = df['time'].dt.hour
df['minute'] = df['time'].dt.minute

# Dropping temporary 'time' and original 'date' and 'hour' columns
df = df.drop(columns=['date', 'hour', 'time'])

# Filling missing values in 'visibility'
# Convert to numeric; errors='coerce' maps any non-numeric placeholder
# (including the literal string 'N/A') to NaN, so no separate replace step
# is needed. An explicit Series.replace('N/A', None) is avoided because its
# handling of value=None has differed between pandas releases.
df['visibility'] = pd.to_numeric(df['visibility'], errors='coerce')
# Forward-fill gaps with the last valid observation. Series.ffill() replaces
# the deprecated fillna(method='ffill'). Leading NaNs (no earlier value to
# carry forward) are left as NaN.
df['visibility'] = df['visibility'].ffill()
# Alternative for smoother filling: df['visibility'].interpolate(method='linear')

# Reordering columns for clarity
df = df[['year', 'month', 'day', 'hour_value', 'minute', 'temperature', 'wind_speed', 
         'wind_angle', 'humidity', 'pressure', 'visibility']]
df.head()

  df['visibility'] = df['visibility'].fillna(method='ffill')


Unnamed: 0,year,month,day,hour_value,minute,temperature,wind_speed,wind_angle,humidity,pressure,visibility
0,2020,1,1,0,0,24.0,11.0,90.0,89.0,1021.0,6.0
1,2020,1,1,0,30,25.0,9.0,90.0,83.0,1021.0,6.0
2,2020,1,1,1,0,25.0,9.0,90.0,83.0,1020.0,6.0
3,2020,1,1,1,30,24.0,9.0,90.0,89.0,1020.0,6.0
4,2020,1,1,2,0,25.0,6.0,100.0,89.0,1020.0,6.0


In [None]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of the file
output = "clean_data.csv"
df.to_csv(path_or_buf=output, index=False)

# Preview the first rows of the frame that was just written
df.head()