# Data Wrangling



### Import the libraries

We will be mostly using `pandas`.

In [1]:
import pandas as pd

### Load the file

In [2]:
data_frame = pd.read_csv("raw_data.csv")

## Null values

In [3]:
# Check for missing values
data_frame.isnull().sum()

time                   0
wmo                    0
name                   0
history_product        0
air_temp               0
apparent_t             9
dewpt                  0
rel_hum                0
delta_t                0
wind_dir_deg           9
wind_spd_kmh           9
gust_kmh              16
rain_trace           262
rain_ten              30
rain_hour             59
duration_from_9am    262
press                  3
lat                    0
lon                    0
location               0
dtype: int64

In [4]:
# Remove Null values
data_frame = data_frame.dropna()
data_frame.isnull().sum()

time                 0
wmo                  0
name                 0
history_product      0
air_temp             0
apparent_t           0
dewpt                0
rel_hum              0
delta_t              0
wind_dir_deg         0
wind_spd_kmh         0
gust_kmh             0
rain_trace           0
rain_ten             0
rain_hour            0
duration_from_9am    0
press                0
lat                  0
lon                  0
location             0
dtype: int64

## Duplicate Values

In [5]:
# Find duplicates
data_frame.duplicated().sum()

np.int64(0)

In [6]:
# Remove duplicates
data_frame = data_frame.drop_duplicates()
data_frame.duplicated().sum()

np.int64(0)

## Time Management

In [7]:
# Show time values
print(data_frame['time'])

0        2024-06-30T15:00:00+10:00
1        2025-02-08T23:00:00+11:00
2        2025-02-08T19:30:00+11:00
3        2025-02-09T02:00:00+11:00
4        2025-02-08T23:30:00+11:00
                   ...            
13642    2025-03-08T09:00:00+11:00
13643    2025-03-08T08:30:00+11:00
13644    2025-03-08T10:00:00+11:00
13645    2025-03-08T09:30:00+11:00
13646    2025-03-08T10:30:00+11:00
Name: time, Length: 13353, dtype: object


In [11]:
# Remove timezone from time values
data_frame["time"] = data_frame["time"].astype(str)
reeeeee = data_frame["time"].str.split("T")
# data_frame["time"] = data_frame["time"].str.replace(r"\+\d{2}:\d{2}", "", regex=True)
# print(data_frame["time"])
reeeeee = reeeeee.str[1].tolist()  # Convert to list
# print(reeeeee)

reee11 = []
reee10 = []
for time_value in reeeeee:
    if "+11:00" in time_value:
        reee11.append(time_value)
    else:
        reee10.append(time_value)

# Modify elements in reee10
for i in range(len(reee10)):
    reee10[i] = reee10[i].split("+10:00")[0]

# Modify elements in reee11
ti = []
for i in range(len(reee11)):
    ti.append(reee11[i].split("+11:00")[0])

# Split time components in ti
time_components = []
for time_value in ti:
    ti_H, ti_M, ti_S = time_value.split(":")
    time_components.append((ti_H, ti_M, ti_S))

print(reee10)
print(time_components)

['15:00:00', '18:00:00', '19:00:00', '20:00:00', '21:00:00', '22:00:00', '00:00:00', '03:00:00', '14:00:00', '03:30:00', '14:30:00', '15:00:00', '05:00:00', '16:30:00', '07:00:00', '20:00:00', '21:00:00', '07:30:00', '22:30:00', '08:30:00', '09:00:00', '00:00:00', '23:00:00', '01:00:00', '10:00:00', '10:30:00', '02:30:00', '03:00:00', '11:30:00', '13:00:00', '05:30:00', '06:00:00', '19:00:00', '20:00:00', '10:30:00', '21:00:00', '21:30:00', '14:00:00', '16:00:00', '14:30:00', '15:00:00', '18:30:00', '17:00:00', '03:00:00', '03:30:00', '04:00:00', '06:00:00', '21:30:00', '07:30:00', '08:30:00', '09:30:00', '22:30:00', '00:00:00', '15:00:00', '16:00:00', '17:30:00', '01:30:00', '18:00:00', '02:30:00', '03:00:00', '20:48:00', '22:30:00', '21:00:00', '04:30:00', '23:30:00', '06:00:00', '02:00:00', '03:00:00', '08:30:00', '09:30:00', '08:30:00', '09:30:00', '10:30:00', '11:00:00', '10:00:00', '11:30:00', '10:30:00', '17:16:00', '16:52:00', '12:30:00', '13:00:00', '19:52:00', '20:00:00', '13

In [None]:
# Remove T from time values
data_frame['time'] = data_frame['time'].str.replace('T', ' ')
print(data_frame['time'])

In [None]:
# Convert values into UNIX timestamp in seconds
data_frame['time'] = pd.to_datetime(data_frame['time'])
data_frame['time'] = data_frame['time'].astype(int) // 10**9
print(data_frame['time'])

In [None]:
# Sort values by time
data_frame = data_frame.sort_values(by='time')
print(data_frame)

## Removing Unneeded Columns
These columns do not change and will add further unnecessary data.

In [None]:
# Drop columns that are not needed
"""
Icluded columns:
- wmo
- name
- history_product
- lat
- lon
- location
"""

data_frame = data_frame.drop(columns=['wmo', 'name', 'history_product', 'lat', 'lon', 'location'])
print(data_frame)

## Scaling Features

In [None]:
element = 'temp'

