In [27]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [28]:
df = pd.read_csv('./data/daily_data.csv')
print(df.head(1))

  day_id city_id  temperature_celsius condition_text  wind_kph  wind_degree  \
0  D0001    C001                 27.0            NaN       6.1          210   

   pressure_mb  precip_mm  humidity  cloud  feels_like_celsius  visibility_km  \
0       1006.0        0.0        54     75                28.0           10.0   

   uv_index  gust_kph  air_quality_us-epa-index   sunrise    sunset  
0       6.0      11.9                         2  06:04 AM  07:19 PM  


In [29]:
df.count()

day_id                      2893
city_id                     2893
temperature_celsius         2893
condition_text               479
wind_kph                    2893
wind_degree                 2893
pressure_mb                 2893
precip_mm                   2893
humidity                    2893
cloud                       2893
feels_like_celsius          2893
visibility_km               2893
uv_index                    2893
gust_kph                    2893
air_quality_us-epa-index    2893
sunrise                     2893
sunset                      2893
dtype: int64

In [30]:
df.describe()

Unnamed: 0,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index
count,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0
mean,22.024577,10.304805,158.994469,1013.745938,0.165299,74.773591,32.07328,23.725752,9.745524,2.21673,16.930522,1.599032
std,6.61554,7.264472,103.021634,6.172045,0.937061,19.335724,32.584184,8.877424,2.489918,2.238163,10.541449,1.062513
min,-2.0,3.6,1.0,982.0,0.0,4.0,0.0,-4.3,0.0,1.0,0.7,1.0
25%,17.0,5.0,70.0,1010.0,0.0,66.0,0.0,17.0,10.0,1.0,9.0,1.0
50%,22.0,8.3,150.0,1014.0,0.0,78.0,25.0,24.5,10.0,1.0,15.1,1.0
75%,27.0,13.0,236.0,1018.0,0.0,89.0,68.0,30.0,10.0,1.0,22.3,2.0
max,45.0,74.2,360.0,1036.0,28.7,100.0,100.0,73.6,32.0,10.0,75.6,6.0


In [31]:
# Filter records with non-empty 'condition_text'
df_filtered = df.loc[df['condition_text'].isnull()]
df_filtered.count()

day_id                      2414
city_id                     2414
temperature_celsius         2414
condition_text                 0
wind_kph                    2414
wind_degree                 2414
pressure_mb                 2414
precip_mm                   2414
humidity                    2414
cloud                       2414
feels_like_celsius          2414
visibility_km               2414
uv_index                    2414
gust_kph                    2414
air_quality_us-epa-index    2414
sunrise                     2414
sunset                      2414
dtype: int64

In [32]:
# Function to convert time to minutes past midnight
def time_to_minutes(time_series):
    # Convert to datetime
    time_dt = pd.to_datetime(time_series, format='%I:%M %p')
    # Calculate minutes past midnight
    minutes = time_dt.dt.hour * 60 + time_dt.dt.minute
    return minutes.values


df_filtered.loc[:, 'sunrise_num'] = time_to_minutes(
    df_filtered['sunrise']).astype(int)
df_filtered.loc[:, 'sunset_num'] = time_to_minutes(
    df_filtered['sunset']).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[:, 'sunrise_num'] = time_to_minutes(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[:, 'sunset_num'] = time_to_minutes(


In [33]:
df_filtered.head(1)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,sunrise_num,sunset_num
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM,364,1159


In [34]:
scaler = MinMaxScaler()
numeric_cols = df_filtered.select_dtypes(include=[np.number]).columns.tolist()

# Use .loc to avoid SettingWithCopyWarning when scaling numeric columns
df_filtered.loc[:, numeric_cols] = scaler.fit_transform(
    df_filtered[numeric_cols])


In [35]:
numeric_cols

['temperature_celsius',
 'wind_kph',
 'wind_degree',
 'pressure_mb',
 'precip_mm',
 'humidity',
 'cloud',
 'feels_like_celsius',
 'visibility_km',
 'uv_index',
 'gust_kph',
 'air_quality_us-epa-index',
 'sunrise_num',
 'sunset_num']

In [36]:
df_filtered.head(1)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,sunrise_num,sunset_num
0,D0001,C001,0.617021,,0.035411,0.582173,0.444444,0.0,0.510638,0.75,0.414634,0.3125,0.555556,0.149533,0.2,06:04 AM,07:19 PM,0.311765,0.542714


In [37]:
# Drop the specified columns
df_filtered = df_filtered.drop(columns=['sunrise', 'sunset', 'condition_text'])

# Save the resulting DataFrame to a CSV file with a custom delimiter '-d'
df_filtered.to_csv('./processed_data/filtered_input.csv', index=False)