In [141]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [142]:
df = pd.read_csv('./data/daily_data.csv')
print(df.head(1))

  day_id city_id  temperature_celsius condition_text  wind_kph  wind_degree  \
0  D0001    C001                 27.0            NaN       6.1          210   

   pressure_mb  precip_mm  humidity  cloud  feels_like_celsius  visibility_km  \
0       1006.0        0.0        54     75                28.0           10.0   

   uv_index  gust_kph  air_quality_us-epa-index   sunrise    sunset  
0       6.0      11.9                         2  06:04 AM  07:19 PM  


In [143]:
df.describe()

Unnamed: 0,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index
count,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0,2893.0
mean,22.024577,10.304805,158.994469,1013.745938,0.165299,74.773591,32.07328,23.725752,9.745524,2.21673,16.930522,1.599032
std,6.61554,7.264472,103.021634,6.172045,0.937061,19.335724,32.584184,8.877424,2.489918,2.238163,10.541449,1.062513
min,-2.0,3.6,1.0,982.0,0.0,4.0,0.0,-4.3,0.0,1.0,0.7,1.0
25%,17.0,5.0,70.0,1010.0,0.0,66.0,0.0,17.0,10.0,1.0,9.0,1.0
50%,22.0,8.3,150.0,1014.0,0.0,78.0,25.0,24.5,10.0,1.0,15.1,1.0
75%,27.0,13.0,236.0,1018.0,0.0,89.0,68.0,30.0,10.0,1.0,22.3,2.0
max,45.0,74.2,360.0,1036.0,28.7,100.0,100.0,73.6,32.0,10.0,75.6,6.0


In [144]:
# Filter records with non-empty 'condition_text'
df_filtered = df.dropna(subset=['condition_text'])
df_filtered.count()

day_id                      479
city_id                     479
temperature_celsius         479
condition_text              479
wind_kph                    479
wind_degree                 479
pressure_mb                 479
precip_mm                   479
humidity                    479
cloud                       479
feels_like_celsius          479
visibility_km               479
uv_index                    479
gust_kph                    479
air_quality_us-epa-index    479
sunrise                     479
sunset                      479
dtype: int64

In [145]:
df_filtered.head(1)
df_filtered.dtypes

day_id                       object
city_id                      object
temperature_celsius         float64
condition_text               object
wind_kph                    float64
wind_degree                   int64
pressure_mb                 float64
precip_mm                   float64
humidity                      int64
cloud                         int64
feels_like_celsius          float64
visibility_km               float64
uv_index                    float64
gust_kph                    float64
air_quality_us-epa-index      int64
sunrise                      object
sunset                       object
dtype: object

In [146]:
# Function to convert time to minutes past midnight
def time_to_minutes(time_series):
    # Convert to datetime
    time_dt = pd.to_datetime(time_series, format='%I:%M %p')
    # Calculate minutes past midnight
    minutes = time_dt.dt.hour * 60 + time_dt.dt.minute
    return minutes.values


df_filtered.loc[:, 'sunrise_num'] = time_to_minutes(
    df_filtered['sunrise']).astype(int)
df_filtered.loc[:, 'sunset_num'] = time_to_minutes(
    df_filtered['sunset']).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[:, 'sunrise_num'] = time_to_minutes(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[:, 'sunset_num'] = time_to_minutes(


In [147]:
df_filtered.head(1)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,sunrise_num,sunset_num
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM,365,1158


In [148]:
df_filtered.dtypes

day_id                       object
city_id                      object
temperature_celsius         float64
condition_text               object
wind_kph                    float64
wind_degree                   int64
pressure_mb                 float64
precip_mm                   float64
humidity                      int64
cloud                         int64
feels_like_celsius          float64
visibility_km               float64
uv_index                    float64
gust_kph                    float64
air_quality_us-epa-index      int64
sunrise                      object
sunset                       object
sunrise_num                   int32
sunset_num                    int32
dtype: object

In [149]:
scaler = MinMaxScaler()
numeric_cols = df_filtered.select_dtypes(include=[np.number]).columns.tolist()

# Use .loc to avoid SettingWithCopyWarning when scaling numeric columns
df_filtered.loc[:, numeric_cols] = scaler.fit_transform(
    df_filtered[numeric_cols])


In [150]:

numeric_cols

['temperature_celsius',
 'wind_kph',
 'wind_degree',
 'pressure_mb',
 'precip_mm',
 'humidity',
 'cloud',
 'feels_like_celsius',
 'visibility_km',
 'uv_index',
 'gust_kph',
 'air_quality_us-epa-index',
 'sunrise_num',
 'sunset_num']

In [151]:
df_filtered.head(1)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,sunrise_num,sunset_num
2,D0003,C001,0.46875,Light Rain with Thunder,0.0,0.016854,0.470588,0.229592,1.0,0.75,0.371257,0.308176,0.0,0.187697,0.0,06:05 AM,07:18 PM,0.302469,0.527638


In [152]:
# Drop the specified columns
df_filtered = df_filtered.drop(columns=['sunrise', 'sunset', 'city_id', 'day_id'])

# Save the resulting DataFrame to a CSV file with a custom delimiter '-d'
df_filtered.to_csv('./processed_data/filtered_data.csv', index=False)

In [154]:
# Step 1: Get unique values
unique_values = df_filtered["condition_text"].unique()

# Step 2 and 3: Create a dictionary with unique values as keys and their indices as values
unique_dict = {value: index for index, value in enumerate(unique_values)}
print(unique_dict)

{'Light Rain with Thunder': 0, 'Clear and Sunny': 1, 'Partly Cloudy': 2, 'Light Precipitation': 3, 'Cloudy and Overcast': 4, 'Mist or Fog': 5, 'Rain Showers': 6, 'Moderate to Heavy Rain': 7, 'Thunderstorms': 8}


In [155]:
df_filtered['condition_text'] = df_filtered['condition_text'].replace(unique_dict)

In [156]:
df_filtered.to_csv('./processed_data/filtered_data_with_classes.csv', index=False)