# Weather-Forecast data preprocessing

In [1]:
# Importing required libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra

In [2]:
# Load the cleaned weather export into a DataFrame.
# NOTE: most column names in this CSV start with a space (see the column
# listing printed further down), so later cells must spell those names with
# the leading space included.
weather = pd.read_csv("weather_cleaned.csv")
weather.head()  # preview the first rows

Unnamed: 0,Date,Temperature,Average_humidity (%),Average_dewpoint (°F),Average_barometer (in),Average_windspeed (mph),Average_gustspeed (mph),Average_direction (°deg),Rainfall_for_month (in),Rainfall_for_year (in),...,Minimum_temperature (°F),Maximum_humidity (%),Minimum_humidity (%),Maximum_pressure,Minimum_pressure,Maximum_windspeed (mph),Maximum_gust_speed (mph),Maximum_heat_index (°F),Date1,diff_pressure
0,1/1/2022,37.8,35.0,12.7,29.7,26.4,36.8,274.0,0.0,0.0,...,34.0,4.0,27.0,29.76,29.6,41.4,59.0,40.0,1/1/2022,0.17
1,1/2/2022,43.2,32.0,14.7,29.5,12.8,18.0,240.0,0.0,0.0,...,37.0,4.0,16.0,29.67,29.27,35.7,51.0,52.0,1/2/2022,0.4
2,1/3/2022,25.7,60.0,12.7,29.7,8.3,12.2,290.0,0.0,0.0,...,6.0,8.0,35.0,30.23,29.26,25.3,38.0,41.0,1/3/2022,0.97
3,1/4/2022,9.3,67.0,0.1,30.4,2.9,4.5,47.0,0.0,0.0,...,0.0,7.0,35.0,30.57,30.23,12.7,20.0,32.0,1/4/2022,0.34
4,1/5/2022,23.5,30.0,-5.3,29.9,16.7,23.1,265.0,0.0,0.0,...,15.0,5.0,13.0,30.23,29.57,38.0,53.0,32.0,1/5/2022,0.66


In [3]:
# Shape of the data as a (rows, columns) tuple
weather.shape

(3934, 22)

# Check for Null Values

In [4]:
# Flag, column by column, whether at least one value is missing
has_missing = weather.isna().any()
print(has_missing)

Date                         False
 Temperature                 False
 Average_humidity (%)        False
 Average_dewpoint (°F)       False
 Average_barometer (in)      False
 Average_windspeed (mph)     False
 Average_gustspeed (mph)     False
 Average_direction (°deg)    False
 Rainfall_for_month (in)     False
Rainfall_for_year (in)       False
 Maximum_rain_per_minute     False
 Maximum_temperature (°F)    False
 Minimum_temperature (°F)    False
 Maximum_humidity (%)        False
 Minimum_humidity (%)        False
 Maximum_pressure            False
 Minimum_pressure            False
 Maximum_windspeed (mph)     False
 Maximum_gust_speed (mph)    False
 Maximum_heat_index (°F)     False
Date1                        False
 diff_pressure               False
dtype: bool


In [5]:
# Summarise each column's non-null count and dtype with DataFrame.info()
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3934 entries, 0 to 3933
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       3934 non-null   object 
 1    Temperature               3934 non-null   float64
 2    Average_humidity (%)      3934 non-null   float64
 3    Average_dewpoint (°F)     3934 non-null   float64
 4    Average_barometer (in)    3934 non-null   float64
 5    Average_windspeed (mph)   3934 non-null   float64
 6    Average_gustspeed (mph)   3934 non-null   float64
 7    Average_direction (°deg)  3934 non-null   float64
 8    Rainfall_for_month (in)   3934 non-null   float64
 9   Rainfall_for_year (in)     3934 non-null   float64
 10   Maximum_rain_per_minute   3934 non-null   float64
 11   Maximum_temperature (°F)  3934 non-null   float64
 12   Minimum_temperature (°F)  3934 non-null   float64
 13   Maximum_humidity (%)      3934 non-null   float

# Removing Duplicates and columns

In [6]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence
weather = weather.drop_duplicates(keep="first")

In [7]:
# Show every row whose two date columns disagree; an empty result means
# Date1 simply duplicates Date
dates_differ = weather["Date"].ne(weather["Date1"])
print(weather.loc[dates_differ])

Empty DataFrame
Columns: [Date,  Temperature,  Average_humidity (%),  Average_dewpoint (°F),  Average_barometer (in),  Average_windspeed (mph),  Average_gustspeed (mph),  Average_direction (°deg),  Rainfall_for_month (in), Rainfall_for_year (in),  Maximum_rain_per_minute,  Maximum_temperature (°F),  Minimum_temperature (°F),  Maximum_humidity (%),  Minimum_humidity (%),  Maximum_pressure,  Minimum_pressure,  Maximum_windspeed (mph),  Maximum_gust_speed (mph),  Maximum_heat_index (°F), Date1,  diff_pressure]
Index: []

[0 rows x 22 columns]


In [8]:
# List the column labels. The printout shows that most of them begin with a
# space, which matters when selecting or dropping columns by name.
print(weather.columns)

Index(['Date', ' Temperature', ' Average_humidity (%)',
       ' Average_dewpoint (°F)', ' Average_barometer (in)',
       ' Average_windspeed (mph)', ' Average_gustspeed (mph)',
       ' Average_direction (°deg)', ' Rainfall_for_month (in)',
       'Rainfall_for_year (in)', ' Maximum_rain_per_minute',
       ' Maximum_temperature (°F)', ' Minimum_temperature (°F)',
       ' Maximum_humidity (%)', ' Minimum_humidity (%)', ' Maximum_pressure',
       ' Minimum_pressure', ' Maximum_windspeed (mph)',
       ' Maximum_gust_speed (mph)', ' Maximum_heat_index (°F)', 'Date1',
       ' diff_pressure'],
      dtype='object')


The two date fields are exactly the same, so the redundant Date1 column should be removed.

Looking at the columns, I think I should also remove most of the min and max fields, as I want to look at the averages (with the exception of temperature, as I am interested in how that one specifically changes; the cell below also keeps the maximum gust speed). Wind direction also does not appear to add anything, as a gust of wind is a gust of wind from any direction.

I should also not use rainfall as it is here, because the month/year columns are running totals that build up until the end of the month/year. I am looking at this in a daily context, so if I wanted to use rainfall, I would need to take the day-to-day difference to get the rainfall per day. That is not inherently useful either, so I will just drop it.

In [9]:
# Columns that are not needed for the analysis (note the leading space that
# most of the source headers carry)
unused_columns = [
    " Average_gustspeed (mph)",
    " Average_direction (°deg)",
    " Rainfall_for_month (in)",
    "Rainfall_for_year (in)",
    " Maximum_rain_per_minute",
    " Maximum_humidity (%)",
    " Minimum_humidity (%)",
    " Maximum_pressure",
    " Minimum_pressure",
    " Maximum_windspeed (mph)",
    " Maximum_heat_index (°F)",
    "Date1",
    " diff_pressure",
]

# Build a new frame without those columns; `weather` itself is left untouched
weatherClean = weather.drop(columns=unused_columns)

weatherClean.head()  # Take a peek at the dataset

Unnamed: 0,Date,Temperature,Average_humidity (%),Average_dewpoint (°F),Average_barometer (in),Average_windspeed (mph),Maximum_temperature (°F),Minimum_temperature (°F),Maximum_gust_speed (mph)
0,1/1/2022,37.8,35.0,12.7,29.7,26.4,40.0,34.0,59.0
1,1/2/2022,43.2,32.0,14.7,29.5,12.8,52.0,37.0,51.0
2,1/3/2022,25.7,60.0,12.7,29.7,8.3,41.0,6.0,38.0
3,1/4/2022,9.3,67.0,0.1,30.4,2.9,19.0,0.0,20.0
4,1/5/2022,23.5,30.0,-5.3,29.9,16.7,30.0,15.0,53.0


# Fixing the Labels

For this, I will give each average field just its bare name. For example, Average_humidity (%) becomes Humidity. The min and max temperature become MinTemperature and MaxTemperature respectively, and the maximum gust speed becomes MaxGustSpeed.

In [10]:
# Replace the verbose source headers with short names. Columns are matched by
# position, so new_labels must give one name for every column after "Date",
# in the frame's current column order.
current_labels = list(weatherClean.columns)

new_labels = ["Temperature", "Humidity", "Dewpoint", "Pressure", "Windspeed", "MaxTemperature", "MinTemperature", "MaxGustSpeed"]
num_labels = len(new_labels)

# Fail before touching the frame if the column set is not Date + num_labels
# (e.g. the cell is re-run after Date was moved to the index); a positional
# rename on a shifted column list would silently mislabel every column.
assert len(current_labels) == num_labels + 1, (
    f"expected {num_labels + 1} columns (Date + {num_labels} measurements), "
    f"found {len(current_labels)}: {current_labels}"
)

# Build the whole old -> new mapping once and rename in a single pass
label_map = dict(zip(current_labels[1:], new_labels))
weatherClean = weatherClean.rename(columns=label_map)

weatherClean.head()

Unnamed: 0,Date,Temperature,Humidity,Dewpoint,Pressure,Windspeed,MaxTemperature,MinTemperature,MaxGustSpeed
0,1/1/2022,37.8,35.0,12.7,29.7,26.4,40.0,34.0,59.0
1,1/2/2022,43.2,32.0,14.7,29.5,12.8,52.0,37.0,51.0
2,1/3/2022,25.7,60.0,12.7,29.7,8.3,41.0,6.0,38.0
3,1/4/2022,9.3,67.0,0.1,30.4,2.9,19.0,0.0,20.0
4,1/5/2022,23.5,30.0,-5.3,29.9,16.7,30.0,15.0,53.0


In [11]:
# Check the dtypes after renaming; Date is still an object column at this point
print(weatherClean.dtypes)

Date               object
Temperature       float64
Humidity          float64
Dewpoint          float64
Pressure          float64
Windspeed         float64
MaxTemperature    float64
MinTemperature    float64
MaxGustSpeed      float64
dtype: object


# Set the Date as the Index

In [13]:
# Convert the Date strings to real datetimes and make them the index.
# The source values are month/day/year (e.g. "1/2/2022" is 2 January), so the
# format is spelled out rather than left for pandas to guess.
# The guard keeps the cell safe to re-run: once Date is the index there is no
# longer a "Date" column to convert.
if "Date" in weatherClean.columns:
    weatherClean = weatherClean.assign(
        Date=pd.to_datetime(weatherClean["Date"], format="%m/%d/%Y")
    ).set_index("Date")

weatherClean.head(12)  # Take a peek at the data

Unnamed: 0_level_0,Temperature,Humidity,Dewpoint,Pressure,Windspeed,MaxTemperature,MinTemperature,MaxGustSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-01,37.8,35.0,12.7,29.7,26.4,40.0,34.0,59.0
2022-01-02,43.2,32.0,14.7,29.5,12.8,52.0,37.0,51.0
2022-01-03,25.7,60.0,12.7,29.7,8.3,41.0,6.0,38.0
2022-01-04,9.3,67.0,0.1,30.4,2.9,19.0,0.0,20.0
2022-01-05,23.5,30.0,-5.3,29.9,16.7,30.0,15.0,53.0
2022-01-06,24.8,42.0,4.6,29.8,16.0,29.0,19.0,48.0
2022-01-07,34.2,60.0,21.6,29.7,20.4,39.0,27.0,54.0
2022-01-08,42.1,41.0,20.0,29.8,17.5,51.0,36.0,49.0
2022-01-09,30.3,46.0,11.4,30.0,6.9,41.0,19.0,36.0
2022-01-10,26.2,38.0,3.6,30.4,18.2,31.0,22.0,46.0


In [14]:
# Persist the cleaned frame; the Date index is written out as the first CSV column
weatherClean.to_csv("weather_cleaned2.csv")