In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# Location of the raw weather CSV. The original absolute path is kept as the
# default so the existing setup keeps working; on any other machine set the
# WEATHER_CSV environment variable to the file's location instead of editing
# this cell.
DATA_PATH = os.environ.get(
    'WEATHER_CSV',
    r'C:\Users\shive\OneDrive\Desktop\Projects\EXTRA\Python\weatherHistory.csv',
)

# Load the data, using the 'Formatted Date' column as the index
df = pd.read_csv(DATA_PATH, index_col='Formatted Date')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\shive\\OneDrive\\Desktop\\Projects\\EXTRA\\Python\\weatherHistory.csv'

## Overview of the Data 

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Mapping from the raw CSV headers to short snake_case column names.
# NOTE(review): 'Loud Cover' is presumably the header exactly as it is spelled
# in the CSV - verify against the file before "correcting" it.
rename_dict = dict([
    ('Summary', 'summary'),
    ('Precip Type', 'precip_type'),
    ('Temperature (C)', 'temp'),
    ('Apparent Temperature (C)', 'apparent_temp'),
    ('Humidity', 'humidity'),
    ('Wind Speed (km/h)', 'wind_speed'),
    ('Wind Bearing (degrees)', 'wind_bearing'),
    ('Visibility (km)', 'visibility'),
    ('Loud Cover', 'cloud_cover'),
    ('Pressure (millibars)', 'pressure'),
    ('Daily Summary', 'daily_summary'),
])

# Apply the new column names to the frame in place
df.rename(mapper=rename_dict, axis='columns', inplace=True)

# Give the index a matching snake_case name
df.index.name = 'date_time'

In [None]:
# Basic descriptive statistics for the table's columns
df.describe()

In [None]:
# Number of missing values in each column
print(df.isnull().sum())

In [None]:
# Total number of rows and how many of them have no precip_type recorded
total_rows = len(df.index)
null_precip_type = df['precip_type'].isna().sum()

# Share of rows without a precip_type, expressed as a percentage
percent_null = 100 * (null_precip_type / total_rows)
print('Null prevalence in precip_type column: %.2f' % percent_null, '%')

In [None]:
# Frequency of each precipitation type (value_counts leaves out NaN by default)
df.precip_type.value_counts()

In [None]:
# Frequency of each distinct cloud_cover value, to judge whether the column
# carries any usable information
df.cloud_cover.value_counts()

In [None]:
# Dropping the cloud cover column.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
df.drop(columns=['cloud_cover'], inplace=True, errors='ignore')

In [None]:
# Number of distinct daily_summary values (missing values are not counted)
daily_summary_cat = df['daily_summary'].nunique()

print(f'Categories in the Daily Summary column: {daily_summary_cat}')

## Data Wrangling

In [None]:
# Set index to a Pandas datetime format.
# utc=True yields a timezone-aware index in UTC, so any UTC offsets in the
# source strings are converted rather than kept.
# NOTE(review): the month/year values derived from this index further down are
# therefore UTC-based, not local time - confirm that is intended.
df.index = pd.to_datetime(df.index, utc=True)

In [None]:
# Scaling Humidity to percent and changing the data type to int8.
# Round before casting: astype('int8') truncates toward zero, and products
# such as 0.29 * 100 evaluate to 28.999999999999996 in floating point, which
# would otherwise be stored as 28 instead of 29.
# Assumes humidity is stored as a fraction in [0, 1], so the percentages fit in
# int8 (max 127) - TODO confirm against the data.
# NOTE: this cell is not re-runnable; a second run would scale the percentages
# again and overflow int8.
df['humidity'] = (df['humidity'] * 100).round().astype('int8')


In [None]:
# Changing the precipitation type feature to a categorical data type
# (the value counts earlier in the notebook inspect its distinct values)
df.precip_type = df.precip_type.astype('category')

## Numerical Distributions


In [None]:
# Columns whose distributions are plotted as histograms
num_cols = [
    'temp', 'apparent_temp', 'humidity', 'wind_speed',
    'wind_bearing', 'visibility', 'pressure',
]

# One histogram per selected column, drawn on a single 10x8 inch figure
df.hist(column=num_cols, figsize=(10, 8))
plt.tight_layout()
plt.show()


## Categorical Analysis

In [None]:
# Bar chart of how many records fall into each precipitation type
precip_ax = sns.countplot(data=df, x='precip_type')
precip_ax.set_title('Distribution of Precipitation Type')
precip_ax.set_xlabel('Precipitation Type')
plt.show()


In [None]:
# Extracting the month feature from date time
df['month'] = df.index.month

# Grouping the features of interest by month
# (kept for later use; the plots below read from df directly)
monthly_data = df.groupby('month')[[
    'temp',
    'humidity',
    'wind_speed',
    'wind_bearing',
    'visibility',
    'pressure'
]]

# (column, y-axis label, line colour) for each panel, in row-major order
monthly_panels = [
    ('temp', 'Temperature (C)', 'tab:blue'),
    ('humidity', 'Humidity (%)', 'tab:red'),
    ('wind_speed', 'Wind Speed (km/h)', 'tab:purple'),
    ('wind_bearing', 'Wind Bearing (degrees)', 'tab:green'),
    ('visibility', 'Visibility (km)', 'tab:pink'),
    ('pressure', 'Pressure (millibars)', 'tab:orange'),
]

# Creating subplot layout
fig, ax = plt.subplots(2, 3, figsize=(15, 10))

# Main figure title (the x-axis is the calendar month, not the year)
fig.suptitle('Monthly Changes in Recorded Features')

# One line plot per feature; ax.flat walks the 2x3 grid row by row
for panel, (column, ylabel, color) in zip(ax.flat, monthly_panels):
    sns.lineplot(x='month', y=column, data=df, color=color, ax=panel)
    panel.set_ylabel(ylabel)
    panel.set_xlabel('Time (month)')

In [None]:
# Extracting the year feature from date time
df['year'] = df.index.year

# Grouping the features of interest by year
# (kept for later use; the plots below read from df directly)
year_avg = df.groupby('year')[[
    'temp',
    'humidity',
    'wind_speed',
    'wind_bearing',
    'visibility',
    'pressure'
]]

# (column, y-axis label, line colour) for each panel, in row-major order
yearly_panels = [
    ('temp', 'Temperature (C)', 'tab:blue'),
    ('humidity', 'Humidity (%)', 'tab:red'),
    ('wind_speed', 'Wind Speed (km/h)', 'tab:purple'),
    ('wind_bearing', 'Wind Bearing (degrees)', 'tab:green'),
    ('visibility', 'Visibility (km)', 'tab:pink'),
    ('pressure', 'Pressure (millibars)', 'tab:orange'),
]

# Creating subplot layout
fig, ax = plt.subplots(2, 3, figsize=(15, 10))

# Main figure title
fig.suptitle('Year-over-Year Changes in Recorded Features')

# One line plot per feature; ax.flat walks the 2x3 grid row by row.
# The x-axis is the year, so it is labelled as such.
for panel, (column, ylabel, color) in zip(ax.flat, yearly_panels):
    sns.lineplot(x='year', y=column, data=df, color=color, ax=panel)
    panel.set_ylabel(ylabel)
    panel.set_xlabel('Time (year)')


## Correlation Between Numerical Features

In [None]:
# Columns to include in the correlation matrix
corr_attributes = [
    'temp', 'apparent_temp', 'humidity', 'wind_speed',
    'wind_bearing', 'visibility', 'pressure',
]

# Pairwise correlations between the selected columns
corr_matrix = df.loc[:, corr_attributes].corr()

# Annotated heatmap of the correlation matrix on a 10x8 inch figure
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap='bone', annot=True)