# Capstone Project: Saudi Arabia Climate Observation 

by VISION Team

## Importing Packages

In [None]:
#import the usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#from datetime import datetime, timezone
#import time

## Loading Dataset

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the weather observations into a DataFrame.
# Colab alternative (file on a mounted Drive, ';'-delimited):
#df = pd.read_csv('/content/drive/MyDrive/saudi-hourly-weather-data.csv',delimiter = ';')
df = pd.read_csv('SaudiWeather.csv')

# Preview the first rows as the cell output.
df.head()

## Data Visualization

In [None]:
# Show a single row to get a quick look at every column and a sample value.
df.head(1)

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# List the column names available for plotting.
df.columns

In [None]:
# Shared colour palette (hex codes) reused by every chart in this notebook.
c = [
    '#FE3600',
    '#fa894d',
    '#fce760',
    '#009d77',
    '#b1eaeb',
]

### 1st Plot: Air Temperature Categories Distribution

In [None]:
# Pie chart of the four most frequent air-temperature categories.
fig = plt.figure(figsize=(10, 7))

# Counts of the four largest categories, in decreasing order of frequency.
data = df['air_temperature_categories'].value_counts().nlargest(4)

# Wedge labels, applied positionally to the counts above.
# NOTE(review): these are hard-coded, so they are only correct if the
# frequency ranking really is hot > comfortable > warm > cold -- confirm.
lable = ["hot", "comfortable ", "warm", "cold"]

colors = sns.color_palette(c)

plt.title(
    "Air Temperature Categories Distribution",
    fontsize=18,
    weight='bold',
)

# A small, equal "explode" offset separates the wedges slightly.
plt.pie(
    data.values,
    labels=lable,
    colors=colors,
    autopct='%1.1f%%',
    explode=[.01] * 4,
)
plt.show()
fig.savefig('1.jpg')

The figure above shows the distribution of air_temperature_categories (hot, comfortable, warm and cold) from 2017 to 2022.
Because the climate of Saudi Arabia is generally desert, the most common category is hot, followed by comfortable, while cold is the least common.

### 2nd Plot: Wind Type Distribution

In [None]:
# Number of observations per wind type, most frequent first.
Wind_Distribution = (
    df['wind_type']
    .value_counts()
    .sort_values(ascending=False)
)
Wind_Distribution

In [None]:
# Bar chart of observation counts per wind type.
plt.figure(figsize=(6, 6))

ax = sns.barplot(
    x=Wind_Distribution.index,
    y=Wind_Distribution,
    palette=c,
    ci=False,
    edgecolor="black",
)
ax.set_title("Wind Type Distribution", fontsize=18, weight='bold')
ax.set_ylabel("count")

# Turn the category labels vertical so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)

ax.figure.savefig('Wind Type Distribution.jpg')

According to the observations registered from 2017 to 2022, the wind type is normal in 99% of the records and variable in only 1%.

### 3rd Plot: Year Distribution

In [None]:
# Number of observations per year, most frequent first.
year_Distribution = (
    df['YEAR']
    .value_counts()
    .sort_values(ascending=False)
)
year_Distribution

In [None]:
# Bar chart of observation counts per year.
plt.figure(figsize=(8, 6))

ax = sns.barplot(
    x=year_Distribution.index,
    y=year_Distribution,
    palette=c,
    ci=False,
    edgecolor="black",
)
ax.set_title("Year Distribution", fontsize=18, weight='bold')
ax.set_ylabel("count")

# Turn the year labels vertical so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)

ax.figure.savefig('Year Distribution.jpg')

Our dataset contains 1,865,808 records registered over roughly five years, from 2017 to 2022.
From this plot we note that 2021 has the highest number of observation records (326,711), followed by 2022 with 321,873, according to the KAPSARC data portal.

### 4th Plot: Station Country Distribution

In [None]:
# Number of observations per weather station, most frequent first.
station_Distribution = (
    df['station_name']
    .value_counts()
    .sort_values(ascending=False)
)
station_Distribution

In [None]:
# Horizontal bar chart: one bar per station, bar length = number of observations.
plt.figure(figsize=(15,13))

# Add title
plt.title("Station Country Distribution",fontsize=18,weight = 'bold')

# Counts go on x and station names on y, which makes the bars horizontal and
# keeps the long station names readable.
sns.barplot(x=station_Distribution,y=station_Distribution.index, palette=c, ci=False,edgecolor="black")

# Rotate the count tick labels on the x-axis
plt.xticks(rotation=90)

# The counts run along the x-axis in this orientation, so "count" belongs on
# x and the y-axis carries the station names.
plt.xlabel("count")
plt.ylabel("station_name");
plt.savefig('Station Country Distribution.jpg')

From the chart above, which counts the registered observations per station, we see that Qassim has the highest number of records, followed by the King Abdullah Bin Abdulaziz station, with KFIA in Dammam coming last.

### 5th Plot: Humidity Levels Distribution

In [None]:
# Number of observations per humidity level, most frequent first.
humidity_Distribution = (
    df['humidity_level']
    .value_counts()
    .sort_values(ascending=False)
)
humidity_Distribution

In [None]:
# Bar chart of observation counts per humidity level.
plt.figure(figsize=(13, 13))

ax = sns.barplot(
    x=humidity_Distribution.index,
    y=humidity_Distribution,
    palette=c,
    ci=False,
    edgecolor="black",
)
ax.set_title("Humidity Levels Distribution", fontsize=18, weight='bold')
ax.set_ylabel("count")

# Keep the category labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)

ax.figure.savefig('Humidity Levels Distribution.jpg')

Across all stations in the country over the five years, the most common humidity level is 25% and lower, while the least common is 73% and higher.

### 6th Plot: Cloud And Visibility OK Distribution

In [None]:
# Number of observations per sky_cavok value, most frequent first.
cloud_visibility_Distribution = (
    df['sky_cavok']
    .value_counts()
    .sort_values(ascending=False)
)
cloud_visibility_Distribution

In [None]:
# Bar chart of observation counts per sky_cavok value.
plt.figure(figsize=(6, 6))

ax = sns.barplot(
    x=cloud_visibility_Distribution.index,
    y=cloud_visibility_Distribution,
    palette=c,
    ci=False,
    edgecolor="black",
)
ax.set_title("Cloud And Visibility OK Distribution", fontsize=18, weight='bold')
ax.set_ylabel("count")

# Keep the category labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)

ax.figure.savefig('Cloud And Visibility OK Distribution.jpg')

The plot above shows sky_cavok, which indicates whether a registered observation was CAVOK (Cloud And Visibility OK) or not.
About 80% of the observations were not CAVOK, while 20% were.

### 7th Plot: Total Observations in Each Region in 2021

In [None]:
# Keep only the observations registered in 2021; these are the rows that get
# placed on the map.
df_2021 = df.loc[df['YEAR'] == 2021]

In [None]:
import folium 
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import math


# Base map at a fixed starting location and zoom level.
m = folium.Map(location=[23.885942,45.079162], zoom_start=13)

# All markers go into one cluster layer so nearby points are grouped.
cluster = MarkerCluster()

# Drop rows missing either coordinate in one vectorised step, then iterate
# over plain Python floats. This avoids building a Series per row with
# iterrows(), which is very slow on a frame of this size, and also copes
# with missing values that are not float NaN.
coords = df_2021[['latitude', 'longitude']].dropna()
for lat, lon in zip(coords['latitude'].tolist(), coords['longitude'].tolist()):
    cluster.add_child(Marker([lat, lon]))

# Last expression: attaches the cluster and renders the map as the cell output.
m.add_child(cluster)

This map shows the total observations in each region. As you can see, in 2021 the Southern Region has the highest number of records, while Hail has the lowest.

### 8th Plot: Air Temperature Variations over Years

In [None]:
# Pivot air_temperature by hour (rows) and YEAR (columns) using pivot_table's
# default aggregation, then draw one line per year.
ax = df.pivot_table(
    values='air_temperature',
    index='hour',
    columns='YEAR',
).plot(figsize=(14, 6))

ax.set_title("Air Temperature Variations over Years", fontsize=18, weight='bold')
ax.set_ylabel('air_temperature in degree CELSIUS')
ax.figure.savefig('Air Temperature Variations over Years.jpg')

The chart above shows how the air temperature differs from year to year over the 5 years. In 2021 the temperature was the highest, while in 2020 it was almost the lowest.

### 9th Plot: Wind Speed Variations over Years

In [None]:
# Pivot wind_speed_rate by hour (rows) and YEAR (columns) using pivot_table's
# default aggregation, then draw one line per year.
ax = df.pivot_table(
    values='wind_speed_rate',
    index='hour',
    columns='YEAR',
).plot(figsize=(14, 6))

ax.set_title("Wind Speed Variations over Years", fontsize=18, weight='bold')
ax.set_ylabel('Wind Speed in m/s')
ax.figure.savefig('Wind Speed Variations over Years.jpg')

The plot above displays the wind speed variations over five years: the wind speed was almost the same in 2017 and 2018, while it reached its lowest in 2022.

### 10th Plot: Air Temperature Variations over Month

In [None]:
# Pivot air_temperature by hour (rows) and month (columns) using pivot_table's
# default aggregation, then draw one line per month.
ax = df.pivot_table(
    values='air_temperature',
    index='hour',
    columns='month',
).plot(figsize=(14, 6))

ax.set_title("Air Temperature Variations over Month", fontsize=18, weight='bold')
ax.set_ylabel('Air temperature in degree CELSIUS')
ax.figure.savefig('Air Temperature Variations over Month.jpg')

The chart above shows how air temperature varies by month over the years 2017 to 2022.
We note that months 12, 1 and 2 have the lowest temperatures, which corresponds to the winter season, while the highest temperatures occur in the summer months 6, 7 and 8.

### 11th Plot: Air Temperature Categories Per Year

In [None]:
# Grouped bar chart: number of observations per year, split by temperature category.
plt.figure(figsize = (14, 6))
ax = sns.countplot(x="YEAR", hue="air_temperature_categories",palette=c, data=df.sort_values(by='YEAR'))

# NOTE(review): the legend labels are hard-coded and applied positionally to
# the hue levels -- confirm that the hue order really is
# hot, comfortable, warm, cold.
ax.legend(loc='upper left', frameon=True, labels=["hot","comfortable ","warm","cold"])

# Write the count above each bar.
for p in ax.patches:
    height = p.get_height()
    # A year/category combination with no rows may yield a NaN-height bar;
    # there is nothing meaningful to annotate in that case.
    if np.isnan(height):
        continue
    # Bar heights come back as floats; show them as whole counts
    # ("326711" rather than "326711.0").
    ax.annotate(f"{int(height)}",
                (p.get_x()+p.get_width()/2., height-4),
                ha = 'center', va = 'center',
                xytext = (0, 10), textcoords = 'offset points')

_ = ax.set_title("Air Temperature Categories Per Year",fontsize=18,weight = 'bold')
plt.savefig('Air Temperature Categories Per Year.jpg')

For more detail, we break down each year by air temperature category; we see how the temperature increases every year due to climate change.


### 12th Plot: Wind Speed Rate According To Air Temperature Categories

In [None]:
# The ten most frequent wind speed rates and how often each occurs.
wind_speed = (
    df['wind_speed_rate']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
wind_speed

In [None]:
# Observation counts for the ten most frequent wind speed rates,
# split by air temperature category.
fig, ax = plt.subplots(figsize=(13, 8))

# Restrict (and order) the x-axis to the ten most common wind speed rates.
sns.countplot(
    x='wind_speed_rate',
    hue='air_temperature_categories',
    data=df,
    palette=c,
    order=df['wind_speed_rate'].value_counts().nlargest(10).index,
    ax=ax,
)

# Tilt the x tick labels so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=80)

ax.set_title(
    'Wind Speed Rate According To Air Temperature Categories',
    fontsize=18,
    weight='bold',
)

# NOTE(review): legend labels are hard-coded and applied positionally to the
# hue levels -- confirm the hue order matches hot, comfortable, warm, cold.
ax.legend(labels=["hot", "comfortable ", "warm", "cold"])

# Write the figure to disk.
fig.savefig('Wind Speed Rate According To Air Temperature Categories.jpg')

For more details about wind speeds and air temperature categories, we notice how the wind speed decreases every year from 2.1 m/s to 1.0 m/s.


### 13th Plot: Correlation Matrix Between Features

In [None]:
# Heatmap of pairwise correlations between the numeric features.
fig, ax = plt.subplots(figsize=(25,15)) 

# Correlate numeric and boolean columns only. Selecting them explicitly keeps
# this cell working on pandas >= 2.0, where DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them as older versions did.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Annotated heatmap drawn on the axes created above, using the shared palette
# as a discrete colour map.
sns.heatmap(corr,annot=True, cmap = c, linewidth = 0.30, ax=ax)

# Add title
plt.title("Correlation matrix between features",fontweight="bold",fontsize=20)

# Save before showing so the file is written from the fully drawn figure
# regardless of how the active backend handles the figure after show().
fig.savefig('Correlation matrix.jpg')
plt.show()

After preprocessing, we selected the most important features to build our regression machine learning models; this plot clarifies the correlation between the target 'air_temperature' and the other features.