#### **Preliminar parameters**

Importation of Libraries

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
from io import BytesIO

Importation of Dataset and Creation of Spark Session :

In [72]:
data = pd.read_csv("datasets/north_america_weather_2016_final.csv", sep=",")

---

### **Graph for Dashbord**

---

#### **I - First Graph**

Barchart on the year for a `feature` and an `airport`

In [39]:
def generate_bar_chart(airport_code, feature):
    data_filtered = data[(data['airport_code'] == airport_code) & (data[feature].notnull())]
    data_filtered['date'] = pd.to_datetime(data_filtered['date'])
    data_filtered = data_filtered.set_index('date').sort_index()

    # Creating interactive graphics with Plotly
    fig = px.bar(data_filtered, x=data_filtered.index, y=feature, 
                 title=f'{feature.capitalize()} for {airport_code}',
                 labels={'x': 'Date', 'y': feature})
    fig.update_xaxes(tickangle=45)
    
    # Change the color of bars and chart background
    fig.update_traces(marker_color='darkviolet')
    fig.update_layout(plot_bgcolor='white')

    fig.show()

generate_bar_chart('DWH', 'wind_speed')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### **II - Second Graph**

Top X of `feature` for `airport`

In [73]:
def plotTopX(feature, x=5, ascending=False):
    # Group by 'airport_code' and select the X highest or lowest values
    # according to feature
    grouped_data = data.groupby('airport_code').agg({feature: 'max'}).reset_index()
    sorted_data = grouped_data.sort_values(by=feature, ascending=ascending).head(x)
    
    # Retrieve data for selected airports
    top_x = data[data['airport_code'].isin(sorted_data['airport_code'])]

    # Sort data in descending order
    top_x = top_x.sort_values(by=feature, ascending=not ascending)

    # Creating interactive graphics with Plotly
    fig = px.bar(top_x, x='airport_code', y=feature, 
                 title=f'Top {x} airports with {"lowest" if ascending else "highest"} {feature}',
                 labels={'airport_code': 'Airport Code', 'y': feature})
    
    # Modify the angle of x-axis labels
    fig.update_xaxes(tickangle=45)
    
    # Change the color of bars and chart background
    fig.update_traces(marker_color='darkviolet')
    fig.update_layout(plot_bgcolor='white')

    fig.show()

plotTopX('wind_speed', x=5)

#### **III - Third Graph**

Pie Chart for Precipitation

In [47]:
def generate_pie_chart_precipitation(airport_code):
    data_filtered = data[(data['airport_code'] == airport_code) & (data['liquid_precipitation_hourly'].notnull())]
    
    # Exclude NaN columns from 'liquid_precipitation_hourly'
    data_filtered = data_filtered.dropna(subset=['liquid_precipitation_hourly'])
    
    if data_filtered.empty:
        print(f"No data available for {airport_code} or there are no valid values in 'liquid_precipitation_hourly'.")
        return
    
    labels = ['No Precipitation', 'Light', 'Moderate', 'Heavy']
    bins = [0, 0.1, 2.5, 7.6, float('inf')]
    
    # Categorizing precipitation data
    data_filtered['precipitation_category'] = pd.cut(data_filtered['liquid_precipitation_hourly'], bins=bins, labels=labels)
    grouped_data = data_filtered['precipitation_category'].value_counts().reset_index()
    grouped_data.columns = ['precipitation_category', 'count']

    # Creating pie charts with Plotly
    fig = px.pie(grouped_data, values='count', names='precipitation_category', 
                 title=f'Hourly Precipitation Rates for {airport_code}',
                 labels={'precipitation_category': 'Precipitation Category'})

    fig.show()

generate_pie_chart_precipitation('DWH')

#### **IV - Fourth Graph**

Pressure on time plot for an `airport`

In [48]:
def plot_pressure_time(airport_code):
    data_filtered = data[data['airport_code'] == airport_code]
    data_filtered['date'] = pd.to_datetime(data_filtered['date'])

    # Creating interactive graphics with Plotly
    fig = px.scatter(data_filtered, x='date', y='atmospheric_pressure_1', 
                     title=f'Atmospheric Pressure (scatter plot) over Time for {airport_code}',
                     labels={'date': 'Date', 'atmospheric_pressure_1': 'Atmospheric Pressure (1)'})

    fig.update_xaxes(tickangle=45)
    fig.show()

plot_pressure_time('DWH')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### **V - Fifth Graph**

Visibility over the time for an `airport`

In [49]:
def plot_visibility_over_time(airport_code):
    data_filtered = data[(data['airport_code'] == airport_code) & (data['visibility_observation'].notnull())]
    
    if data_filtered.empty:
        print(f"No data available for {airport_code} or there are no valid values for 'visibility_observation'.")
        return
    
    data_filtered['date'] = pd.to_datetime(data_filtered['date'])
    data_filtered = data_filtered.sort_values('date')

    # Creating interactive graphics with Plotly
    fig = px.line(data_filtered, x='date', y='visibility_observation',
                  title=f'Visibility Over Time for {airport_code}',
                  labels={'date': 'Date', 'visibility_observation': 'Visibility'})

    fig.update_traces(mode='markers+lines', marker=dict(symbol='circle', size=8))
    fig.update_xaxes(tickangle=45)
    fig.show()

plot_visibility_over_time('DWH')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### **VI - Sixth Graph**

Precipitation distribution per mounth

In [50]:
def precipitation_distribution_by_month(airport_code):
    # Filter data for specified airport
    data_filtered = data[data['airport_code'] == airport_code]
    
    if data_filtered.empty:
        print(f"No data available for {airport_code}.")
        return
    
    # Ensure date data is in datetime format
    data_filtered['date'] = pd.to_datetime(data_filtered['date'])
    
    # Extract month
    data_filtered['month'] = data_filtered['date'].dt.month_name()
    
    # Categorize precipitation (rain, snow, etc.) by month
    categories = ['No Precipitation', 'Light', 'Moderate', 'Heavy']
    bins = [0, 0.1, 2.5, 7.6, data_filtered['liquid_precipitation_hourly'].max()]
    data_filtered['precipitation_category'] = pd.cut(data_filtered['liquid_precipitation_hourly'], bins=bins, labels=categories)
    
    # Group by month and precipitation category and count occurrences
    grouped_data = data_filtered.groupby(['month', 'precipitation_category']).size().unstack().fillna(0)
    
    # Creating a stacked bar chart with Plotly
    fig = px.bar(grouped_data, barmode='stack',
                 labels={'month': 'Month', 'value': 'Count', 'precipitation_category': 'Precipitation Category'},
                 title=f'Precipitation Distribution by Month for {airport_code}')

    fig.update_layout(xaxis={'categoryorder':'total ascending'}, legend_title='Precipitation Category')
    fig.update_xaxes(tickangle=45)

    fig.show()

precipitation_distribution_by_month('JFK')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### **VII - Seventh Graph**

It is possible to found detail of sky codes here :
https://www.nodc.noaa.gov/archive/arc0021/0002199/1.1/data/0-data/HTML/WMO-CODE/WMO4677.HTM

Double Distribution of Sky Condition for an `airport` for specific `month`

In [71]:
def sky_condition_distribution_for_month(airport_code, month):
    # Filter data for specified airport and month
    data_filtered = data[(data['airport_code'] == airport_code) & (pd.to_datetime(data['date']).dt.month == pd.to_datetime(month).month)]
    
    if data_filtered.empty:
        print(f"No data available for {airport_code} in {month}.")
        return
    
    # Group by weather condition
    grouped_data_1 = data_filtered.groupby('sky_condition_1').size().fillna(0).reset_index(name='count')
    grouped_data_2 = data_filtered.groupby('sky_condition_2').size().fillna(0).reset_index(name='count')

    # Create two side-by-side bar graphs for each sky weather condition with Plotly
    fig = px.bar(grouped_data_1, x='sky_condition_1', y='count',
                 labels={'sky_condition_1': 'Sky Condition 1', 'count': 'Count'},
                 title=f'Sky Condition 1 Distribution for {airport_code} in {month}')

    fig.update_xaxes(tickangle=90)
    fig.show()

    fig = px.bar(grouped_data_2, x='sky_condition_2', y='count',
                 labels={'sky_condition_2': 'Sky Condition 2', 'count': 'Count'},
                 title=f'Sky Condition 2 Distribution for {airport_code} in {month}')

    fig.update_xaxes(tickangle=90)
    fig.show()

sky_condition_distribution_for_month('JFK', '2016-06-05')

#### **VIII - Eight Graph**

It is possible to found detail of sky codes here :
https://www.nodc.noaa.gov/archive/arc0021/0002199/1.1/data/0-data/HTML/WMO-CODE/WMO4677.HTM

Violin Graph for Sky Condition for an `airport`

In [62]:
def plot_sky_condition_violinplot(airport_code):
    # Filter data for specified airport
    data_filtered = data[data['airport_code'] == airport_code]

    if data_filtered.empty:
        print(f"No data available for {airport_code}.")
        return

    # Select 'sky_condition_1' and 'sky_condition_2' columns
    sky_conditions = ['sky_condition_1', 'sky_condition_2']

    # Create two violin plots for the 'sky_condition_1' and 'sky_condition_2' columns
    fig = px.violin(data_filtered.melt(id_vars='date', value_vars=sky_conditions, var_name='Sky_Condition', value_name='Condition'),
                    y='Condition', box=True, points="all", 
                    color='Sky_Condition', 
                    title=f'Sky Condition Distribution for Airport {airport_code}',
                    labels={'Condition': 'Sky Condition', 'Sky_Condition': 'Sky Category'})

    fig.update_layout(violinmode='group', xaxis_title='Sky Category', yaxis_title='Sky Condition')
    fig.show()

plot_sky_condition_violinplot('JFK')