In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd 
import numpy as np

# Function to load data from different file formats

In [5]:
def file_path(path):
    file_name, file_extension = os.path.splitext(path)
    supported_formats = ['.csv', '.xlsx']
    
    if file_extension not in supported_formats:
        raise ValueError("Unsupported file format. Please provide a CSV or XLS/XLSX file.")
    
    if file_extension == '.csv':
        df = pd.read_csv(path)
    elif file_extension == '.xlsx':
        df = pd.read_excel(path)
    
    return df

data = file_path("International_Report_Passengers.csv")
data

Unnamed: 0,data_dte,Year,Month,usg_apt_id,usg_apt,usg_wac,fg_apt_id,fg_apt,fg_wac,airlineid,carrier,carriergroup,type,Scheduled,Charter,Total
0,05/01/2014,2014,5,14492,RDU,36,11032,CUN,148,19534,AM,0,Passengers,0,315,315
1,06/01/2007,2007,6,13204,MCO,33,16085,YHZ,951,20364,C6,0,Passengers,0,683,683
2,12/01/2005,2005,12,11433,DTW,43,10411,AUA,277,20344,RD,1,Passengers,0,1010,1010
3,04/01/2003,2003,4,13487,MSP,63,16304,ZIH,148,20204,MG,1,Passengers,0,508,508
4,12/01/2005,2005,12,12016,GUM,5,11138,CRK,766,20312,TZ,1,Passengers,0,76,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680980,10/01/2015,2015,10,12478,JFK,22,12972,LHR,493,19540,BA,0,Passengers,130217,0,130217
680981,09/01/2014,2014,9,12478,JFK,22,12972,LHR,493,19540,BA,0,Passengers,132052,0,132052
680982,10/01/2014,2014,10,12478,JFK,22,12972,LHR,493,19540,BA,0,Passengers,132822,0,132822
680983,08/01/2014,2014,8,12478,JFK,22,12972,LHR,493,19540,BA,0,Passengers,134263,0,134263


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680985 entries, 0 to 680984
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   data_dte      680985 non-null  object
 1   Year          680985 non-null  int64 
 2   Month         680985 non-null  int64 
 3   usg_apt_id    680985 non-null  int64 
 4   usg_apt       680985 non-null  object
 5   usg_wac       680985 non-null  int64 
 6   fg_apt_id     680985 non-null  int64 
 7   fg_apt        680985 non-null  object
 8   fg_wac        680985 non-null  int64 
 9   airlineid     680985 non-null  int64 
 10  carrier       678188 non-null  object
 11  carriergroup  680985 non-null  int64 
 12  type          680985 non-null  object
 13  Scheduled     680985 non-null  int64 
 14  Charter       680985 non-null  int64 
 15  Total         680985 non-null  int64 
dtypes: int64(11), object(5)
memory usage: 83.1+ MB


In [7]:
data.isna().sum()

data_dte           0
Year               0
Month              0
usg_apt_id         0
usg_apt            0
usg_wac            0
fg_apt_id          0
fg_apt             0
fg_wac             0
airlineid          0
carrier         2797
carriergroup       0
type               0
Scheduled          0
Charter            0
Total              0
dtype: int64

# preprocessing the data

In [9]:
def preprocess(df):
    # Imputing missing values using median
    numerical_columns = df.select_dtypes(include=['number']).columns
    if not numerical_columns.empty:
        # If there are numerical columns
        df.fillna(df.median(numeric_only=True), inplace=True)
        
        # Normalize numerical columns using Min-Max Scaling (normalization)
        for column in numerical_columns:
            df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    
    # Encoding categorical columns using one-hot encoding
    categorical_columns = df.select_dtypes(include=['object']).columns
    if not categorical_columns.empty:
        # If there are categorical columns
        df[categorical_columns] = df[categorical_columns].fillna("unknown")
        df = pd.get_dummies(df, columns=categorical_columns)
    
    return df.head()

data_processed = preprocess(data)

In [10]:
data.isna().sum()

data_dte        0
Year            0
Month           0
usg_apt_id      0
usg_apt         0
usg_wac         0
fg_apt_id       0
fg_apt          0
fg_wac          0
airlineid       0
carrier         0
carriergroup    0
type            0
Scheduled       0
Charter         0
Total           0
dtype: int64

# Function to generate visualization dashboard

In [13]:
def generate_visualizations(data):
    # Generate histograms
    for column in data.columns:
        plt.figure()
        sns.histplot(data[column])
        plt.title(f"Histogram of {column}")
        plt.show()

  # Generate box plots
    for column in data.columns:
        plt.figure()
        sns.boxplot(data=data[column])
        plt.title(f"Box plot of {column}")
        plt.show()

  # Generate scatter plots
    for column1 in data.columns:
        for column2 in data.columns:
            if column1 != column2:
                plt.figure()
                sns.scatterplot(data=data, x=column1, y=column2)
                plt.title(f"Scatter plot of {column1} vs {column2}")
                plt.show()

    # Generate interactive visualizations using Plotly
    fig = px.scatter_matrix(data)
    fig.show()



# Main function

In [None]:
def main():
    # Load data

    # Preprocess data
    preprocessed_data = preprocess(data)

    # Generate visualization dashboard
    generate_visualizations(preprocessed_data)

# Run the main function
if __name__ == '__main__':
    main()