#Part 1 - Automated EDA for individual raw datasets

In [None]:
###Created by Diogo Loureiro - automated EDA HTML reports for each dataset###
#To use this script, please execute the following steps:
    #1) Install the packages listed in requirements.txt
    #2) Save the desired .csv files in the data folder
    #3) Run this script
    #4) Reports will be generated inside the reports folder (.html format)

#Documentation for both libs (ydata-profiling, formerly pandas-profiling, and Sweetviz)
#ydata-profiling (pandas-profiling) = https://ydata-profiling.ydata.ai/docs/master/
#Sweetviz = https://pypi.org/project/sweetviz/#description

import os
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import sweetviz as sv

# Set to True to also generate a Sweetviz report next to the ydata-profiling one
sweetviz = False

# All paths below are resolved relative to the current working directory
directory = os.getcwd()
print(directory)

# Create the 'reports' folder (and the author sub-folder) if it doesn't exist.
# exist_ok=True replaces the separate os.path.exists() test, which is race-prone
# (the folder could appear between the check and the creation).
reports_directory = os.path.join(directory, 'reports', 'Diogo Loureiro')
os.makedirs(reports_directory, exist_ok=True)

# Folder that holds the raw .csv files to be profiled
data_directory = os.path.join(directory, 'data')

# Generate one HTML EDA report per CSV file found in the 'data' directory.
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for filename in sorted(os.listdir(data_directory)):
    # Skip anything that is not a CSV file (extension matched case-insensitively)
    if not filename.lower().endswith('.csv'):
        continue

    file_path = os.path.join(data_directory, filename)

    # Load the dataset
    data = pd.read_csv(file_path)

    # Reports are named after the source file: <name>_report.html
    report_filename = os.path.splitext(filename)[0] + '_report.html'
    report_path = os.path.join(reports_directory, report_filename)

    # Generate the EDA report using ydata-profiling and save it as HTML
    profile = ProfileReport(data, title=filename + ' Basic EDA', explorative=True, tsmode=True)
    profile.to_file(report_path)
    print("Report(s) saved at:", report_path)

    # Optionally generate the Sweetviz version, saved with an 'sv_' prefix
    if sweetviz:
        sv_report_path = os.path.join(reports_directory, 'sv_' + report_filename)
        sv_profile = sv.analyze(data)
        sv_profile.show_html(sv_report_path, open_browser=False)
        print("Report(s) saved at:", sv_report_path)

#Part 2- EDA for individual transformed + combined datasets

In [73]:
#Import libs#
import pandas as pd
import plotly.express as px
# Lift the column limit so displayed DataFrames show all of their columns
pd.options.display.max_columns = None

In [127]:
#Load all datasets and transform data (based on html reports)

#Passengers entrance per station
df_pes = pd.read_csv(
    'data/alllines_pes_complete.csv',
    parse_dates=['date'],
    dtype={'line': 'category', 'station': 'category', 'dpea': float},
    usecols=lambda column: column != 'Unnamed: 0',  # skip the unnamed leading column
)
#Aggregate per (date, line): number of distinct stations and summed dpea
# NOTE(review): 'line' is categorical here, so the groupby `observed` setting decides
# whether unobserved (date, line) combinations show up as rows - confirm this is intended.
df_pes = (
    df_pes.groupby(['date', 'line'])
    .agg(stations=('station', lambda x: x.nunique()), total_dpea=('dpea', 'sum'))
    .reset_index()
)

#Passengers transported per line
#Columns get a 'PTL_' prefix so they stay distinguishable after merging
df_ptl = pd.read_csv(
    'data/alllines_ptl_complete.csv',
    parse_dates=['year_month'],
    dtype={'line': 'category'},
).rename(columns={
    'year_month': 'date',
    'total': 'PTL_total',
    'MDU (Business Days Mean)': 'PTL_MDU (Business Days Mean)',
    'MSD (Saturdays Mean)': 'PTL_MSD (Saturdays Mean)',
    'MDO (Sundays Mean)': 'PTL_MDO (Sundays Mean)',
    'MAX (Daily Max)': 'PTL_MAX (Daily Max)',
})

#Interval per train
#The dtype entry used to be {'dpea': 'interval'}: 'dpea' belongs to the station file and
#'interval' is this file's value column, so the mapping was swapped and had no effect.
# NOTE(review): assumes the 'interval' column is numeric - confirm against the CSV.
df_ibt = pd.read_csv(
    'data/publiclines_ibt_complete.csv',
    parse_dates=['date'],
    dtype={'line': 'category', 'interval': float},
    usecols=lambda column: column != 'Unnamed: 0',  # skip the unnamed leading column
)

#Passengers entrance per line
#Columns get a 'PEL_' prefix so they stay distinguishable after merging
# NOTE(review): 'date' is parsed directly while 'year_month' is also renamed to 'date';
# presumably only one of the two exists in the file - verify.
df_pel = pd.read_csv(
    'data/publiclines_pel_complete.csv',
    parse_dates=['date'],
    dtype={'line': 'category'},
).rename(columns={
    'year_month': 'date',
    'total': 'PEL_total',
    'business_day_mean': 'PEL_business_day_mean',
    'saturday_mean': 'PEL_saturday_mean',
    'sunday_mean': 'PEL_sunday_mean',
    'max': 'PEL_max',
})


In [135]:
# Start from passengers transported per line and left-join the other datasets
# onto it, one after another, on the shared (date, line) key
merge_keys = ['date', 'line']
df_merged = df_ptl
for right_frame in (df_pes, df_ibt, df_pel):
    df_merged = df_merged.merge(right_frame, on=merge_keys, how='left')

# Cast 'line' to a categorical column on the merged result
df_merged['line'] = df_merged['line'].astype('category')
df_merged.head()

Unnamed: 0,date,line,PTL_total,PTL_MDU (Business Days Mean),PTL_MSD (Saturdays Mean),PTL_MDO (Sundays Mean),PTL_MAX (Daily Max),stations,total_dpea,interval,PEL_total,PEL_business_day_mean,PEL_saturday_mean,PEL_sunday_mean,PEL_max
0,2018-08-01,3,38679000.0,1440000.0,884000.0,504000.0,1483000.0,,,,32413000.0,1211000.0,724000.0,418000.0,1248000.0
1,2018-08-01,1,37775000.0,1420000.0,832000.0,446000.0,1466000.0,,,,27733000.0,1048000.0,598000.0,307000.0,1086000.0
2,2018-08-01,15,516000.0,22000.0,,2000.0,23000.0,,,,,,,,
3,2018-08-01,2,18164000.0,701000.0,318000.0,194000.0,723000.0,,,,14509000.0,560000.0,250000.0,155000.0,581000.0
4,2018-08-01,5,7220000.0,309000.0,176000.0,85000.0,322000.0,,,,,,,,


In [136]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 341
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          342 non-null    datetime64[ns]
 1   line                          342 non-null    category      
 2   PTL_total                     339 non-null    float64       
 3   PTL_MDU (Business Days Mean)  339 non-null    float64       
 4   PTL_MSD (Saturdays Mean)      337 non-null    float64       
 5   PTL_MDO (Sundays Mean)        339 non-null    float64       
 6   PTL_MAX (Daily Max)           339 non-null    float64       
 7   stations                      168 non-null    float64       
 8   total_dpea                    168 non-null    float64       
 9   interval                      112 non-null    float64       
 10  PEL_total                     223 non-null    float64       
 11  PEL_business_day_mean         22

In [None]:
from ydata_profiling import ProfileReport
import sweetviz as sv

import os  # imported here so this cell also runs when the Part 1 cell was not executed

# Set to True to also generate a Sweetviz report next to the ydata-profiling one
sweetviz = False

# Folder receiving the HTML reports; created up front so saving does not depend
# on the folder having been created by an earlier cell
reports_folder = 'reports/Diogo Loureiro'
os.makedirs(reports_folder, exist_ok=True)

# List of DataFrames with their corresponding names (used in report titles and file names)
dataframes = [
    (df_pes, 'Passengers entrance per station'),
    (df_ptl, 'Passengers transported per line'),
    (df_ibt, 'Interval per line'),
    (df_pel, 'Passengers entrances per line'),
    (df_merged, 'SP Metro dataset'),
]

# Iterate over the DataFrames and generate one report (or two, with Sweetviz) per DataFrame
for df, name in dataframes:
    # Generate the profile report and save it as an HTML file
    profile = ProfileReport(df, title='Basic EDA - ' + name, tsmode=True)
    report_file_path = f'{reports_folder}/{name} report.html'
    profile.to_file(report_file_path)

    # Optionally generate the Sweetviz version, saved with an 'SV_' prefix
    if sweetviz:
        sv_profile = sv.analyze(df)
        sv_report_file_path = f'{reports_folder}/SV_{name} report.html'
        sv_profile.show_html(sv_report_file_path, open_browser=False)

    print('Report generated:', report_file_path)

In [138]:
#Write the merged dataset to a parquet file inside the data folder
parquet_path = 'data/sp_metro_data.parquet'
df_merged.to_parquet(parquet_path)

In [139]:
# Line chart of PTL_total over time, one colored trace per metro line
line_chart_options = {
    'x': 'date',
    'y': 'PTL_total',
    'color': 'line',
    'width': 800,
    'height': 500,
}
fig = px.line(df_merged, **line_chart_options)
# Render the figure
fig.show()