### Dependencies

In [1]:
# File Management libraries
import tarfile
import urllib.request
from zipfile import ZipFile
from pathlib import Path

# Data Analytics libraries
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# ML libraries
import numpy as np

# Math & Statistics
from math import ceil

# Custom libraries
import utils as ut

### Download Housing Data

This function downloads the housing data archive (`housing.tgz`) from a GitHub repository, creates the `data` directory if it does not exist, and finally extracts the housing CSV file from the archive.

In [16]:
def downloadHousingData():
    """Download the housing archive and place housing.csv under data/.

    The archive is fetched only when data/housing.tgz is missing, and the CSV
    is unpacked only when data/housing.csv is missing, so repeated calls skip
    the work that is already done.

    Only the housing/housing.csv member is read from the archive and it is
    written straight to data/housing.csv. Nothing else from the downloaded
    archive is written to disk, which avoids unpacking arbitrary member paths
    and removes the need to move the file and delete a temporary directory.

    Raises:
        FileNotFoundError: If the archive has no housing/housing.csv member.
    """

    # Defining the zip path and the csv path
    zipfilePath = Path('data/housing.tgz')
    housingPath = Path('data/housing.csv')

    # Checking if the zip file not exists
    if not zipfilePath.is_file():

        # Making the data/ directory
        Path('data').mkdir(parents=True, exist_ok=True)
        print('data/ directory created')

        # Setting the URL for downloading the data
        URL = 'https://github.com/ageron/data/raw/main/housing.tgz'

        # Downloading the data from the URL into the zip path
        urllib.request.urlretrieve(URL, zipfilePath)
        print(f'data downloaded from {URL}')

    # Checking if the csv file not exists
    if not housingPath.is_file():

        # Location of the csv inside the archive
        csvMember = Path('housing/housing.csv')

        with tarfile.open(zipfilePath) as housing_zip:

            # Comparing as paths so a leading './' in the member name still matches
            member = next(
                (m for m in housing_zip.getmembers()
                 if m.isfile() and Path(m.name) == csvMember),
                None,
            )
            if member is None:
                raise FileNotFoundError(
                    f'{csvMember.as_posix()} not found inside {zipfilePath}'
                )

            # Copy the member's bytes directly to the final destination
            with housing_zip.extractfile(member) as source:
                housingPath.write_bytes(source.read())
        print('Content Extracted into data/')

### Download California Map

This function downloads the zip archive containing the California county boundaries map, creates the `data` directory if it does not exist, and finally extracts the shapefile from the zip archive.

In [None]:
def downloadCaliforniaMap():
    """Download the California county boundaries archive and unpack the shapefile.

    The zip is fetched only when data/California_County_Boundaries.zip is
    missing, and extraction runs only when the .shp file is missing from data/.

    A shapefile is a set of files sharing one base name (.shp plus sidecars
    such as .shx and .dbf), so every archive member named
    'California_County_Boundaries.<ext>' is extracted, not just the .shp.

    Raises:
        KeyError: If the archive has no California_County_Boundaries.shp member.
    """

    # Defining the zip path and the map path
    zipfilePath = Path('data/California_County_Boundaries.zip')
    mapPath = Path('data/California_County_Boundaries.shp')

    # Checking if the zip file not exists
    if not zipfilePath.is_file():

        # Making the data/ directory
        Path('data').mkdir(parents=True, exist_ok=True)
        print('data/ directory created')

        # Setting the URL for downloading the data
        URL = 'https://gis-calema.opendata.arcgis.com/datasets/59d92c1bf84a438d83f78465dce02c61_0.zip?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D'

        # Downloading the data from the URL into the zip path
        urllib.request.urlretrieve(URL, zipfilePath)
        print(f'Zip file downloaded from {URL}')

    # Checking if the map file not exists
    if not mapPath.is_file():

        # Open the zip file and extract the content
        with ZipFile(zipfilePath, 'r') as map_zip:

            # Raises KeyError when the .shp member is absent, as extract() did
            map_zip.getinfo(mapPath.name)

            # Collect the .shp together with its same-named sidecar files
            componentPrefix = f'{mapPath.stem}.'
            components = [
                name for name in map_zip.namelist()
                if name.startswith(componentPrefix)
            ]
            map_zip.extractall(path='data', members=components)
        print('Map Extracted into data/')

### Save Graph

This function saves the current matplotlib graph to the `images` directory with the specified name, file extension and resolution.

In [None]:
def saveGraph(graph_id:str, tight_layout=True, graph_extension='png', resolution=300):
    """Write the current matplotlib figure to images/<graph_id>.<graph_extension>.

    Args:
        graph_id: File name (without extension) for the saved image.
        tight_layout: When true, plt.tight_layout() is applied before saving.
        graph_extension: Used both as the file suffix and as the savefig format.
        resolution: Value passed to savefig as dpi.
    """

    # The output folder is created on demand; an existing one is left untouched
    outputDir = Path('images')
    outputDir.mkdir(parents=True, exist_ok=True)

    # Full destination of the image inside the output folder
    targetFile = outputDir.joinpath(f'{graph_id}.{graph_extension}')

    # Optional layout adjustment before writing the file
    if tight_layout:
        plt.tight_layout()

    # Writing the figure with the requested format and dpi
    plt.savefig(targetFile, format=graph_extension, dpi=resolution)

### Random Data Splitter

This function randomly splits a DataFrame into a training set and a test set for ML models.

In [None]:
def randomDataSplitter(data, test_ratio=0.2):
    """Shuffle the row positions of ``data`` and split them into train and test sets.

    Args:
        data: Object supporting len() and positional selection through .iloc.
        test_ratio: Fraction of the rows assigned to the test set; the test
            size is int(len(data) * test_ratio), i.e. truncated.

    Returns:
        A (trainSet, testSet) tuple, in that order.
    """

    # One random ordering of all the row positions
    rowCount = len(data)
    shuffled = np.random.permutation(rowCount)

    # The first `cut` shuffled positions go to the test set, the rest to train
    cut = int(rowCount * test_ratio)
    testPositions, trainPositions = shuffled[:cut], shuffled[cut:]

    return data.iloc[trainPositions], data.iloc[testPositions]

### Thousands Format

This function formats a number as an integer with thousands separators (for example, `1234567` becomes `1,234,567`).

In [2]:
def thousandsFormat(x, pos=None):
    """Format a number as an integer string with comma thousands separators.

    Args:
        x: Value to format; it is truncated with int() before formatting.
        pos: Ignored. Accepted so the function can be used directly as a
            matplotlib FuncFormatter callback, which is called as func(x, pos).

    Returns:
        The formatted string, e.g. '1,234,567' for 1234567.
    """
    return f"{int(x):,}"

### Designed Bar Graph

This function creates a minimalistic and visually appealing bar graph.

In [1]:
def designedBarGraph(filename, labels, values, tittle, xlabel, ylabel, color='#329D9C'):
    """Draw a minimal bar graph and save it through ut.saveGraph.

    Args:
        filename: Name handed to ut.saveGraph for the saved image.
        labels: Bar positions/labels for the x axis.
        values: Bar heights.
        tittle: Title of the graph (parameter spelling kept for compatibility).
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        color: Fill color of the bars.
    """

    # Creating the graph axis and figure elements
    fig, ax = plt.subplots(figsize=(5, 5))

    # Creating the bar graph
    ax.bar(labels, values, color=color)

    # Grid configuration: faint horizontal lines only
    ax.grid(True, color='grey', linewidth=0.5, axis='y', alpha=0.3)

    # Setting grid below the graph
    ax.set_axisbelow(True)

    # Set the title centered above the graph
    ax.set_title(tittle, loc='center', pad=15, weight='bold', fontsize=10, fontfamily='arial')

    # Adding labels to the axis
    ax.set_xlabel(xlabel, labelpad=10, fontsize=9, fontfamily='arial')
    ax.set_ylabel(ylabel, labelpad=10, fontsize=9, fontfamily='arial')

    # Removing ticks from x, y axis
    ax.tick_params(axis='both', which='both', bottom=False, left=False)

    # Setting ticks labels font size and family
    ax.tick_params(axis='x', labelsize=8, labelfontfamily='arial')
    ax.tick_params(axis='y', labelsize=8, labelfontfamily='arial')

    # Removing spines
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    # Y axis values format. FuncFormatter calls its function with (value, pos),
    # so the tick position is dropped here before delegating to thousandsFormat.
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: thousandsFormat(value))
    )

    # Saving the graph in an image directory
    ut.saveGraph(filename)

### Multiple Histogram Graph

This function creates a figure containing multiple histograms based on the numeric columns of a given DataFrame.

In [None]:
def multiHist(filename, df, bins=50, kde=True, bar_color='#329D9C', kde_color='#472F7D'):
    """Draw one histogram per numeric column of ``df`` in a two-column grid.

    Args:
        filename: Name handed to ut.saveGraph for the saved image.
        df: DataFrame; only its numeric columns are plotted.
        bins: Number of bins passed to each histogram.
        kde: Whether a KDE line is drawn over each histogram.
        bar_color: Fill color of the histogram bars.
        kde_color: Color applied to the KDE line when one is drawn.

    Raises:
        ValueError: If ``df`` has no numeric columns.
    """

    # Select numeric columns from DataFrame
    df = df.select_dtypes(include='number')

    # Set up subplots and figure dynamically based on the number of columns
    numCols = len(df.columns)
    if numCols == 0:
        raise ValueError('multiHist requires at least one numeric column')

    # Two histograms per row, rounding up for an odd number of columns
    numRows = ceil(numCols / 2)

    # Three units of figure height per row of histograms
    fig, axes = plt.subplots(nrows=numRows, ncols=2, figsize=(15, numRows * 3))

    # Adjust the spacing between subplots
    fig.subplots_adjust(hspace=0.5)

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    # Plot histograms for each column (zip stops at the last column, so a
    # spare axis in the final row is simply not visited)
    for ax, column in zip(axes, df.columns):

        # Define histogram and KDE line design
        # NOTE(review): `sns` (seaborn) is not imported in the visible dependencies cell - confirm it is imported elsewhere
        sns.histplot(df[column], bins=bins, color=bar_color, alpha=1, kde=kde, line_kws={'lw': 1.5}, ax=ax)  #type: ignore

        # Recolor the KDE line only if one was drawn; with kde=False the axis
        # has no lines and indexing ax.lines[0] would raise IndexError
        if kde and ax.lines:
            ax.lines[0].set_color(kde_color)

        # Define grid design and disposition
        ax.grid(True, color='grey', linewidth=0.5, axis='y', alpha=0.3)
        ax.set_axisbelow(True)

        # Remove X and Y axis labels
        ax.set_xlabel('')
        ax.set_ylabel('')

        # Graph title design
        ax.set_title(column, loc='center', pad=15, weight='bold', fontsize=12, fontfamily='arial')

        # Change the tick labels font and size
        ax.tick_params(axis='x', labelsize=8, labelfontfamily='arial')
        ax.tick_params(axis='y', labelsize=8, labelfontfamily='arial')

    # If there are an odd number of subplots, remove the empty one
    if numCols % 2 != 0:
        fig.delaxes(axes[-1])

    # Save the graph in an images directory
    ut.saveGraph(filename, tight_layout=False)