### Dependencies

In [3]:
# File Management libraries
import tarfile
import urllib.request
from pathlib import Path

# Data Analytics libraries
import matplotlib.pyplot as plt

# ML libraries
import numpy as np

### Download Data

This function downloads the Housing data archive (a gzipped tarball, `housing.tgz`) from a GitHub repository, creates the `data` directory if it does not exist, and finally extracts the contents of the archive.

In [16]:
def downloadHousingData():
    """Download and unpack the housing dataset into ``data/housing.csv``.

    Nothing is done when ``data/housing.csv`` already exists. Otherwise the
    ``data/housing.tgz`` archive is downloaded (only if it is not already on
    disk), extracted into ``data/``, and the CSV is moved from the extracted
    ``data/housing/`` directory up to ``data/housing.csv``. The then-empty
    ``data/housing/`` directory is removed.

    Errors raised by the download, the extraction or the file-system calls
    are not caught here and propagate to the caller.
    """

    # Defining the archive path and the final csv path
    zipfilePath = Path('data/housing.tgz')
    housingPath = Path('data/housing.csv')

    # The csv is the end product: if it is already there, neither the
    # download nor the extraction is needed
    if housingPath.is_file():
        return

    # Downloading the archive only when it is not already on disk
    if not zipfilePath.is_file():

        # Making the data/ directory
        zipfilePath.parent.mkdir(parents=True, exist_ok=True)
        print('data/ directory created')

        # Setting the URL for downloading the data
        URL = 'https://github.com/ageron/data/raw/main/housing.tgz'

        # Downloading the data from the URL into the archive path
        urllib.request.urlretrieve(URL, zipfilePath)
        print(f'data downloaded from {URL}')

    # Open the archive and extract the content
    with tarfile.open(zipfilePath) as housing_zip:
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter rejects members that would escape the
            # destination directory; it only exists on Python versions that
            # ship extraction filters, hence the feature check
            housing_zip.extractall(path='data', filter='data')
        else:
            housing_zip.extractall(path='data')
    print('Content Extracted into data/')

    # Move the csv file to the data directory
    extractedCsvPath = Path('data/housing/housing.csv')
    extractedCsvPath.rename(housingPath)

    # Remove the housing/ directory from data/ (rmdir only succeeds when the
    # directory is empty, so unexpected extra files are never deleted)
    extractedCsvPath.parent.rmdir()

### Save Graph

This function saves the current matplotlib figure into the `images` directory with the specified name, file extension and resolution.

In [None]:
def saveGraph(graph_id:str, tight_layout=True, graph_extension='png', resolution=300):
    """Save the current matplotlib figure as ``images/<graph_id>.<graph_extension>``.

    Args:
        graph_id: File name (without extension) for the saved figure.
        tight_layout: When truthy, ``plt.tight_layout()`` is applied before saving.
        graph_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """

    # Make sure the output folder is there before anything is written
    outputDir = Path('images')
    outputDir.mkdir(parents=True, exist_ok=True)

    # Full destination of the figure inside the output folder
    fileName = f'{graph_id}.{graph_extension}'
    destination = outputDir / fileName

    # Optionally tighten the layout of the figure
    if tight_layout:
        plt.tight_layout()

    # Write the figure using the requested format and dpi
    plt.savefig(destination, format=graph_extension, dpi=resolution)

### Random Data Splitter

This function splits a dataframe randomly into a test and train set for ML models.

In [None]:
def randomDataSplitter(data, test_ratio=0.2, random_state=None):
    """Randomly split ``data`` into a train set and a test set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (presumably a pandas DataFrame, as the notebook text says).
        test_ratio: Fraction of the rows that goes to the test set; the test
            size is ``int(len(data) * test_ratio)``. Must be within [0, 1].
        random_state: Optional seed for a reproducible split. When ``None``
            (the default) NumPy's global random state is used, so an earlier
            ``np.random.seed`` call keeps its effect.

    Returns:
        A ``(trainSet, testSet)`` tuple.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """

    # A ratio outside [0, 1] would silently give a wrong split (a negative
    # ratio slices from the end of the index list), so it is rejected
    if not 0 <= test_ratio <= 1:
        raise ValueError(f'test_ratio must be between 0 and 1, got {test_ratio!r}')

    # Creates the shuffled list of indexes, seeded only when requested
    if random_state is None:
        indexList = np.random.permutation(len(data))
    else:
        indexList = np.random.default_rng(random_state).permutation(len(data))

    # Calculates the number of rows for the test set
    testSize = int(len(data) * test_ratio)

    # Creates the test and train sets based on the original dataframe
    testSet = data.iloc[indexList[:testSize]]
    trainSet = data.iloc[indexList[testSize:]]

    return trainSet, testSet

In [None]:
# NOTE(review): scratch cell. numpy's randint requires at least the `low`
# argument, so this call raises TypeError as written; pass bounds
# (e.g. np.random.randint(0, 10)) or remove the cell. Intent unknown - confirm.
np.random.randint()