# Python

About python

In [1]:
# importing module
import requests
import datetime, time

def download_file(src, dest):
    """
Function to dowload file(s) to a specified location
Returns a dictionary object stating details like source, destination, started date time, end date time and timetaken.
Arguments:
src - Location from which file needs to be downloaded
dest - Location at which file needs to be saved
    """
    started_on = datetime.datetime.now()
    start_time = time.perf_counter()
    status = None
    description = None
    try:
        response = requests.get(src)
        open(f"{dest}", "wb").write(response.content)
        status = "Success"
    except Exception as e:
        status = "Fail"
        description = str(e)

    end_time = time.perf_counter()
    ended_on = datetime.datetime.now()

    return {
        "source": src,
        "destination": dest,
        "started_on": started_on,
        "completed_on": ended_on,
        "time_taken": (end_time - start_time),
        "status": status,
        "description": description
    }

## Logging
> Tracking events i.e. errors, exceptions, warnings, informations etc.

In [10]:
# importing module
import logging

# Test messages
logging.debug("Harmless debug Message")
logging.info("Just an information")
logging.warning("Its a Warning")
logging.error("Did you try to divide by zero")
logging.critical("Internet is down")

ERROR:root:Did you try to divide by zero
CRITICAL:root:Internet is down


In [11]:
#logging.basicConfig(format='%(asctime)s %(levelname)-10s %(message)s')

console = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s [%(levelname)-8s] %(name)-12s: %(message)s', datefmt='%d-%b-%Y %H:%M:%S')
console.setFormatter(formatter)

logger = logging.getLogger("Python Language")

logger.handlers.clear()

logger.addHandler(console)

logger.debug("Harmless debug Message")
logger.info("Just an information")
logger.warning("Its a Warning")
logger.error("Did you try to divide by zero")
logger.critical("Internet is down")

21-Jul-2025 10:50:56 [ERROR   ] Python Language: Did you try to divide by zero
ERROR:Python Language:Did you try to divide by zero
21-Jul-2025 10:50:56 [CRITICAL] Python Language: Internet is down
CRITICAL:Python Language:Internet is down


In [12]:
LIST_SIZE = 50
URLs = ["https://picsum.photos/3000/3000" for i in range(LIST_SIZE)]
PATHS = [f"files/output/images/img_{i}.jpeg" for i in range(LIST_SIZE)]

## Threading and Multiprocessing
 - Techniques to achive concurrency and parallelism
 - running and managing the multiple computations at the same time.
 - deals lot of things simultaneously.
 - debugging is very hard

### Concurrency
 - achieved through the interleaving operation of processes on the central processing unit(CPU) or in other words by the context switching
 - can be done by using a single processing unit
 - increases the amount of work finished at a time
 - non-deterministic control flow approach

### Parallelism
 - achieved by through multiple central processing units(CPUs)
 - needs multiple processing units
 - improves the throughput and computational speed of the system
 - deterministic control flow approach

### Threading
 - Implements the Concurrency
 - IO-bound tasks

In [6]:
import threading
from concurrent.futures import ThreadPoolExecutor

def simple_threading():
    t1 = threading.Thread(target=download_file, args=[URLs[0], "files/thread_1.jpeg"])
    t2 = threading.Thread(target=download_file, args=[URLs[0], "files/thread_2.jpeg"])
    t3 = threading.Thread(target=download_file, args=[URLs[0], "files/thread_3.jpeg"])

    t1.start()
    t2.start()
    t3.start()

    t1.join()
    t2.join()
    t3.join()

def thread_pooling():
    with ThreadPoolExecutor() as executor:
        results = executor.map(download_file, URLs[0:3], PATHS[0:3])
        for result in results:
            logger.info(result)

In [12]:
logger.setLevel(logging.INFO)
thread_pooling()

24-Aug-2023 04:59:03 [INFO    ] root        : {'source': 'https://picsum.photos/3000/3000', 'destination': 'files/img_0.jpeg', 'started_on': datetime.datetime(2023, 8, 24, 4, 59, 0, 593273), 'completed_on': datetime.datetime(2023, 8, 24, 4, 59, 3, 102773), 'time_taken': 2.509490514999925, 'status': 'Success', 'description': None}
24-Aug-2023 04:59:04 [INFO    ] root        : {'source': 'https://picsum.photos/3000/3000', 'destination': 'files/img_1.jpeg', 'started_on': datetime.datetime(2023, 8, 24, 4, 59, 0, 595235), 'completed_on': datetime.datetime(2023, 8, 24, 4, 59, 4, 443243), 'time_taken': 3.8479987010000514, 'status': 'Success', 'description': None}
24-Aug-2023 04:59:04 [INFO    ] root        : {'source': 'https://picsum.photos/3000/3000', 'destination': 'files/img_2.jpeg', 'started_on': datetime.datetime(2023, 8, 24, 4, 59, 0, 597096), 'completed_on': datetime.datetime(2023, 8, 24, 4, 59, 4, 177603), 'time_taken': 3.5804985459999443, 'status': 'Success', 'description': None}


### Multiprocessing
 - Implements the Parallelism
 - CPU bound Tasks

In [None]:
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

def simple_processes():
    t1 = multiprocessing.Process(target=downloadFile, args=[URLs[0], "files/process_1.jpeg"])
    t2 = multiprocessing.Process(target=downloadFile, args=[URLs[0], "files/process_2.jpeg"])
    t3 = multiprocessing.Process(target=downloadFile, args=[URLs[0], "files/process_3.jpeg"])

    t1.start()
    t2.start()
    t3.start()

    t1.join()
    t2.join()
    t3.join()

def multi_process():
    process_list = []
    for i in range(LIST_SIZE):
        p = multiprocessing.Process(target=downloadFile, args=[URLs[i], PATHS[i]])
        p.start()
        process_list.append(p)
    for p in process_list:
        p.join()

def process_pool():
    with ProcessPoolExecutor() as executor:
        results = executor.map(downloadFile, URLs, PATHS)
        for result in results:
            print(result)

# Numpy
- Open source and fundamental package or library for **scientific computing** for Python
- Contains multidimensional array and matrix data structures
- Core of the scientific Python and PyData ecosystems
- Universal standard for working with numerical data in Python
- NumPy arrays are **faster** and more **compact** than Python lists
- The core of the NumPy package is the **ndarray** object
- Operations on arrays, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation
- used extensively in Pandas, SciPy, Matplotlib, scikit-learn, scikit-image and most other data science and scientific Python packages

## Reason for Fast
1. Vectorization
    - describes the absence of any explicit looping, indexing, etc.
    - more concise and easier to read
    - fewer lines of code generally means fewer bugs
    - closely resembles standard mathematical notation
2. Broadcasting
    - describe the implicit element-by-element behavior of operations
    - the smaller array is “broadcast” across the larger array so that they have compatible shapes

In [1]:
import numpy as np

In [2]:
import logging

console = logging.StreamHandler()
#formatter = logging.Formatter('%(asctime)s [%(levelname)-8s] [%(processName)s] %(name)s: %(message)s', datefmt='%d-%b-%Y %H:%M:%S')
formatter = logging.Formatter('[%(levelname)-8s]: %(message)s')
console.setFormatter(formatter)

logger = logging.getLogger("Numpy Library")

logger.handlers.clear()

logger.addHandler(console)

logger.setLevel(logging.INFO)

## Numpy Fundamentals
Concepts, design decisions, and technical constraints in NumPy.

### Array Creation
There are 6 general mechanisms for creating arrays:

1. Conversion from other Python structures (i.e. lists and tuples)
2. Intrinsic NumPy array creation functions (e.g. arange, ones, zeros, etc.)
3. Replicating, joining, or mutating existing arrays
4. Reading arrays from disk, either from standard or custom formats
5. Creating arrays from raw bytes through the use of strings or buffers
6. Use of special library functions (e.g., random)

**Note:-** 
1. Naming Conventions:
    1. \*1D\* : One Dimension Array
    2. \*2D\* : Two Dimension Array
    3. \*3D\* : Three Dimension Array
    4. \*nD\* : n Dimension Array
2. A

In [None]:
# TODO Add defination for the functions used in single line for basic understanding

In [3]:
# Conversion from other Python structures (i.e. lists and tuples)
l1D = np.array([1, 2, 3, 4])
l2D = np.array([[1, 2], [3, 4]])
l3D = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

t1D = np.array((1, 2, 3, 4), dtype=np.int8)

# Intrinsic NumPy array creation functions (e.g. arange, ones, zeros, etc.)
## arrange function: 
a1D_1 = np.arange(10)
a1D_2 = np.arange(2, 10, dtype=float)
a1D_3 = np.arange(2, 3, 0.1)
## linspace function: 
l1d_4 = np.linspace(1., 10., 6)

## eye function: 
e2D_1 = np.eye(3)
e2D_2 = np.eye(3, 2)

## diag function: 
d2D_1 = np.diag([1, 2, 3])
d2D_2 = np.diag([1, 2, 3], 1)
d2D_3 = np.diag(l2D)

## vander function: 
v2D_1 = np.vander([1, 2, 3, 4], 2)
v2D_2 = np.vander((1, 2, 3, 4), 4)
v2D_3 = np.vander(np.linspace(0, 2, 5), 2)

## zeros function:
z3D_1 = np.zeros((2, 3))
z3D_1 = np.zeros((2, 3, 2))

## ones function:
o3D_1 = np.ones((2, 3))
o3D_1 = np.ones((2, 3, 2))

## numpy.random.default_rng().random() function:
dr3D_1 = np.random.default_rng(42).random((2, 3))
dr3D_1 = np.random.default_rng(42).random((2, 3, 2))

## indices function:
n3D_1 = np.indices((3,3))

# Replicating, joining, or mutating existing arrays
## function:
## function:

# Reading arrays from disk, either from standard or custom formats
## loadtxt function:

# Creating arrays from raw bytes through the use of strings or buffers
## fromfile function:

In [4]:
# TODO Call all the variables using logger
logger.info(f"a1D_1: {a1D_1}")
logger.info(f"a1D_2: {a1D_2}")
logger.info(f"e2D_1: {e2D_1}")

[INFO    ]: a1D_1: [0 1 2 3 4 5 6 7 8 9]
[INFO    ]: a1D_2: [2. 3. 4. 5. 6. 7. 8. 9.]
[INFO    ]: e2D_1: [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


### Indexing

In [None]:
# code

### I/O Operations

In [None]:
# code

### Data Types
| **Numpy type**             | **C type**                                                        | **Description**                                                                                  |
|----------------------------|-------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
| numpy.bool_                | bool                                                              | Boolean (True or False) stored as a byte                                                         |
| numpy.byte                 | signed char                                                       | Platform-defined                                                                                 |
| numpy.ubyte                | unsigned char                                                     | Platform-defined                                                                                 |
| numpy.short                | short                                                             | Platform-defined                                                                                 |
| numpy.ushort               | unsigned short                                                    | Platform-defined                                                                                 |
| numpy.intc                 | int                                                               | Platform-defined                                                                                 |
| numpy.uintc                | unsigned int                                                      | Platform-defined                                                                                 |
| numpy.int_                 | long                                                              | Platform-defined                                                                                 |
| numpy.uint                 | unsigned long                                                     | Platform-defined                                                                                 |
| numpy.longlong             | long long                                                         | Platform-defined                                                                                 |
| numpy.ulonglong            | unsigned long long                                                | Platform-defined                                                                                 |
| numpy.half / numpy.float16| | Half precision float: sign bit, 5 bits exponent, 10 bits mantissa |
| numpy.single               | float                                                             | Platform-defined single precision float: typically sign bit, 8 bits exponent, 23 bits mantissa   |
| numpy.double               | double                                                            | Platform-defined double precision float: typically sign bit, 11 bits exponent, 52 bits mantissa. |
| numpy.longdouble           | long double                                                       | Platform-defined extended-precision float                                                        |
| numpy.csingle              | float complex                                                     | Complex number, represented by two single-precision floats (real and imaginary components)       |
| numpy.cdouble              | double complex                                                    | Complex number, represented by two double-precision floats (real and imaginary components).      |
| numpy.clongdouble          | long double complex                                               | Complex number, represented by two extended-precision floats (real and imaginary components).    |

In [None]:
# code

### Broadcasting

In [None]:
#code

### Copies and Views

In [None]:
#code 

### Structured arrays

In [None]:
# code

### Universal functions (ufunc) basic 

In [None]:
# code

# Pandas
- data manipulation package for tabular data
- used throughout the data analysis workflow
- data can be imported from databases, spreadsheets, comma-separated values (CSV) files, and more.
- High performance merging and joining of data
- Data alignment and integrated handling of missing data

In [None]:
import pandas as pd

In [None]:
import logging

console = logging.StreamHandler()
#formatter = logging.Formatter('%(asctime)s [%(levelname)-8s] [%(processName)s] %(name)s: %(message)s', datefmt='%d-%b-%Y %H:%M:%S')
formatter = logging.Formatter('[%(levelname)-8s]: %(message)s')
console.setFormatter(formatter)

logger = logging.getLogger("Pandas Library")

logger.handlers.clear()

logger.addHandler(console)

logger.setLevel(logging.INFO)

In [None]:
## Dataframes

# Practise Exercises

In [63]:
import csv
import datetime
import json
import logging
import os
import requests
import time
import numpy as np
import pandas as pd
import traceback
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from zipfile import ZipFile
from io import BytesIO

In [2]:
console = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s [%(levelname)-8s] [%(processName)s] %(name)s: %(message)s', \
                              datefmt='%d-%b-%Y %H:%M:%S')
console.setFormatter(formatter)

logger = logging.getLogger("Python Exercises")

logger.handlers.clear()

logger.addHandler(console)

logger.setLevel(logging.INFO)

In [37]:
def get_file_paths(src, extension = None, mindepth = 1, maxdepth = float('inf')):
    rootdir = os.path.normcase(src)
    file_paths = []
    root_depth = rootdir.rstrip(os.path.sep).count(os.path.sep) - 1
    for dirpath, dirs, files in os.walk(rootdir):
        depth = dirpath.count(os.path.sep) - root_depth
        if mindepth <= depth <= maxdepth:
            for filename in files:
                if extension is None:
                    file_paths.append(os.path.join(dirpath, filename))
                if os.path.splitext(filename)[1] == extension:
                    file_paths.append(os.path.join(dirpath, filename))
        elif depth > maxdepth:
            del dirs[:] 
    return file_paths

In [13]:
def get_file_names(paths):
    urls = []
    if type(paths) is str:
        urls.append(paths)
    if type(paths) is list:
        urls = paths
    
    file_names = [url.split(os.path.sep)[-1] for url in urls]
    
    return file_names

#### Exercise 1 - Downloading Files

**Objective: -** Downlaod files using the below urls and Unzip those files

Urls:-
```
   "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2018_Q4.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q1.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q2.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q3.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q4.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2020_Q1.zip",
    "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2220_Q1.zip",
```

In [7]:
def download_and_unzip_file(src, dest):
    log = {
        "source": src,
        "destination": dest,
        "started_on": datetime.datetime.now(),
        "completed_on": None,
        "time_taken": -1,
        "status": "Failed",
        "description": "UnKnown"
    }
    start = time.perf_counter()
    # noinspection PyBroadException
    try:
        response = requests.get(src)
        ZipFile(BytesIO(response.content)).extractall(dest)
        log["status"] = "Successful".upper()
        log["description"] = "Successfully extracted files"
    except Exception:
        log["status"] = "Failed".upper()
        log["description"] = traceback.format_exc()
    
    end = time.perf_counter()
    log["time_taken"] = (end - start)
    log["completed_on"] = datetime.datetime.now()
    
    return log

In [8]:
def unzip_files(urls):
    paths = ["files/output/url_extracted/" for _ in range(len(urls))]
    with ProcessPoolExecutor() as executor:
        results = executor.map(download_and_unzip_file, urls, paths)
        for result in results:
            logger.info(result)

In [9]:
def execute_download():
    urls = ["https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2018_Q4.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q1.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q2.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q3.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q4.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2020_Q1.zip",
            "https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2220_Q1.zip"]
    unzip_files(urls)
    pass

In [10]:
# execute_download()

28-Aug-2023 13:02:19 [INFO    ] [MainProcess] Python Exercises: {'source': 'https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2018_Q4.zip', 'destination': 'files/output/url_extracted/', 'started_on': datetime.datetime(2023, 8, 28, 13, 2, 10, 946903), 'completed_on': datetime.datetime(2023, 8, 28, 13, 2, 19, 883040), 'time_taken': 8.936083748999977, 'status': 'SUCCESSFUL', 'description': 'Successfully extracted files'}
28-Aug-2023 13:02:19 [INFO    ] [MainProcess] Python Exercises: {'source': 'https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q1.zip', 'destination': 'files/output/url_extracted/', 'started_on': datetime.datetime(2023, 8, 28, 13, 2, 10, 951345), 'completed_on': datetime.datetime(2023, 8, 28, 13, 2, 13, 317890), 'time_taken': 2.3664954299999863, 'status': 'SUCCESSFUL', 'description': 'Successfully extracted files'}
28-Aug-2023 13:02:19 [INFO    ] [MainProcess] Python Exercises: {'source': 'https://divvy-tripdata.s3.amazonaws.com/Divvy_Trips_2019_Q2.zip', 'destinat

#### Exercise 2 - Scraping

**Objective: -** You need to download a file of weather data from a government website.
files that are sitting at the following specified location.

https://www.ncei.noaa.gov/data/local-climatological-data/access/2021/

You are looking for the file that was `Last Modified` on `2022-02-07 14:03`, you
can't cheat and lookup the file number yourself. You must use Python to scrape
this webpage, finding the corresponding file-name for this timestamp, `2022-02-07 14:03`

Once you have obtained the correct file, and downloaded it, you must load the file
into `Pandas` and find the record(s) with the highest `HourlyDryBulbTemperature`.
Print these record(s) to the command line.

Generally, your script should do the following ...
1. Attempt to web scrap/pull down the contents of `https://www.ncei.noaa.gov/data/local-climatological-data/access/2021/`
2. Analyze it's structure, determine how to find the corresponding file to `2022-02-07 14:03` using Python.
3. Build the `URL` required to download this file, and write the file locally.
4. Open the file with `Pandas` and find the records with the highest `HourlyDryBulbTemperature`.
5. Print this to stdout/command line/terminal.

In [None]:
def scrape_weather_page():
    url = "https://www.ncei.noaa.gov/data/local-climatological-data/access/2021/"

#### Exercise 3 - Convert JSON to CSV + Ragged Directories

**Objective: -** Your task is two use `Python` to find all the `json` files located in the `files/input/json` folder.
Once you find them all, read them with `Python` and convert them to `csv` files, to do this
you will have to flatten out some of the nested `json` data structures.

For example there is a `{"type":"Point","coordinates":[-99.9,16.88333]}` that must flattened.

Generally, your script should do the following ...
1. Crawl the `files/input/json` directory with `Python` and identify all the `json` files.
2. Load all the `json` files.
3. Flatten out the `json` data structure.
4. Write the results to a `csv` file, one for one with the json file, including the header names.

In [5]:
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [77]:
def process_json_to_csv(src, dest):
    log = {
        "source": src,
        "destination": dest,
        "started_on": datetime.datetime.now(),
        "completed_on": None,
        "time_taken": -1,
        "status": "Failed",
        "description": "UnKnown"
    }
    start = time.perf_counter()
    # noinspection PyBroadException
    try:
        # code to read JSON, flatten the JSON data, and write as CSV
        with open(os.path.normcase(src)) as json_file:
            json_data = json.load(json_file)
            futures = []
            with ThreadPoolExecutor() as executor:
                for s_json_data in json_data:
                    futures.append(executor.submit(flatten_json, s_json_data))
                results = [r.result() for r in concurrent.futures.as_completed(futures)]
                with open(dest, 'w', newline='\n', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=results[0].keys())
                    writer.writeheader()
                    writer.writerows(results)
        log["status"] = "Successful".upper()
        log["description"] = "Successfully converted to CSV from JSON"
    except Exception:
        log["status"] = "Failed".upper()
        log["description"] = traceback.format_exc()
    
    end = time.perf_counter()
    log["time_taken"] = (end - start)
    log["completed_on"] = datetime.datetime.now()
    
    return log

In [75]:
def convert_json_to_csv():
    file_paths = get_file_paths("files/input/jsons", extension=".json")
    #file_names = get_file_names(file_paths)
    output_file_paths = ["files/output/csvs/" + file_name + ".csv" for file_name in get_file_names(file_paths)]
    with ProcessPoolExecutor() as executor:
        results = executor.map(process_json_to_csv, file_paths, output_file_paths)
        for result in results:
            logger.info(result)

In [78]:
convert_json_to_csv()

<class 'dict'><class 'dict'>  22

<class 'dict'> 2


#### Exercise 4 - Data Modeling

**Objective: -** There are also
3 `csv` files located in 'files/input/csvs' folder. Open each one and examine it, the 
first task is to create a `sql` script with the `DDL` to hold
a `CREATE` statement for each data file. Remember to think about data types. 
Also, this `CREATE` statements should include indexes for each table, as well
as primary and foreign keys.

After you have finished this `sql` scripts, we must connect to `Postgres` using the `Python` package
called `psycopg2`. Once connected we will run our `sql` scripts against the database.

Finally, we will use `psycopg2` to insert the data in each `csv` file into the table you created.

Generally, your script should do the following ...
1. Examine each `csv` file in `files/input/csvs` folder. Design a `CREATE` statement for each file.
2. Ensure you have indexes, primary and forgein keys.
3. Use `psycopg2` to connect to `Postgres` on `localhost` and the default `port`.
4. Create the tables against the database.
5. Ingest the `csv` files into the tables you created, also using `psycopg2`.

In [None]:
# inspect data types for the given files
def inspect_data_types():
    pass

In [None]:
CREATE TABLE abc (
    id bigint
)

In [None]:
def write_data_to_db():
    pass

#### Exercise 5 - Numpy - Analyzing Exam Scores

**Objective: -** Important NumPy topics i.e. array creation, broadcasting, statistical calculations, and indexing.

In this exercise:

1. create a NumPy array of exam scores.
1. calculate the mean and standard deviation of the scores.
1. use boolean indexing to identify students who scored above the mean.
1. calculate the percentage of students who scored above the mean.
1. replace the lowest score with the mean score.
1. calculate the score range using the difference between the maximum and minimum scores.

In [5]:
def exam_score_analysis():
    # create a array
    scores = np.array([85, 92, 78, 88, 95, 90, 82, 70, 92, 87])
    
    # 1. Calculate the mean and standard deviation of the scores
    mean_score = np.mean(scores)
    std_deviation = np.std(scores)
    
    # 2. Identify students who scored above the mean score
    above_mean = scores > mean_score

    # 3. Calculate the percentage of students who scored above the mean
    above_mean_percentage = np.mean(above_mean) * 100

    # 4. Replace the lowest score with the mean score
    lowest_index = np.argmin(scores)
    scores[lowest_index] = mean_score

    # 5. Calculate the score range (maximum - minimum)
    score_range = np.max(scores) - np.min(scores)
    
    logger.info("Mean Score: %f", mean_score)
    logger.info("Standard Deviation:  %f", std_deviation)
    logger.info("Percentage Above Mean:  %f", above_mean_percentage)
    logger.info("Updated Scores:  %s", scores)
    logger.info("Score Range:  %f", score_range)

In [11]:
# exam_score_analysis()

28-Aug-2023 13:02:35 [INFO    ] [MainProcess] Python Exercises: Mean Score: 85.900000
28-Aug-2023 13:02:35 [INFO    ] [MainProcess] Python Exercises: Standard Deviation:  7.147727
28-Aug-2023 13:02:35 [INFO    ] [MainProcess] Python Exercises: Percentage Above Mean:  60.000000
28-Aug-2023 13:02:35 [INFO    ] [MainProcess] Python Exercises: Updated Scores:  [85 92 78 88 95 90 82 85 92 87]
28-Aug-2023 13:02:35 [INFO    ] [MainProcess] Python Exercises: Score Range:  17.000000


#### Exercise 6 - Pandas

**Objective: -** 

- DataFrame Basic Properties
    1. No. of observations
    2. No. of variables(columns)
    3. No. of missing values
- Cleaning Data
    4. Rename columns having space with `_`
    5. Create Columns
        1. 
    6. Remove Column
    7. Treat Missing Data
- Filtering Data
    8.  
- Calculating From Data 
    9. 
- Grouping Data
    10. 
- Others

In [None]:
def data_analysis():
    # Create Dataframe
    df = pd.read_csv("files/input/csvs/sales/*")
    
    logger.info(df.info())

    pass

#### Exercise 7 - Data Visualization

**Objective: -** 