In [75]:
from data_gen import generate_data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np



In [76]:
def extract_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Extract useful information from a given CloudFront DataFrame.

    The input DataFrame is not modified: the result is built from a copy of the
    selected columns, so calling this function has no side effects on the caller's
    data and the cell that invokes it can be re-run safely.

    Args:
        dataframe (pd.DataFrame): The DataFrame to extract information from. It should have columns 'date', 'time',
        'x_edge_location', 'c_ip', 'cs_method', 'sc_status', and 'cs_uri_stem'. 'date' and 'time' are expected to
        hold strings that, joined with a single space, can be parsed by ``pd.to_datetime``.

    Returns:
        pd.DataFrame: A new DataFrame containing only the useful information extracted from the input DataFrame. The
        new DataFrame has columns 'datetime', 'x_edge_location', 'c_ip', 'cs_method', 'sc_status', and 'cs_uri_stem',
        in that order.

    Raises:
        KeyError: If any of the required columns is missing from ``dataframe``.
    """
    kept_columns = ['x_edge_location', 'c_ip', 'cs_method', 'sc_status', 'cs_uri_stem']

    # Copy only the columns we keep; working on a copy means the caller's frame
    # never gains a 'datetime' column behind its back.
    extracted = dataframe.loc[:, kept_columns].copy()

    # Merge the date and time columns into a single timestamp
    timestamps = pd.to_datetime(dataframe['date'] + ' ' + dataframe['time'])

    # Place the merged timestamp as the first column of the result
    extracted.insert(0, 'datetime', timestamps)

    return extracted

In [102]:
def find_unique_addresses(dataframe: pd.DataFrame) -> np.ndarray:
    """
    Return the distinct values found in the 'c_ip' column of the given DataFrame.

    Args:
        dataframe (pd.DataFrame): A DataFrame that has a 'c_ip' column.

    Returns:
        np.ndarray: Each value of 'c_ip' exactly once, as produced by ``Series.unique()``.
    """
    client_ips = dataframe['c_ip']
    return client_ips.unique()

def count_visits_per_ip(dataframe: pd.DataFrame) -> pd.core.series.Series:
    """
    Count how many rows (visits) each IP address in the 'c_ip' column accounts for.

    Args:
        dataframe (pd.DataFrame): A DataFrame that has a 'c_ip' column.

    Returns:
        pd.Series: Indexed by IP address, holding the number of rows in which that
        address appears, as produced by ``Series.value_counts()`` with its defaults.
    """
    return dataframe['c_ip'].value_counts()

In [104]:
# Build the raw log frame with generate_data (imported from the data_gen module)
full_df = generate_data()
# Reduce it to the merged timestamp plus the selected request columns
df = extract_data(full_df)
# Distinct client IPs; stored for later use, not displayed by this cell
unique_ips = find_unique_addresses(df)
# Last expression of the cell, so the per-IP visit counts are what gets rendered
count_visits_per_ip(df)

175.250.140.142    2
208.158.157.69     2
147.67.132.25      2
88.167.246.73      2
184.183.125.210    2
                  ..
167.23.98.2        1
107.175.36.171     1
36.89.79.91        1
179.121.176.0      1
144.229.237.126    1
Name: c_ip, Length: 769, dtype: int64