In [2]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import re


def read_csv_process_time(file):
    """Load the header-less event log and keep only its 'access' rows.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. The
            file must have exactly three columns, which are named
            'time', 'info' and 'type' in that order.

    Returns:
        A new DataFrame containing only the rows whose 'type' is 'access',
        with 'time' parsed to datetimes. The original row labels from the
        CSV are preserved (the index is not reset).

    Raises:
        ValueError: If the file does not have exactly three columns.
    """
    df = pd.read_csv(file, header=None)
    df.columns = ['time', 'info', 'type']

    # Take an explicit copy: assigning a column on a boolean-filtered slice
    # triggers SettingWithCopyWarning and leaves ownership ambiguous.
    access_events = df[df['type'] == 'access'].copy()
    access_events['time'] = pd.to_datetime(access_events['time'])

    return access_events


# Load the event log: only rows whose third column is 'access' are kept,
# and the 'time' column is parsed to datetimes (see read_csv_process_time).
data = read_csv_process_time("action_data.csv")




In [3]:
def get_base_url(url):
    """Return the scheme-and-host prefix of an http(s) URL.

    Args:
        url: The URL text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for empty cells) are accepted and yield ``None``
            so the function can be applied safely over a whole column.

    Returns:
        The leading ``http://host`` or ``https://host`` portion exactly as it
        appears in ``url`` (case is preserved; matching is case-insensitive),
        or ``None`` when ``url`` is not a string or does not start with an
        http(s) scheme followed by a host.
    """
    # re.match raises TypeError on non-strings; treat those as "no URL".
    if not isinstance(url, str):
        return None

    match = re.match(r"^(https?://[^/]+)", url, re.IGNORECASE)
    return match.group(1) if match else None
    
    
# Collapse every URL in 'info' to its scheme-and-host prefix (None when it is not an http(s) URL).
data["info"] = data["info"].apply(get_base_url)
    
    
# print(data["info"].apply(lambda x: get_base_url(x)))



In [4]:

# Label each event by comparing its site with the site of the event right
# before it. The first event is compared against an empty string.
visited = data['info'].tolist()
change_type = [
    "domain_change" if here != before else "route_change"
    for before, here in zip([""] + visited[:-1], visited)
]

data['change_type'] = change_type


# print(data["change_type"])

In [5]:
# Seconds elapsed since the previous row. The first row has no predecessor,
# so the NaN produced by diff() is replaced with 0.
data['time_delta'] = data['time'].diff().dt.total_seconds().fillna(0)

# print(data)

In [6]:

# Square table of counts with one row and one column per distinct site.
# Build it directly from the scalar 0 so the cells are integers from the
# start; creating an empty (object-dtype) frame and calling fillna(0) relies
# on implicit downcasting, which pandas has deprecated (FutureWarning).
domain_names = data['info'].unique()
transition_matrix = pd.DataFrame(0, index=domain_names, columns=domain_names)


  transition_matrix = transition_matrix.fillna(0)


In [7]:
# Count how often one site is followed by another across domain changes.
#
# Orientation: the ROW is the site being left and the COLUMN is the site
# arrived at. sample_next_site() normalises the row of the current site to
# pick the next one, so counts must be stored as [from, to].
#
# NOTE: this cell increments transition_matrix in place; re-running it without
# re-creating the matrix first will double the counts.
domain_changes = data[data['change_type'] == 'domain_change']
visited_sites = domain_changes['info'].tolist()

# Pair each visit with its successor by position. The index labels of `data`
# come from the original CSV rows, so they cannot be used to detect the first
# visit (a label test such as `i > 0` is true for the first row whenever the
# log does not start with an 'access' row).
for past_row, current_row in zip(visited_sites, visited_sites[1:]):
    transition_matrix.loc[past_row, current_row] += 1
    





In [8]:
def sample_next_site(current_site, transition_matrix, rng=None):
    """Randomly pick a site, weighted by ``current_site``'s row of the matrix.

    Args:
        current_site: Row label to look up in ``transition_matrix``.
        transition_matrix: DataFrame of non-negative counts. The selected row
            is normalised to probabilities and its column labels are the
            candidates to draw from.
        rng: Optional ``numpy.random.Generator`` used for the draw, for
            reproducible sampling. When omitted, the global ``np.random``
            state is used, as before.

    Returns:
        One of the column labels of ``transition_matrix``.

    Raises:
        KeyError: If ``current_site`` is not a row label.
        ValueError: If the row has no positive total, so no probabilities can
            be formed.
    """
    # Cast to float so normalisation works even if the frame holds objects.
    counts = transition_matrix.loc[current_site].astype(float)
    total = counts.sum()

    # A zero (or NaN) total would turn every probability into NaN and make the
    # sampler fail with an unhelpful message; report the real cause instead.
    if not total > 0:
        raise ValueError(
            f"No recorded transitions for {current_site!r}; cannot sample a next site."
        )

    chooser = np.random if rng is None else rng
    next_site = chooser.choice(counts.index.to_numpy(), p=(counts / total).to_numpy())

    return next_site

# Example: draw one site at random, weighted by the "HTTPS://CHATGPT.COM" row of the matrix.
sample_next_site("HTTPS://CHATGPT.COM", transition_matrix)


'HTTPS://DRIVE.GOOGLE.COM'

In [9]:



def domain_change_times_for_website(website, events=None):
    """List how long each visit to ``website`` lasted before the next domain change.

    Only rows labelled 'domain_change' are considered. For each such row the
    duration is the number of seconds until the following 'domain_change' row.
    The final domain change has no successor and is therefore left out.

    Args:
        website: Value of the 'info' column to report on.
        events: Optional DataFrame with 'time' (datetime), 'info' and
            'change_type' columns. Defaults to the notebook-level ``data``
            frame, which keeps existing single-argument calls working.

    Returns:
        A list of floats (seconds), one per domain change onto ``website``
        that has a later domain change after it. The input frame is not
        modified.
    """
    if events is None:
        events = data

    # Select into a fresh frame; the original assigned columns onto a filtered
    # slice, which raises SettingWithCopyWarning.
    domain_changes = events.loc[events['change_type'] == 'domain_change', ['time', 'info']]

    # Time from each domain change to the next one (NaN for the last row).
    seconds_until_next = (
        domain_changes['time'].shift(-1) - domain_changes['time']
    ).dt.total_seconds()

    # Drop only rows lacking a duration, rather than rows with a NaN anywhere.
    keep = (domain_changes['info'] == website) & seconds_until_next.notna()
    return seconds_until_next[keep].tolist()






In [10]:
def route_change_times_for_website(website):
    """Return the 'time until next event' values for the rows of ``website``.

    Every row of the notebook-level ``data`` frame is first given a
    ``time_delta`` equal to the seconds between it and the row that follows
    it (NaN for the very last row). The rows for ``website`` are then
    selected, and any 'domain_change' row that is directly followed, within
    that selection, by a 'route_change' row is discarded.

    Args:
        website: Value of the 'info' column to report on.

    Returns:
        A float Series named 'time_delta' with a fresh 0..n-1 index.
    """
    events = data.copy()

    # diff() measures the gap to the previous row; shifting up by one turns
    # it into the gap to the next row.
    seconds_since_previous = events['time'].diff().dt.total_seconds().fillna(0)
    events['time_delta'] = seconds_since_previous.shift(-1)

    site_events = events[events['info'] == website].reset_index()

    # Flag a domain-change row whose immediate successor is a route change.
    kinds = site_events['change_type']
    entry_then_route = (kinds == 'domain_change') & (kinds.shift(-1) == 'route_change')

    return site_events.loc[~entry_then_route, 'time_delta'].reset_index(drop=True)


# Example: collect the time_delta values returned for the ChatGPT rows and print them.
k = route_change_times_for_website("HTTPS://CHATGPT.COM")

print(k)






0      2001.406
1       163.685
2       611.623
3       639.407
4       169.787
5      5911.518
6      1694.049
7       286.539
8       757.774
9     16774.105
10       50.537
Name: time_delta, dtype: float64


In [17]:
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

# Toy sample used only to demonstrate KDE-based resampling.
# It is deliberately NOT called `data`: that name holds the event DataFrame
# that domain_change_times_for_website / route_change_times_for_website read,
# and rebinding it to a list would break those functions for the rest of the
# session.
kde_demo_values = [1, 2, 2.5, 3, 3.5, 4, 5, 6, 7]

# Perform Kernel Density Estimation
kde = gaussian_kde(kde_demo_values)

# Generate random samples based on KDE
num_samples = 1000
samples = kde.resample(num_samples).flatten()

# Points at which the fitted density is drawn; computed once and reused.
density_grid = np.linspace(min(kde_demo_values), max(kde_demo_values), 1000)

# Plot the original data and the KDE-based samples
plt.figure(figsize=(10, 6))
plt.hist(kde_demo_values, bins=30, alpha=0.5, label='Original Data', density=True)
plt.hist(samples, bins=30, alpha=0.5, label='KDE Samples', density=True)
plt.plot(density_grid, kde(density_grid), label='KDE', color='red')
plt.legend()
plt.title('Original Data and KDE-based Samples')
plt.show()

ModuleNotFoundError: No module named 'tensorflow'