# Synthetic Log File Generator

This program will generate a synthetic log file 


## Functions Related to Random Number Generation

In [1]:
from functools import partial 
import numpy as np
import random
from scipy.stats import truncnorm
import matplotlib.pyplot as plt

# Create a group of random number with size count
def randints(count , *randint_args):
    ri= partial(random.randint, *randint_args)
    return [ri() for _ in range(count)]


# Get the cdf (cumulative distribution function) of n event and select one of them based on their probabilities
def random_item_selection(cdf):
    x = random.uniform(0, 1)
    i=0
    while i<len(cdf):
        if cdf[i]>x:
            break
        i+=1
    return i
        

# Create a random integer number based on a guassian mixture distribution 
#  - gaussian_selection_probability determines the probability of selection each gaussian distribution
#  - mu and sd are array of means and standard deviation of each gaussian distribution
#  - positive is a True/False value which determines if the result should be positive or not
def gaussian_mixture_randint(mu,sd,gaussian_selection_probability,positive):
    i=random_item_selection(gaussian_selection_probability)    
    result= int(np.random.normal(mu[i], sd[i]))
    
    if positive and result<=0:
        return 1
    else:    
        return result
    
# Assign probability to n different variable (satisfying sum-to-one and positivity condition) and return the cdf 
def random_probablity(n):
    list_of_random_float=np.random.random(n)
    sum_value=list_of_random_float.sum()
    normalized_values=list_of_random_float/sum_value
    for i in range(1,len(normalized_values)):
        normalized_values[i]=normalized_values[i]+normalized_values[i-1]
    return normalized_values

 
def truncated_normal_rand(mu,sd,a,b):
    # Calculate the normalized lower and upper bounds
    lower = (a - mu) / sd
    upper = (b - mu) / sd

    # Generate a random variable based on truncated normal distribution
    random_var = truncnorm.rvs(lower, upper, loc=mu, scale=sd)
    
    return random_var

def plot_truncated_normal(mu,sd,a,b,plt_title):
    # Calculate the normalized lower and upper bounds
    lower = (a - mu) / sd
    upper = (b - mu) / sd 
    
    # Create a truncated normal distribution object
    dist = truncnorm(lower, upper, loc=mu, scale=sd)

    # Generate a range of x values to plot
    x = np.linspace(a, b, 100)

    # Plot the truncated normal distribution
    plt.plot(x, dist.pdf(x))

    # Add axis labels and a title to the plot
    plt.xlabel('x')
    plt.ylabel('PDF')
    plt.title(plt_title)

    # Show the plot
    plt.show()


## Functions for Pages Usability Features and Average Completion Time Assignment
Assign usability features randomly to each page in the sitemap, based on their predetermined distributions.
- In this program time unit is hour 
- For usability feature a truncate normal distribution will be used that for most of them the value of variable is truncated between 0 and 100 


In [2]:
def plot_distribution_of_usability_feature():
    plot_truncated_normal(50,10,0,100,'FLEXIBILITY')
    plot_truncated_normal(50,10,0,100,'USER EXPERIENCE')
    plot_truncated_normal(70,10,0,100,'LEGIBILITY')
    plot_truncated_normal(45,10,0,100,'GROUPING BY FORMAT')
    plot_truncated_normal(75,10,0,100,'GROUPING BY LOCATION')
    plot_truncated_normal(50,10,0,100,'SIGNIFICANCE OF CODE')
    plot_truncated_normal(60,10,0,100,'CONSISTENCY')
    plot_truncated_normal(50,10,0,100,'IMMEDIATE FEEDBACK')
    plot_truncated_normal(60,10,0,100,'EXPLICIT USER ACTION')
    plot_truncated_normal(80,10,0,100,'USER CONTROL')
    plot_truncated_normal(80,10,0,100,'PROMPTING')
    plot_truncated_normal(80,10,0,100,'ERROR PROTECTION')
    plot_truncated_normal(90,10,0,100,'QUALITY OF ERROR MESSAGE') 
    plot_truncated_normal(80,10,0,100,'ERROR CORRECTION')
    plot_truncated_normal(50,20,0,100,'INFORMATION DENSITY')
    plot_truncated_normal(75,10,0,100,'MINIMAL ACTION')
    plot_truncated_normal(80,10,0,100,'CONCISENESS')
    plot_truncated_normal(3,5,1,20,'Time')

def assign_page_featurs():

    #Random variable assignment for each variable in the highest level of BN 
    
    flexibility = truncated_normal_rand(50,10,0,100)
    user_experience = truncated_normal_rand(50,10,0,100)
    legibility = truncated_normal_rand(70,10,0,100)
    grouping_by_format = truncated_normal_rand(45,10,0,100)
    grouping_by_location = truncated_normal_rand(75,10,0,100)
    significance_of_code = truncated_normal_rand(50,10,0,100)
    consistency = truncated_normal_rand(60,10,0,100)
    immediate_feedback = truncated_normal_rand(50,10,0,100)
    explicit_user_action = truncated_normal_rand(60,10,0,100)
    user_control = truncated_normal_rand(80,10,0,100)
    prompting  = truncated_normal_rand(80,10,0,100)
    error_protection  = truncated_normal_rand(80,10,0,100)
    quality_of_error_message  = truncated_normal_rand(90,10,0,100)
    error_correction  = truncated_normal_rand(80,10,0,100)
    information_density  = truncated_normal_rand(50,20,0,100)
    minimal_action  = truncated_normal_rand(75,10,0,100)
    conciseness  = truncated_normal_rand(80,10,0,100)
    time = truncated_normal_rand(3,5,1,20)*0.016
    
    page_features=[flexibility,user_experience,legibility,grouping_by_format,grouping_by_location,significance_of_code,
                   consistency,immediate_feedback,explicit_user_action,user_control,prompting,error_protection,
                   quality_of_error_message,error_correction,information_density,minimal_action,conciseness,time]
    return page_features


def setup_website_featurs(pageList):
    web_feature={}
    for p in pageList:
        web_feature[p]=assign_page_featurs()
    return web_feature
        


# Fuction for determining wrong path
This functions determine the wrong path should be added to the path if an incorrect path is selected by user.
visited_wrong_pages will takes the structure of website, current page and the corrected page and will go through a wong path if there is any and continue this path until it finally halt or back to the correct path. 

- The probability of halt will increased based on the length of the path in wrong direction and based on usability features of web page.

- The probability of back to the correct page will impacted based on the length of path in wrong direction and usability features of web page.

- The probability of continueing wrong path or halting will decrease based on users' history of usage.

The function action_in_wrong_path will determine the if it should continue the wrong path. back to the previous page or halt based on the above features. 

In [3]:
def action_probability_in_wrong_path(page, path_len, user):
    # path_len: toole masiri ast ke dar masir eshtebah tey shode.
    
    # baraye nazdiki be vagheiat bayad ehtemal ra tori dar nazar begirim ke ba afzayeshe toole masir ehtemal halt afzayes peyda konad 
    # dar avaye masir ehtemal back bishtar bashad
        # r=select a random uniform number
        # if r <1/3 :
        #    state ="continueError"
        # elif r<2/3
        #    state = "backToCorrect"
        # else 
        # state= "halt"
    return state

def visited_wrong_pages(sitemap,sourcepage,correctpage, user):
    time=0
    logpart_df=pd.Dataframe()
    logpart_df.columns= ['time','visited_page']
    in_error=1
    page =#select a page that follows sourcepage and is not correct page
    logpart_df = df.append({'time':time,'visited_page':page})
    path_len=1
    while in_error:
        state=action_probability_in_wrong_path(page, path_len)
        if state=="continueError":
            page= #select a page that connects to the previous page in site map
            if page == sourcepage :
                in_error=0
            logpart_df = df.append({'time':page_time+variation_time,'visited_page':page})
        elif state = "backToCorrect":
            reverse_viseted = logpart_df(visited_page).reverse
            for page in reverse_viseted
            #reversily add all pages in the logpart_df
            
            in_error=0
            break;
        else
            
    
    

SyntaxError: invalid syntax (2781841427.py, line 20)

## 1. Set Initial Values 

In [4]:

# Set the mean and standard deviation for the number of roles
mu_role_number = 10
sd_role_number = 3


# Set the parameters of gaussian mixture model
mu=[2,50,1000]
sd=[5,20,500]
p=[0.2,0.8,1]

# Set the number of tasks
mu_n_task=30
sd_n_task=10

# Set the mu and sd of number of pages in each task
mu_pages=7
sd_pages=3

# Set the starting time 
pre_time=0

#Set number of session for collecting log file
n_session=2

# Set the mu and sd of number of task performance in each session
mu_task_per_session=1
sd_task_per_session=2
min_task_per_session=0
max_task_per_session=20

# Set the exponential distribution parameter for determiing the time between entrance of users to the system.
lambda_time=2



## 2. Generate Random Sitemap, Task Models and Page Names

Generate a random sitemap based on number of tasks.

In [5]:
from Sitemap import SiteMap
import numpy as np
import random

# Determine number of tasks
n_task= int(np.random.normal(mu_n_task, sd_n_task))
if n_task <=0:
    n_task=1
    
# Generate a random site map based on number of tasks and distribution of number of pages in each task
sitemap=SiteMap()
sitemap.generate_random_sitemap(n_task,mu_pages,sd_pages)

# Get the list of all pages in website
pageList= sitemap.get_all_pages()

# Get total number of pages 
n_page= sitemap.get_num_pages()

# Get the list of pages in each task
taskList=sitemap.get_task_list()

# Get the usability feature of each page in the web site
#web_feature=setup_website_featurs(pageList)
#plot_distribution_of_usability_feature()

# 3. System's Roles Set up

Randomly select the number of roles, the number of users associated with each role, and the activity frequency of each role in the system, with predefined distributions.
- select number of role based on mu_role_number and sd_role_number
- select number of users per role based on mu_user_number and sd_user_number
- select the activity frequency of each role randomly
- Determine the distribution of performing each task by each role

In [6]:
from functools import partial 


# Generate a random number of roles
num_roles = int(np.random.normal(mu_role_number, sd_role_number))
if num_roles <=0:
    num_roles=1


# Generate a random number of users per role based on a gaussian mixture model
num_user_in_task=[0]*num_roles
for i in range(num_roles):
    num_user_in_task[i]=gaussian_mixture_randint(mu,sd,p,True)

# Initialize history of usage for each user which include the number of performing each task by each user and the last date of usage
n_user=sum(num_user_in_task)
task_performance_history = [[("", 0) for _ in range(n_task)] for _ in range(n_user)]
visited_page_history = [[("", 0) for _ in range(n_page)] for _ in range(n_user)] 

# Generate a random activity frequency for each role
role_activity_frequencies = random_probablity(num_roles)

# Print the results
#print(f"Number of roles: {num_roles}")
#print(f"Number of users per role: {num_user}")
#print(f"Activity frequencies: {role_activity_frequencies}")

# Determine the distribution of performing each task by each role
task_distribution=[]
for i in range(num_roles):
    task_distribution.append(random_probablity(n_task))

## 7.	Role,User, Time and Task Configuration
Randomly select a role based on its activity frequency in the system.
Randomly select a user from the users assigned to the selected role.
Randomly select a task for the user, based on the distribution of task performance for the user's role.

The Exponential Distribution is the time between events in a Poisson process. Simply, it is an inverse of Poisson. If the number of occurrences follows a Poisson distribution, the lapse of time between these events is distributed exponentially. It is used to model items with a constant failure rate. (the Poisson distribution deals with the number of occurrences in a fixed period of time, and the exponential distribution deals with the time between occurrences of successive events as time flows by continuously.)  The distribution has one parameter, λ which is assumed to be the average rate of arrivals or occurrences of an event in a given time interval.

In [7]:
import pandas as pd

columns= ['user_id','time','visited_page','task_id']
log_df = pd.DataFrame(columns=columns)

for se in range(n_session):
    #===print("session:",se)
    # determine number of task in current session
    n_task_in_session= int(truncated_normal_rand(mu_task_per_session,sd_task_per_session,min_task_per_session,max_task_per_session))
    
    # Generate a random time between arrivals using the exponential distribution
    time_between_arrivals = random.expovariate(lambda_time)
    start_time=pre_time+time_between_arrivals
    pre_time= start_time

    # Randomly select a role based on its activity frequency in the system.
    selected_role=random_item_selection(role_activity_frequencies)
     #print(task_distribution[selected_role])

    # Determine user indexes for each role
    user_index=[0]*(num_roles+1)
    for i in range(num_roles):
        user_index[i+1]=user_index[i]+num_user_in_task[i]

    # Randomly select a user in detemined role
    selected_userid= random.randint(user_index[selected_role],user_index[selected_role+1])
    
    for t in range(n_task_in_session):
        #===print ("    task:",t)
        # Randomly select a task for the user, based on the distribution of task performance for the user's role
        selected_task=random_item_selection(task_distribution[selected_role])
        task=taskList[selected_task]
        
        #Set the time of requesting first page in the current task
        action_time=start_time
        
        i=0
        while (i<len(task)):
            
            # Determine the visited page
            page=task[i]
            
            # Record the user action in log file
            log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
                        
            # Determine the probability of user action based on history of usage and page features and number of visited this page
            # -- This part later will be replaced by taking a random value from each variable final distribution in the BN
            correct_path = truncated_normal_rand(50,30,0,100)
            back=truncated_normal_rand(50,30,0,100)
            loop=truncated_normal_rand(50,30,0,100)
            wrong_path=truncated_normal_rand(50,30,0,100)
            incomplete = truncated_normal_rand(50,30,0,100)
            time=truncated_normal_rand(3,8,0,20)
            
            # update next action time and history of visited current page
            pageIndex= pageList.index(page)
            previous_viseted=visited_page_history[selected_userid][pageIndex][1]
            visited_page_history[selected_userid][pageIndex]=(action_time,previous_viseted+1)
            action_time+=time
            
            # This assignment is for readability
            CORRECT_PATH=0
            BACK=1
            LOOP=2
            WRONG_PATH=3
            INCOMPLETE=4
            
            # Select users final action
            actions=[correct_path,back,loop,wrong_path,incomplete]
            sorted_actions_prob = sorted(actions, reverse=True)

            # Select the max probability
            max_action_prob = sorted_actions_prob[0]
            selected_action= actions.index(max_action_prob)
            
            if selected_action==BACK and i==0:
                # Select the Second max
                second_max_action_prob = sorted_actions_prob[1]
                selected_action= actions.index(second_max_action_prob)
            
            #===print("          page ",page, " action= ",selected_action)
            if selected_action==CORRECT_PATH:
                i+=1
            elif selected_action==BACK:
                i-=1
            elif selected_action==LOOP:
                pass
            elif selected_action==INCOMPLETE:
                break;
            elif selected_action==WRONG_PATH:
                print("Wrong patg should be completed")
                #visited_wrong_pages(sitemap,sourcepage,correctpage, user,visited_page_history)
      
        # update history of task performance        
        performance_num=task_performance_history[selected_userid][selected_task][1]
        task_performance_history[selected_userid][selected_task]=(start_time,performance_num+1)
        start_time=action_time


print(log_df)


  user_id       time visited_page task_id
0     860   0.195495   INZDMBEDnV      17
1    2578   0.391745   INZDMBEDnV      20
2    2578   1.736696   INZDMBEDnV       4
3    2578  13.654735   INZDMBEDnV      17
4    2578  27.007872   UJVQddTmJF      17
5    2578  30.983583   INZDMBEDnV      17
6    2578  38.263917   INZDMBEDnV      17
7    2578  43.985090   INZDMBEDnV      22


  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_userid,'time':action_time,'visited_page':page,'task_id':selected_task}, ignore_index=True)
  log_df = log_df.append({'user_id':selected_use

## 8. Determine the value of Bayesian Nodes
Feed the user and page information into the Bayesian network to determine the next move of the user

we will replace this part by randomly assign a value to the final nodes in the Bayesian network. these values later will be determined by BN. 



In [85]:
print(taskList[3])
print(taskList[30])

['udyniCVRqg', 'agjHkQfebU', 'zws5KMibK5', 'bNohKDB3ba', 'UoWcPuIwPE', '0D3jR6OCPG', '106E654UYE', 'ls5T596tvo']
['udyniCVRqg', 'ElGelSuyr3', 'FNV8nWxmt1', '6umKDMhijS', 'TYQabuFYFb', 'qq1RPqNSYy', 'X0M0ZClEkE', 'O2paKUXFLh', 'phX8yGbZMJ']


## 10. Create Dataframe:
- Add each row to the dataframe
- Sort by time
- replace user id by ip
- replace page name with URLs
- write into a file