In [18]:
import numpy as np
import glob
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import scipy.stats as stats
from scipy.stats import wilcoxon
import ast

In [19]:
#fileNames2D = np.sort(glob.glob("./data/zheng/processed_interactions_p4/*"))
fileNames2D = np.sort(glob.glob("./data/zheng/processed_interactions_p4/*"))

In [20]:
fileNames2D

array(['./data/zheng/processed_interactions_p4/pro13_ace_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro15_ade_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro17_ace_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro18_adf_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro20_ade_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro21_ade_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro22_ade_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro23_acf_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro24_adf_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro25_acf_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro29_ace_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro31_adf_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro3_ace_p4_logs.csv',
       './data/zheng/processed_interactions_p4/pro5_adf_p4_logs.csv',
       '

In [21]:
def get_action_counts(dataframe, current_phase):
    actions = ['same', 'modify-1', 'modify-2', 'modify-3']

    # Initialize counts array with zeros for all actions
    action_counts = {action: 0 for action in actions}

    # Count occurrences of each action
    for action in actions:
        count = dataframe[dataframe['Action'] == action].shape[0]
        if count is not None:  # Update count if action exists
            action_counts[action] = count

    # Convert dictionary to list, ensuring length is exactly 4
    action_counts_list = [action_counts[action] for action in actions]

    return action_counts_list


In [22]:
class StationarityTests:
    def __init__(self, significance=.05):
        self.SignificanceLevel = significance
        self.pValue = None
        self.isStationary = None
        self.trend= None

    def Chi_Sqaure_Test(self,series_X, series_Y):

        return stats.chisquare(f_obs=series_X, f_exp=series_Y)


In [23]:
def get_user_name(url):
    parts = url.split('/')
    fname = parts[-1]
    uname = fname.rstrip('_log.csv')
    return uname

In [24]:
import pandas as pd
import numpy as np
import warnings

# Initialize an empty list to store print information
print_info = []

# Define a function to split the DataFrame into four equal parts with equal lengths
def split_dataframe(df,splits=4):
    num_rows = len(df)
    quarter = num_rows // splits
    remainder = num_rows % splits  # Calculate the remainder based on four quarters
    if remainder != 0:
        df = df[:-remainder]  # Drop the extra rows to make the length divisible by 4
    slices = []
    start = 0
    for i in range(splits):
        end = start + quarter
        slices.append(df[start:end].reset_index(drop=True))
        start = end
    return slices



# Suppress FutureWarning related to DataFrame concatenation
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    all_pvalue = []
    test_dfs = pd.DataFrame(columns=['User', 'First_Quarter', 'Second_Quarter', 'Third_Quarter', 'Fourth_Quarter', 'p-value'])
    for i, u in enumerate(fileNames2D):
        state = get_user_name(u)
        print(f"\nProcessing test for action: {state}")
        df = pd.read_csv(u)
        slices = split_dataframe(df, 2)

        # Get counts for each slice
        probabs = []
        for j, slice_df in enumerate(slices):
            probab = get_action_counts(slice_df, state)
            print(f"Unnormalized Quarter {j + 1}: {probab}")
            probabs.append(probab)

        # Perform statistical tests
        stats_test = StationarityTests()
        results = []
        for j in range(len(probabs) - 1):
            result = stats_test.Chi_Sqaure_Test(probabs[j], probabs[j + 1])
            results.append(result)
            all_pvalue.append(result.pvalue)

       # Create DataFrame for the test results
        user_df = pd.DataFrame({'User': [u]})
        user_df = user_df.assign(**{f'Quarter_{i+1}': str(probab) for i, probab in enumerate(probabs)})
        user_df['p-value'] = results[-1].pvalue
        test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)

        test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)

        # Append the print information to the list
        print_info.append((state, probabs, [result.pvalue < 0.05 for result in results]))
        all_pvalue.append(result.pvalue)

# Create a DataFrame from the list of print information
df_print_info = pd.DataFrame(print_info, columns=['Action', 'Counts', 'Stationary'])



Processing test for action: pro13_ace_p4
Unnormalized Quarter 1: [17, 22, 3, 2]
Unnormalized Quarter 2: [29, 11, 2, 2]

Processing test for action: pro15_ade_p4
Unnormalized Quarter 1: [14, 23, 7, 0]
Unnormalized Quarter 2: [11, 20, 13, 0]

Processing test for action: pro17_ace_p4
Unnormalized Quarter 1: [54, 25, 11, 21]
Unnormalized Quarter 2: [66, 19, 13, 13]

Processing test for action: pro18_adf_p4
Unnormalized Quarter 1: [16, 17, 0, 0]
Unnormalized Quarter 2: [6, 27, 0, 0]

Processing test for action: pro20_ade_p4
Unnormalized Quarter 1: [53, 25, 19, 1]
Unnormalized Quarter 2: [50, 25, 18, 5]

Processing test for action: pro21_ade_p4
Unnormalized Quarter 1: [56, 31, 23, 10]
Unnormalized Quarter 2: [67, 19, 20, 14]

Processing test for action: pro22_ade_p4
Unnormalized Quarter 1: [36, 15, 6, 1]
Unnormalized Quarter 2: [39, 3, 9, 7]

Processing test for action: pro23_acf_p4
Unnormalized Quarter 1: [20, 15, 2, 7]
Unnormalized Quarter 2: [34, 1, 4, 5]

Processing test for action: pro

In [25]:
df_print_info

Unnamed: 0,Action,Counts,Stationary
0,pro13_ace_p4,"[[17, 22, 3, 2], [29, 11, 2, 2]]",[True]
1,pro15_ade_p4,"[[14, 23, 7, 0], [11, 20, 13, 0]]",[False]
2,pro17_ace_p4,"[[54, 25, 11, 21], [66, 19, 13, 13]]",[True]
3,pro18_adf_p4,"[[16, 17, 0, 0], [6, 27, 0, 0]]",[False]
4,pro20_ade_p4,"[[53, 25, 19, 1], [50, 25, 18, 5]]",[False]
5,pro21_ade_p4,"[[56, 31, 23, 10], [67, 19, 20, 14]]",[True]
6,pro22_ade_p4,"[[36, 15, 6, 1], [39, 3, 9, 7]]",[True]
7,pro23_acf_p4,"[[20, 15, 2, 7], [34, 1, 4, 5]]",[True]
8,pro24_adf_p4,"[[29, 7, 3, 4], [17, 11, 7, 8]]",[True]
9,pro25_acf_p4,"[[28, 15, 6, 9], [35, 10, 4, 9]]",[False]


In [26]:
# Calculate the total number of True values inside the "Stationary" arrays
total_stationary_values = df_print_info['Stationary'].apply(lambda x: sum(x)).sum()

# Calculate the total number of values in the "Stationary" arrays
total_values = df_print_info['Stationary'].apply(len).sum()

# Calculate the ratio
ratio_stationary = total_stationary_values / total_values

# Print the result
print(ratio_stationary)


0.6111111111111112


In [27]:
from statsmodels.stats.multitest import multipletests
a=multipletests(all_pvalue,alpha=0.05,method='bonferroni')

In [28]:
a[0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True, False,
       False,  True,  True, False, False, False, False, False, False])

In [29]:
#count the True result in a[0]
np.sum(a[0])/len(a[0])

0.25