In [26]:
import numpy as np
import glob
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from scipy.stats import wilcoxon
import ast

In [27]:
fileNames2D = np.sort(glob.glob("./data/zheng/processed_interactions_p4_bookmarked/*"))

In [28]:
def get_probabilities_action(dataframe, current_phase):
    actions = ['same', 'modify-1', 'modify-2', 'modify-3']

    # Initialize a dictionary with small default probabilities to avoid division by zero
    probabilities = dict.fromkeys(actions, 0.000001)

    # Count occurrences of each action
    action_counts = dataframe['Action'].value_counts()

    # Update probabilities based on counts
    for action in actions:
        if action in action_counts:
            probabilities[action] = action_counts[action] / len(dataframe)

    return probabilities[current_phase]

In [29]:
def get_probabilities(dataframe,current_phase):
    map = ['Title', 'US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget', 'Release_Date',
                           'MPAA_Rating', 'Running_Time_min', 'Distributor', 'Source', 'Major_Genre', 'Creative_Type',
                           'Director', 'Rotten_Tomatoes_Rating', 'IMDB_Rating', 'IMDB_Votes', 'None']
    dict_map = dict.fromkeys(map, 0.00001)
    for index, row in dataframe.iterrows():
        for field in ast.literal_eval(row['Attribute']):
            dict_map[field] += 1
    return dict_map[current_phase]

In [30]:
class StationarityTests:
    def __init__(self, significance=.05):
        self.SignificanceLevel = significance
        self.pValue = None
        self.isStationary = None
        self.trend= None

    def Wilcoxon_Test(self,series_X, series_Y):

        return wilcoxon(series_X, series_Y)


In [31]:

all_pvalue=[]
# for state in ['Title', 'US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget', 'Release_Date',
#                            'MPAA_Rating', 'Running_Time_min', 'Distributor', 'Source', 'Major_Genre', 'Creative_Type',
#                            'Director', 'Rotten_Tomatoes_Rating', 'IMDB_Rating', 'IMDB_Votes', 'None']:
for state in ['same','modify-1','modify-2','modify-3']:
    print(f"\nProcessing test for state: {state}")
     # Create an empty DataFrame to store test data
    test_dfs = pd.DataFrame(columns=['User', 'First_Half', 'Second_Half'])
    # Iterate through file names
    for i, u in enumerate(fileNames2D):
        # Read CSV file
        df = pd.read_csv(u)

        # Extract user information from file path


        #print(f"\nProcessing data for user: {u}")

        # Split the DataFrame into two halves
        mid = round(len(df) / 2)
        slice1 = df[:mid].reset_index(drop=True)
        slice2 = df[mid:].reset_index(drop=True)
        #print('length of slice',len(slice1), len(slice2))

        # Calculate Navigation probabilities for each half
        probab_1 = get_probabilities_action(slice1,state)
        probab_2 = get_probabilities_action(slice2,state)

        #print(f"Probabilities for {u} - First Half: {probab_1}, Second Half: {probab_2}")

        # Create a DataFrame with user, first_half, and second_half probabilities
        user_df = pd.DataFrame({'User': [u], 'First_Half': [probab_1], 'Second_Half': [probab_2]})

        # Concatenate the result to the test_dfs DataFrame for all users
        test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)

    # Perform Wilcoxon signed-rank test on the probabilities of the two halves
    stats_test = StationarityTests()
    result = stats_test.Wilcoxon_Test(test_dfs['First_Half'], test_dfs['Second_Half'])
    #print(result)
    del test_dfs

    print(f"State: {state} , Probabilities: {probab_1} {probab_2} , Users are Non-stationary {result.pvalue<0.05} with p-value {result.pvalue}")
    all_pvalue.append(result.pvalue)



Processing test for state: same
State: same , Probabilities: 0.78125 0.782608695652174 , Users are Non-stationary False with p-value 0.31723979237390443

Processing test for state: modify-1
State: modify-1 , Probabilities: 0.10625 0.12422360248447205 , Users are Non-stationary False with p-value 0.0690511620712648

Processing test for state: modify-2


  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)


State: modify-2 , Probabilities: 0.10625 0.062111801242236024 , Users are Non-stationary False with p-value 0.4886806017769275

Processing test for state: modify-3
State: modify-3 , Probabilities: 0.00625 0.031055900621118012 , Users are Non-stationary False with p-value 0.537722120370485


  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)


In [21]:
from scipy import stats
stats.false_discovery_control(all_pvalue,method='bh')

array([0.65472085, 0.65472085, 0.65472085, 0.65472085])

In [32]:
from statsmodels.stats.multitest import multipletests
multipletests(all_pvalue,alpha=0.05,method='bonferroni')

(array([False, False, False, False]),
 array([1.        , 0.27620465, 1.        , 1.        ]),
 0.012741455098566168,
 0.0125)

In [33]:
test_dfs

NameError: name 'test_dfs' is not defined