In [1]:
import numpy as np
import glob
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from scipy.stats import wilcoxon
import ast

In [2]:
fileNames2D = np.sort(glob.glob("./data/zheng/processed_interactions/*"))[:5]

In [3]:
def get_probabilities_action(dataframe,current_phase):
    map = ['same','modify-x','modify-y','modify-z','modify-x-y','modify-y-z','modify-x-z','modify-x-y-z']
    dict_map = dict.fromkeys(map, 0.000001)
    for action in dataframe['Action']:
        dict_map[action] += 1
    return dict_map[current_phase]


In [4]:
def get_probabilities(dataframe,current_phase):
    map = ['Title', 'US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget', 'Release_Date',
                           'MPAA_Rating', 'Running_Time_min', 'Distributor', 'Source', 'Major_Genre', 'Creative_Type',
                           'Director', 'Rotten_Tomatoes_Rating', 'IMDB_Rating', 'IMDB_Votes', 'None']
    dict_map = dict.fromkeys(map, 0.00001)
    for index, row in dataframe.iterrows():
        for field in ast.literal_eval(row['Attribute']):
            dict_map[field] += 1
    return dict_map[current_phase]

In [5]:
class StationarityTests:
    def __init__(self, significance=.05):
        self.SignificanceLevel = significance
        self.pValue = None
        self.isStationary = None
        self.trend= None

    def Wilcoxon_Test(self,series_X, series_Y):

        return wilcoxon(series_X, series_Y)


In [6]:

all_pvalue=[]
for state in ['Title', 'US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget', 'Release_Date',
                           'MPAA_Rating', 'Running_Time_min', 'Distributor', 'Source', 'Major_Genre', 'Creative_Type',
                           'Director', 'Rotten_Tomatoes_Rating', 'IMDB_Rating', 'IMDB_Votes', 'None']:
    print(f"\nProcessing test for state: {state}")
     # Create an empty DataFrame to store test data
    test_dfs = pd.DataFrame(columns=['User', 'First_Half', 'Second_Half'])
    # Iterate through file names
    for i, u in enumerate(fileNames2D):
        # Read CSV file
        df = pd.read_csv(u)

        # Extract user information from file path


        #print(f"\nProcessing data for user: {u}")

        # Split the DataFrame into two halves
        mid = round(len(df) / 2)
        slice1 = df[:mid].reset_index(drop=True)
        slice2 = df[mid:].reset_index(drop=True)
        #print('length of slice',len(slice1), len(slice2))

        # Calculate Navigation probabilities for each half
        probab_1 = get_probabilities(slice1,state)
        probab_2 = get_probabilities(slice2,state)

        #print(f"Probabilities for {u} - First Half: {probab_1}, Second Half: {probab_2}")

        # Create a DataFrame with user, first_half, and second_half probabilities
        user_df = pd.DataFrame({'User': [u], 'First_Half': [probab_1], 'Second_Half': [probab_2]})

        # Concatenate the result to the test_dfs DataFrame for all users
        test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)

    # Perform Wilcoxon signed-rank test on the probabilities of the two halves
    stats_test = StationarityTests()
    result = stats_test.Wilcoxon_Test(test_dfs['First_Half'], test_dfs['Second_Half'])
    #print(result)
    del test_dfs

    print(f"State: {state} , Probabilities: {probab_1} {probab_2} , Users are Non-stationary {result.pvalue<0.05} with p-value {result.pvalue}")
    all_pvalue.append(result.pvalue)



Processing test for state: Title
State: Title , Probabilities: 3.00001 1e-05 , Users are Non-stationary False with p-value 0.5807121621890252

Processing test for state: US_Gross
State: US_Gross , Probabilities: 4.00001 6.00001 , Users are Non-stationary False with p-value 0.8125

Processing test for state: Worldwide_Gross
State: Worldwide_Gross , Probabilities: 14.00001 1.00001 , Users are Non-stationary False with p-value 0.0625

Processing test for state: US_DVD_Sales
State: US_DVD_Sales , Probabilities: 8.00001 10.00001 , Users are Non-stationary False with p-value 1.0

Processing test for state: Production_Budget
State: Production_Budget , Probabilities: 10.00001 20.00001 , Users are Non-stationary False with p-value 0.06788915486182899

Processing test for state: Release_Date
State: Release_Date , Probabilities: 24.00001 12.00001 , Users are Non-stationary False with p-value 0.8125

Processing test for state: MPAA_Rating
State: MPAA_Rating , Probabilities: 16.00001 22.00001 , Us

  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)


State: Creative_Type , Probabilities: 7.00001 7.00001 , Users are Non-stationary False with p-value 0.17971249487899976

Processing test for state: Director
State: Director , Probabilities: 1e-05 5.00001 , Users are Non-stationary False with p-value 0.0625

Processing test for state: Rotten_Tomatoes_Rating
State: Rotten_Tomatoes_Rating , Probabilities: 1e-05 1.00001 , Users are Non-stationary False with p-value 1.0

Processing test for state: IMDB_Rating
State: IMDB_Rating , Probabilities: 3.00001 1.00001 , Users are Non-stationary False with p-value 1.0

Processing test for state: IMDB_Votes
State: IMDB_Votes , Probabilities: 1.00001 1e-05 , Users are Non-stationary False with p-value 0.17971249487899976

Processing test for state: None
State: None , Probabilities: 106.00001 83.00001 , Users are Non-stationary False with p-value 0.14412703481601533


  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)
  test_dfs = pd.concat([test_dfs, user_df], ignore_index=True)


In [6]:
from scipy import stats
stats.false_discovery_control(all_pvalue,method='bh')

array([0.50192072, 0.55888545, 0.50072108, 0.50072108, 0.50072108,
       0.50072108, 0.41618218, 0.93591497])

In [112]:
from statsmodels.stats.multitest import multipletests
multipletests(all_pvalue,alpha=0.05,method='bonferroni')

(array([False,  True, False]),
 array([1.        , 0.00813675, 0.07185173]),
 0.016952427508441503,
 0.016666666666666666)

In [8]:
test_dfs

NameError: name 'test_dfs' is not defined