In [2]:
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml

Load data

In [3]:
def load_yaml_data(folder_path):
    constructs_list = []
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})
                constructs_list.extend(constructs.values())

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    hypotheses_data.append({'cause': cause, 'effect': effect})

    # Remove duplicates from constructs list
    constructs_list = list(set(constructs_list))

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return constructs_list, hypotheses_df

In [4]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
constructs, hypotheses_df = load_yaml_data(folder_path)

In [5]:
synonyms_df = pd.read_pickle("../resources/synonyms.pkl") 
synonyms_df

Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym,identifier
0,Infrastructure Human Resource Globalization,infrastructure human resource globalization,,,1
1,Change Strategy *Reconfigure Existing Resource...,change strategy reconfigure existing resources...,,,2
2,Organizational Performance,organizational performance,,organizational readiness,3
3,FirmPerf,firmperf,,,4
4,Usage Intention,usage intention,,,5
...,...,...,...,...,...
614,Negative Feedback,negative feedback,,,477
615,Information Technology Application *Codify Kno...,information technology application codify know...,,,478
616,Prior domain knowledge,prior domain knowledge,,explicit information expectations,124
617,Competencies,competencies,Competence,,479


In [6]:
identifier_df = synonyms_df.groupby('identifier').agg({
    'construct': lambda x: list(set(x)),  # List of unique constructs per group
}).reset_index()

with pd.option_context('display.max_colwidth', None):
  display(identifier_df)

Unnamed: 0,identifier,construct
0,1,[Infrastructure Human Resource Globalization]
1,2,[Change Strategy *Reconfigure Existing Resources *Acquire and Reconfigure Resources *Acquire Resources without Configuring *Business as Usual]
2,3,"[Action readiness, Organizational Readiness, Organizational Performance, Environmental Performance, Social Performance]"
3,4,[FirmPerf]
4,5,[Usage Intention]
...,...,...
474,476,[Telecom]
475,477,[Negative Feedback]
476,478,[Information Technology Application *Codify Knowledge *Create Network]
477,479,[Competencies]


In [7]:
type(identifier_df['construct'].loc[identifier_df.index[0]])

list

In [8]:
# Replace empty strings with NaN
hypotheses_df = hypotheses_df.replace('', pd.NA)

# Remove rows where either 'cause' or 'effect' is NaN (which now includes original empty strings)
hypotheses_df = hypotheses_df.dropna(subset=['cause', 'effect'])

In [9]:
# Explode the 'construct' list into separate rows
exploded_identifiers_df = identifier_df.explode('construct')

# Create a dictionary for mapping constructs to identifiers
construct_to_identifier = dict(zip(exploded_identifiers_df['construct'], exploded_identifiers_df['identifier']))

# Define a mapping function
def map_construct_to_identifier(construct):
    return construct_to_identifier.get(construct, None) # Returns None if construct not found

# Apply the mapping function to the 'cause' and 'effect' columns
hypotheses_df['cause_identifier'] = hypotheses_df['cause'].map(map_construct_to_identifier)
hypotheses_df['effect_identifier'] = hypotheses_df['effect'].map(map_construct_to_identifier)

In [10]:
hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
0,IT Mindfulness,Alertness to Distinction,394,74
1,IT Mindfulness,Awareness of Multiple Perspectives,394,473
2,IT Mindfulness,Openness to Novelty,394,397
3,IT Mindfulness,Orientation to the Present,394,267
4,IT Mindfulness,Continuance Intention,394,300
...,...,...,...,...
576,Perceived Usefulness,Attitude,304,241
577,Perceived Usefulness,Intention to Use,304,451
578,Perceived Ease-of-use,Attitude,372,241
579,Attitude,Intention to Use,241,451


In [11]:
nan_count = hypotheses_df.isna().any(axis=1).sum()
nan_count

0

In [12]:
"""
hypotheses_df                   A -> B
first_degree_hypotheses_df      B -> C
.
.
.
nth_degree_hypotheses_df        n -> m

repeat until the number of rows stablilizes (does not change / reduce any further)
"""

'\nhypotheses_df                   A -> B\nfirst_degree_hypotheses_df      B -> C\n.\n.\n.\nnth_degree_hypotheses_df        n -> m\n\nrepeat until the number of rows stablilizes (does not change / reduce any further)\n'

In [13]:
# Identify all unique effect_identifier values
effect_identifiers = set(hypotheses_df['effect_identifier'])

# Filter the hypotheses_df to include rows where the cause_identifier is in effect_identifiers
first_degree_hypotheses_df = hypotheses_df[hypotheses_df['cause_identifier'].isin(effect_identifiers)]
first_degree_hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
16,Perceived Ease of Use,Perceived Usefulness,275,304
17,Perceived Usefulness,Attitude Towards Using,304,179
18,Perceived Usefulness,Behavioral Intention to Use,304,179
19,Perceived Ease of Use,Attitude Towards Using,275,179
20,Attitude Towards Using,Behavioral Intention to Use,179,179
...,...,...,...,...
576,Perceived Usefulness,Attitude,304,241
577,Perceived Usefulness,Intention to Use,304,451
578,Perceived Ease-of-use,Attitude,372,241
579,Attitude,Intention to Use,241,451


In [14]:
# Identify all unique effect_identifier values
effect_identifiers = set(first_degree_hypotheses_df['effect_identifier'])

# Filter the hypotheses_df to include rows where the cause_identifier is in effect_identifiers
second_degree_hypotheses_df = first_degree_hypotheses_df[first_degree_hypotheses_df['cause_identifier'].isin(effect_identifiers)]
second_degree_hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
17,Perceived Usefulness,Attitude Towards Using,304,179
18,Perceived Usefulness,Behavioral Intention to Use,304,179
20,Attitude Towards Using,Behavioral Intention to Use,179,179
21,Behavioral Intention to Use,Actual System Use,179,147
27,Credibility,Price Premiums,91,469
...,...,...,...,...
572,Attitude towards website R2=0.74/R2=0.75,Future intention to visit R2=0.68/R2=0.69,115,353
576,Perceived Usefulness,Attitude,304,241
577,Perceived Usefulness,Intention to Use,304,451
579,Attitude,Intention to Use,241,451


In [15]:
# Identify all unique effect_identifier values
effect_identifiers = set(second_degree_hypotheses_df['effect_identifier'])

# Filter the hypotheses_df to include rows where the cause_identifier is in effect_identifiers
third_degree_hypotheses_df = second_degree_hypotheses_df[second_degree_hypotheses_df['cause_identifier'].isin(effect_identifiers)]
third_degree_hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
20,Attitude Towards Using,Behavioral Intention to Use,179,179
21,Behavioral Intention to Use,Actual System Use,179,147
27,Credibility,Price Premiums,91,469
38,Perceived accuracy (R2=0.461),Intentions to use RAs(R2=0.270),120,384
39,Perceived efford (R2=0.290),Intentions to use RAs(R2=0.270),365,384
...,...,...,...,...
570,Perceived usefulness R2=0.55/R2=0.57,Attitude towards website R2=0.74/R2=0.75,120,115
571,Perceived usefulness R2=0.55/R2=0.57,Future intention to visit R2=0.68/R2=0.69,120,353
572,Attitude towards website R2=0.74/R2=0.75,Future intention to visit R2=0.68/R2=0.69,115,353
579,Attitude,Intention to Use,241,451


In [16]:
# Identify all unique effect_identifier values
effect_identifiers = set(third_degree_hypotheses_df['effect_identifier'])

# Filter the hypotheses_df to include rows where the cause_identifier is in effect_identifiers
fourth_degree_hypotheses_df = third_degree_hypotheses_df[third_degree_hypotheses_df['cause_identifier'].isin(effect_identifiers)]
fourth_degree_hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
20,Attitude Towards Using,Behavioral Intention to Use,179,179
21,Behavioral Intention to Use,Actual System Use,179,147
27,Credibility,Price Premiums,91,469
62,Perceived System Performance,Perceived Usefullness,411,44
63,Perceived System Performance,Percieved Ease of Use,411,338
...,...,...,...,...
566,CSE,cse-C,20,20
567,Task-Oriented Communication During Action Epis...,Performance: - Effectiveness - Efficiency,62,130
572,Attitude towards website R2=0.74/R2=0.75,Future intention to visit R2=0.68/R2=0.69,115,353
579,Attitude,Intention to Use,241,451


In [17]:
# Identify all unique effect_identifier values
effect_identifiers = set(fourth_degree_hypotheses_df['effect_identifier'])

# Filter the hypotheses_df to include rows where the cause_identifier is in effect_identifiers
fifth_degree_hypotheses_df = fourth_degree_hypotheses_df[fourth_degree_hypotheses_df['cause_identifier'].isin(effect_identifiers)]
fifth_degree_hypotheses_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
20,Attitude Towards Using,Behavioral Intention to Use,179,179
21,Behavioral Intention to Use,Actual System Use,179,147
27,Credibility,Price Premiums,91,469
62,Perceived System Performance,Perceived Usefullness,411,44
63,Perceived System Performance,Percieved Ease of Use,411,338
...,...,...,...,...
565,CSE,cse-B,20,367
566,CSE,cse-C,20,20
567,Task-Oriented Communication During Action Epis...,Performance: - Effectiveness - Efficiency,62,130
579,Attitude,Intention to Use,241,451


In [18]:
fifth_degree_hypotheses_df.loc[fifth_degree_hypotheses_df['cause_identifier'] == 338]

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
66,Percieved Ease of Use,Perceived Usefullness,338,44
67,Percieved Ease of Use,Behavioral Intention,338,140


In [19]:
# Check the difference
difference_df = pd.merge(fourth_degree_hypotheses_df, fifth_degree_hypotheses_df, how='outer', indicator=True).loc[lambda x : x['_merge']=='left_only']

# Drop the merge indicator column
difference_df = difference_df.drop(columns=['_merge'])
difference_df

Unnamed: 0,cause,effect,cause_identifier,effect_identifier
90,4 Customer Value,Organizational Performance,189,3
169,Attitude towards website R2=0.74/R2=0.75,Future intention to visit R2=0.68/R2=0.69,115,353


Create a chain of Cause and Effect:
1. Start from the First DataFrame: Begin with a 'cause' from the first dataframe.

2. Iteratively Trace Paths Through Subsequent DataFrames: For each 'effect' in a dataframe, find it as a 'cause' in the next dataframe. Follow each possible path through all four dataframes.

3. Record Each Unique Path as a Separate Chain: As you trace through the dataframes, store each unique chain of cause and effect.

4. Create the New DataFrame with Individual Rows for Each Chain: Each unique chain will form a row in the new dataframe.

In [20]:
def trace_chain(current_chain, current_df_index, dataframes, all_chains):
    if current_df_index >= len(dataframes):
        all_chains.append(current_chain)  # Add the completed chain to all_chains
        return

    current_value = current_chain[-1]  # Get the last element in the current chain
    next_df = dataframes[current_df_index]

    # Find the next steps in the chain
    next_steps = next_df[next_df['cause_identifier'] == current_value]['effect_identifier'].tolist()

    if not next_steps:  # If there are no next steps, add the current chain to all_chains
        all_chains.append(current_chain)
    else:
        for next_step in next_steps:
            # Continue building the chain for each next step
            trace_chain(current_chain + [next_step], current_df_index + 1, dataframes, all_chains)

# Initialize dataframes
dataframes = [hypotheses_df, first_degree_hypotheses_df, second_degree_hypotheses_df, third_degree_hypotheses_df, fourth_degree_hypotheses_df, fifth_degree_hypotheses_df]

# Start the tracing process and store all chains
all_chains = []
for starting_point in hypotheses_df['cause_identifier'].unique():
    trace_chain([starting_point], 1, dataframes, all_chains)

# Convert the chains to a new dataframe
chain_df = pd.DataFrame({'Chain': all_chains})


In [21]:
def build_chains(current_chain, next_df_index, dataframes):
    if next_df_index == len(dataframes):
        return [current_chain]  # Return the completed chain

    chains = []
    last_effect = current_chain[-1]
    next_df = dataframes[next_df_index]

    # Find where the last effect is a cause in the next dataframe
    next_steps = next_df[next_df['cause_identifier'] == last_effect]['effect_identifier'].tolist()

    for step in next_steps:
        # Extend the chain and continue building
        extended_chain = current_chain + [step]
        chains.extend(build_chains(extended_chain, next_df_index + 1, dataframes))

    return chains

# Initialize dataframes
dataframes = [hypotheses_df, first_degree_hypotheses_df, second_degree_hypotheses_df, third_degree_hypotheses_df, fourth_degree_hypotheses_df, fifth_degree_hypotheses_df]

# Build all possible chains
all_chains = []
for starting_cause in hypotheses_df['cause_identifier'].unique():
    initial_effects = hypotheses_df[hypotheses_df['cause_identifier'] == starting_cause]['effect_identifier'].tolist()
    for effect in initial_effects:
        all_chains.extend(build_chains([starting_cause, effect], 1, dataframes))

# Convert the chains to a new dataframe
chain_df = pd.DataFrame({'Chain': all_chains})


In [22]:
chain = chain_df.drop_duplicates(subset=['Chain'])

In [23]:
def remove_loops(sequence):
    new_sequence = []
    loop_numbers = []
    last_seen = None

    for number in sequence:
        if number == last_seen:
            if number not in loop_numbers:
                loop_numbers.append(number)
        else:
            new_sequence.append(number)
            last_seen = number

    return new_sequence, loop_numbers

# Apply the function to each row in the dataframe
chain['Processed_Chain'], chain['Loop_Numbers'] = zip(*chain['Chain'].apply(remove_loops))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chain['Processed_Chain'], chain['Loop_Numbers'] = zip(*chain['Chain'].apply(remove_loops))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chain['Processed_Chain'], chain['Loop_Numbers'] = zip(*chain['Chain'].apply(remove_loops))


In [24]:
chain

Unnamed: 0,Chain,Processed_Chain,Loop_Numbers
0,"[83, 304, 179, 179, 179, 179, 179]","[83, 304, 179]",[179]
1,"[83, 304, 179, 179, 179, 179, 147]","[83, 304, 179, 147]",[179]
2,"[83, 304, 179, 179, 179, 179, 140]","[83, 304, 179, 140]",[179]
6,"[83, 275, 304, 179, 179, 179, 179]","[83, 275, 304, 179]",[179]
7,"[83, 275, 304, 179, 179, 179, 147]","[83, 275, 304, 179, 147]",[179]
...,...,...,...
7096,"[136, 20, 20, 20, 20, 20, 20]","[136, 20]",[20]
7097,"[136, 20, 20, 20, 20, 20, 367]","[136, 20, 367]",[20]
7159,"[372, 304, 179, 179, 179, 179, 179]","[372, 304, 179]",[179]
7160,"[372, 304, 179, 179, 179, 179, 147]","[372, 304, 179, 147]",[179]


In [25]:
# get unique loop numbers identifiers
distinct_loop_numbers = set()
for loop_list in chain['Loop_Numbers']:
    distinct_loop_numbers.update(loop_list)
    
distinct_loop_numbers

{15, 20, 45, 48, 60, 81, 91, 102, 143, 149, 158, 165, 167, 179, 222, 402}

In [26]:
# Create list for chain
#processed_chain = chain['Processed_Chain'].tolist()

# Remove chains that are just a single element
#filtered_chain = [lst for lst in processed_chain if len(lst) > 1]

In [27]:
# Remove chains that are just a single element
chain_filtered = chain[chain['Processed_Chain'].apply(len) > 1]
chain_filtered

Unnamed: 0,Chain,Processed_Chain,Loop_Numbers
0,"[83, 304, 179, 179, 179, 179, 179]","[83, 304, 179]",[179]
1,"[83, 304, 179, 179, 179, 179, 147]","[83, 304, 179, 147]",[179]
2,"[83, 304, 179, 179, 179, 179, 140]","[83, 304, 179, 140]",[179]
6,"[83, 275, 304, 179, 179, 179, 179]","[83, 275, 304, 179]",[179]
7,"[83, 275, 304, 179, 179, 179, 147]","[83, 275, 304, 179, 147]",[179]
...,...,...,...
7096,"[136, 20, 20, 20, 20, 20, 20]","[136, 20]",[20]
7097,"[136, 20, 20, 20, 20, 20, 367]","[136, 20, 367]",[20]
7159,"[372, 304, 179, 179, 179, 179, 179]","[372, 304, 179]",[179]
7160,"[372, 304, 179, 179, 179, 179, 147]","[372, 304, 179, 147]",[179]


In [28]:
#chain_filtered.to_pickle("../resources/cause_effect_chain.pkl")

In [29]:
#chain_filtered = pd.read_pickle("../resources/cause_effect_chain.pkl") 


Create list of indirect relationships (skipping at least 1 element)
A -> B -> C --> A -> C

In [30]:
# Function to find indirect relationships
def find_indirect_relationships(chain):
    relationships = []
    for i in range(len(chain)):
        for j in range(i + 2, len(chain)):
            indirect_cause = chain[i]
            indirect_effect = chain[j]
            distance = j - i
            relationships.append((indirect_cause, indirect_effect, distance))
    return relationships

# Process each chain to find indirect relationships
indirect_relationships = []
for processed_chain in chain_filtered['Processed_Chain']:
    indirect_relationships.extend(find_indirect_relationships(processed_chain))

# Create new dataframe
indirect_df = pd.DataFrame(indirect_relationships, columns=['indirect_cause', 'indirect_effect', 'distance'])
indirect_df

Unnamed: 0,indirect_cause,indirect_effect,distance
0,83,179,2
1,83,179,2
2,83,147,3
3,304,147,2
4,83,179,2
...,...,...,...
54874,372,147,3
54875,304,147,2
54876,372,179,2
54877,372,140,3


In [31]:
#hypotheses_df.loc[hypotheses_df['cause_identifier'] == 304]

In [32]:
# Drop duplicates
clean_indirect_df = indirect_df.drop_duplicates()
clean_indirect_df

Unnamed: 0,indirect_cause,indirect_effect,distance
0,83,179,2
2,83,147,3
3,304,147,2
5,83,140,3
6,304,140,2
...,...,...,...
54843,92,97,5
54844,92,42,6
54859,463,42,6
54870,136,351,2


In [33]:
# Sort by distance
sorted_indirect_df = clean_indirect_df.sort_values(by='distance')
sorted_indirect_df

Unnamed: 0,indirect_cause,indirect_effect,distance
53238,191,9,2
53241,191,469,2
2482,121,411,2
84,3,398,2
25,275,147,2
...,...,...,...
239,13,140,6
54844,92,42,6
209,13,44,6
54814,92,97,6


In [34]:
#Prepare mapping
identifier_to_constructs = dict(zip(identifier_df['identifier'], identifier_df['construct']))

# Mapping function
def map_identifier_to_constructs(identifier):
    return identifier_to_constructs.get(identifier, [])  # Returns an empty list if identifier not found

# Step 3: Apply the mapping function
sorted_indirect_df['indirect_cause_construct'] = sorted_indirect_df['indirect_cause'].map(map_identifier_to_constructs)
sorted_indirect_df['indirect_effect_construct'] = sorted_indirect_df['indirect_effect'].map(map_identifier_to_constructs)


In [35]:
with pd.option_context('display.max_colwidth', None):
  display(sorted_indirect_df)

Unnamed: 0,indirect_cause,indirect_effect,distance,indirect_cause_construct,indirect_effect_construct
53238,191,9,2,[Facilitating Conditions],[Expectations Disconfirmation]
53241,191,469,2,[Facilitating Conditions],[Price Premiums]
2482,121,411,2,[Abysmal credibility text comments],"[Perceived System Performance, Perceived Performance]"
84,3,398,2,"[Action readiness, Organizational Readiness, Organizational Performance, Environmental Performance, Social Performance]",[Location Disclosure on LB-SNA]
25,275,147,2,"[Perceived ease of use, Perceived Ease of Use]",[Actual System Use]
...,...,...,...,...,...
239,13,140,6,[Outstanding credibility text comments],"[Behavioral Intent, Behavioral Intention]"
54844,92,42,6,[Risk Responses with Mitigation Effects],"[Medication Adherence, Medication k, Medication i]"
209,13,44,6,[Outstanding credibility text comments],"[Perceived Usefullness, Perceived Benefit, Perceived Enjoyment]"
54814,92,97,6,[Risk Responses with Mitigation Effects],"[Preventive Behavior, Health Promoting Behavior]"


In [36]:
#sorted_indirect_df.to_pickle("../resources/indirect_relations.pkl")

In [38]:
#sorted_indirect_df = pd.read_pickle("../resources/indirect_relations.pkl") 

In [37]:
# Count the number of occurrences per distance
distance_counts = sorted_indirect_df['distance'].value_counts().sort_index()
distance_counts

distance
2     308
3     424
4     573
5     778
6    1007
Name: count, dtype: int64