In [1]:
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml

c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


Load data

In [2]:
def load_yaml_data(folder_path):
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            # Extract diagram number from file_name
            diagram_number = file_name.split('.')[0].replace('diagram', '')

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    # Add diagram number to each hypothesis
                    hypotheses_data.append({'diagram': diagram_number, 'cause': cause, 'effect': effect})

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return hypotheses_df

In [3]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
hypotheses_df = load_yaml_data(folder_path)

In [4]:
hypotheses_df

Unnamed: 0,diagram,cause,effect
0,1,FITIC,Firm Performance
1,1,Business Innovation Capability,FITIC
2,1,IT Innovation Capability,FITIC
3,1,Social Innovation Capability,FITIC
4,1,Economic/Financial Performance,Firm Performance
...,...,...,...
576,99,Trust between Members,Trust for Vendors
577,99,Trust for Vendors,Perceived Risks in Electronic Commerce
578,99,Trust between Members,Perceived Risks in Electronic Commerce
579,99,Perceived Benefits in Virtual Community,Participation in Electronic Commerce


In [5]:
synonyms_df = pd.read_pickle("../resources/synonyms.pkl") 
synonyms_df

Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym,identifier
0,Infrastructure Human Resource Globalization,infrastructure human resource globalization,,,1
1,Change Strategy *Reconfigure Existing Resource...,change strategy reconfigure existing resources...,,,2
2,Organizational Performance,organizational performance,,organizational readiness,3
3,FirmPerf,firmperf,,,4
4,Usage Intention,usage intention,,,5
...,...,...,...,...,...
614,Negative Feedback,negative feedback,,,477
615,Information Technology Application *Codify Kno...,information technology application codify know...,,,478
616,Prior domain knowledge,prior domain knowledge,,explicit information expectations,124
617,Competencies,competencies,Competence,,479


In [6]:
identifier_df = synonyms_df.groupby('identifier').agg({
    'construct': lambda x: list(set(x)),  # List of unique constructs per group
}).reset_index()

with pd.option_context('display.max_colwidth', None):
  display(identifier_df)

Unnamed: 0,identifier,construct
0,1,[Infrastructure Human Resource Globalization]
1,2,[Change Strategy *Reconfigure Existing Resources *Acquire and Reconfigure Resources *Acquire Resources without Configuring *Business as Usual]
2,3,"[Social Performance, Action readiness, Organizational Readiness, Organizational Performance, Environmental Performance]"
3,4,[FirmPerf]
4,5,[Usage Intention]
...,...,...
474,476,[Telecom]
475,477,[Negative Feedback]
476,478,[Information Technology Application *Codify Knowledge *Create Network]
477,479,[Competencies]


In [7]:
type(identifier_df['construct'].loc[identifier_df.index[0]])

list

In [8]:
identifier_df.loc[identifier_df['identifier'] == 227]

Unnamed: 0,identifier,construct
226,227,"[Business Innovation Capability, IT Innovation..."


In [9]:
# Replace empty strings with NaN
hypotheses_df = hypotheses_df.replace('', pd.NA)

# Remove rows where either 'cause' or 'effect' is NaN (which now includes original empty strings)
hypotheses_df = hypotheses_df.dropna(subset=['cause', 'effect'])

In [10]:
# Explode the 'construct' list into separate rows
exploded_identifiers_df = identifier_df.explode('construct')

# Create a dictionary for mapping constructs to identifiers
construct_to_identifier = dict(zip(exploded_identifiers_df['construct'], exploded_identifiers_df['identifier']))

# Define a mapping function
def map_construct_to_identifier(construct):
    return construct_to_identifier.get(construct, None) # Returns None if construct not found

# Apply the mapping function to the 'cause' and 'effect' columns
hypotheses_df['cause_identifier'] = hypotheses_df['cause'].map(map_construct_to_identifier)
hypotheses_df['effect_identifier'] = hypotheses_df['effect'].map(map_construct_to_identifier)

In [11]:
hypotheses_df

Unnamed: 0,diagram,cause,effect,cause_identifier,effect_identifier
0,1,FITIC,Firm Performance,456,49
1,1,Business Innovation Capability,FITIC,227,456
2,1,IT Innovation Capability,FITIC,227,456
3,1,Social Innovation Capability,FITIC,227,456
4,1,Economic/Financial Performance,Firm Performance,409,49
...,...,...,...,...,...
576,99,Trust between Members,Trust for Vendors,207,202
577,99,Trust for Vendors,Perceived Risks in Electronic Commerce,202,452
578,99,Trust between Members,Perceived Risks in Electronic Commerce,207,452
579,99,Perceived Benefits in Virtual Community,Participation in Electronic Commerce,206,277


In [12]:
nan_count = hypotheses_df.isna().any(axis=1).sum()
nan_count

0

In [34]:
hypotheses_df.loc[hypotheses_df['cause_identifier'] == 3]

Unnamed: 0,diagram,cause,effect,cause_identifier,effect_identifier
5,1,Environmental Performance,Firm Performance,3,49
6,1,Social Performance,Firm Performance,3,49
26,11,Organizational Readiness,Intention of RFID Adoption,3,228
122,27,Action readiness,Behaviors,3,141


In [38]:
type(hypotheses_df['cause_identifier'].loc[hypotheses_df.index[0]])

numpy.int64

Chain hypotheses by diagram

In [14]:
import pandas as pd

def create_cause_effect_chains(df):
    # Function to recursively build chains
    def build_chain(cause, current_chain, df, visited):
        if cause not in df['cause'].values or cause in visited:
            return [current_chain]
        chains = []
        visited.add(cause)
        for _, row in df[df['cause'] == cause].iterrows():
            new_chain = current_chain + [row['effect']]
            chains.extend(build_chain(row['effect'], new_chain, df, visited.copy()))
        return chains

    # List to hold all chains
    all_chains = []

    # Iterate over each diagram
    for diagram in df['diagram'].unique():
        diagram_df = df[df['diagram'] == diagram]
        # Start with causes that are not effects in the same diagram
        starting_points = set(diagram_df['cause']) - set(diagram_df['effect'])

        for start in starting_points:
            for chain in build_chain(start, [start], diagram_df, set()):
                all_chains.append({'diagram': diagram, 'chain': ', '.join(chain)})

    # Create a DataFrame for all chains
    chains_df = pd.DataFrame(all_chains)
    return chains_df

# Example usage with your hypotheses_df
chains_df = create_cause_effect_chains(hypotheses_df)


In [15]:
#with pd.option_context('display.max_colwidth', None):
#  display(chains_df)
chains_df

Unnamed: 0,diagram,chain
0,1,"Business Innovation Capability, FITIC, Firm Pe..."
1,1,"Economic/Financial Performance, Firm Performance"
2,1,"Social Performance, Firm Performance"
3,1,"Social Innovation Capability, FITIC, Firm Perf..."
4,1,"IT Innovation Capability, FITIC, Firm Performance"
...,...,...
440,98,"Perceived Website Socialness, Intention to Use..."
441,99,"Involvement in Virtual Community, Trust betwee..."
442,99,"Involvement in Virtual Community, Trust betwee..."
443,99,"Involvement in Virtual Community, Trust for Ve..."


In [16]:
chains_df.loc[chains_df['diagram'] == "3"]

Unnamed: 0,diagram,chain
120,3,"Use of Recommendation Agent, Product Promotion..."
121,3,"Use of Recommendation Agent, Product Promotion..."
122,3,"Use of Recommendation Agent, Product Promotion..."
123,3,"Use of Recommendation Agent, Product Promotion..."
124,3,"Use of Recommendation Agent, Product Promotion..."
125,3,"Use of Recommendation Agent, Product Promotion..."
126,3,"Use of Recommendation Agent, Product Promotion..."
127,3,"Use of Recommendation Agent, Product Promotion..."
128,3,"Use of Recommendation Agent, Product Search Ef..."


Insert the synonyms identifiers for constructs having a synonym

In [17]:
def create_synonym_to_identifier_map(identifier_df):
    # Create a mapping only for constructs with synonyms
    synonym_to_identifier = {}
    for _, row in identifier_df.iterrows():
        constructs = row['construct']
        if len(constructs) > 1:  # Only if there are synonyms
            identifier = row['identifier']
            for construct in constructs:
                synonym_to_identifier[construct] = identifier
    return synonym_to_identifier

In [18]:
def replace_synonyms_with_identifiers(chains_df, synonym_to_identifier):
    # Replace constructs in chains with identifiers if they have synonyms
    for idx, row in chains_df.iterrows():
        chain = row['chain'].split(', ')
        new_chain = [str(synonym_to_identifier.get(construct, construct)) for construct in chain]
        chains_df.at[idx, 'chain'] = ', '.join(new_chain)
    return chains_df

In [19]:
# Create the mapping
construct_to_identifier_map = create_synonym_to_identifier_map(identifier_df)

# Replace constructs in chains with their identifiers
chains_synonyms_df = replace_synonyms_with_identifiers(chains_df, construct_to_identifier_map)

In [20]:
chains_synonyms_df

Unnamed: 0,diagram,chain
0,1,"227, FITIC, Firm Performance"
1,1,"Economic/Financial Performance, Firm Performance"
2,1,"3, Firm Performance"
3,1,"227, FITIC, Firm Performance"
4,1,"227, FITIC, Firm Performance"
...,...,...
440,98,"Perceived Website Socialness, Intention to Use..."
441,99,"Involvement in Virtual Community, Trust betwee..."
442,99,"Involvement in Virtual Community, Trust betwee..."
443,99,"Involvement in Virtual Community, Trust for Ve..."


Find overlapping relationships across different diagrams

In [21]:
def find_relationships_across_diagrams(chains_df):
    relationships = []

    # Convert chains to lists of constructs
    chains_df['chain_list'] = chains_df['chain'].apply(lambda x: x.split(', '))

    # Iterate over all pairs of diagrams
    for diagram1 in chains_df['diagram'].unique():
        for diagram2 in chains_df['diagram'].unique():
            if diagram1 != diagram2:
                chains1 = chains_df[chains_df['diagram'] == diagram1][['chain_list', 'diagram']]
                chains2 = chains_df[chains_df['diagram'] == diagram2][['chain_list', 'diagram']]

                # Find overlapping constructs and extract relationships
                for _, row1 in chains1.iterrows():
                    for _, row2 in chains2.iterrows():
                        for construct in row1['chain_list']:
                            if construct in row2['chain_list']:
                                idx1 = row1['chain_list'].index(construct)
                                idx2 = row2['chain_list'].index(construct)
                                # Extract relationships from overlapping constructs with distance
                                if idx1 > 0 and idx2 < len(row2['chain_list']) - 1:
                                    distance = idx1 + (len(row2['chain_list']) - idx2 - 1)
                                    relationships.append((row1['chain_list'][idx1 - 1], row2['chain_list'][idx2 + 1], row1['diagram'], row2['diagram'], distance))
                                if idx2 > 0 and idx1 < len(row1['chain_list']) - 1:
                                    distance = idx2 + (len(row1['chain_list']) - idx1 - 1)
                                    relationships.append((row2['chain_list'][idx2 - 1], row1['chain_list'][idx1 + 1], row2['diagram'], row1['diagram'], distance))

    # Create DataFrame from relationships
    relationships_df = pd.DataFrame(relationships, columns=['construct1', 'construct2', 'diagram_source1', 'diagram_source2', 'distance'])
    return relationships_df.drop_duplicates()

# Example usage
cross_diagram_relationships_df = find_relationships_across_diagrams(chains_df)

In [29]:
def assign_relationship_identifiers(df):
    # Create a unique identifier for each unique pair of construct1 and construct2
    pair_to_identifier = {}
    identifier = 1
    for idx, row in df.iterrows():
        pair = (row['construct1'], row['construct2'])
        if pair not in pair_to_identifier:
            pair_to_identifier[pair] = identifier
            identifier += 1

    # Assign identifiers to each row in the DataFrame
    df['relationship_id'] = df.apply(lambda row: pair_to_identifier[(row['construct1'], row['construct2'])], axis=1)
    return df

# Apply the function to your DataFrame
cross_diagram_relationships_df = assign_relationship_identifiers(cross_diagram_relationships_df)

In [30]:
cross_diagram_relationships_df

Unnamed: 0,construct1,construct2,diagram_source1,diagram_source2,distance,relationship_id
0,Relational Investment,Firm Performance,11,1,3,1
6,Appraisal,Firm Performance,27,1,3,2
8,4 Customer Value,Firm Performance,4,1,3,3
9,4 Customer Value,Firm Performance,4,1,4,3
10,4 Customer Value,Firm Performance,4,1,5,3
...,...,...,...,...,...,...
1178,Perceived Website Socialness,Intention to Use,98,82,2,146
1237,93,140,92,85,3,225
1239,Involvement in Virtual Community,Attitude,99,85,3,226
1419,IT Quality Triad,Perceived Helpfulness of the eWoM Message,96,87,4,227


In [52]:
#cross_diagram_relationships_df.to_pickle("../resources/cross_diagram_relationships_df.pkl")

Map construct identifiers back to the original names

In [39]:
def create_identifier_to_construct_map(hypotheses_df):
    # Combine cause and effect into a single mapping
    identifier_to_construct = {}
    for _, row in hypotheses_df.iterrows():
        identifier_to_construct[str(row['cause_identifier'])] = row['cause']
        identifier_to_construct[str(row['effect_identifier'])] = row['effect']
    return identifier_to_construct

def replace_identifiers_with_constructs(relationships_df, identifier_to_construct):
    for idx, row in relationships_df.iterrows():
        if row['construct1'].isnumeric():
            relationships_df.at[idx, 'construct1'] = identifier_to_construct.get(row['construct1'], row['construct1'])
        if row['construct2'].isnumeric():
            relationships_df.at[idx, 'construct2'] = identifier_to_construct.get(row['construct2'], row['construct2'])
    return relationships_df

# Create the mapping
identifier_to_construct_map = create_identifier_to_construct_map(hypotheses_df)

# Replace identifiers in relationships DataFrame
cross_diagram_relationships_replaced_df = replace_identifiers_with_constructs(cross_diagram_relationships_df, identifier_to_construct_map)


In [40]:
cross_diagram_relationships_replaced_df

Unnamed: 0,construct1,construct2,diagram_source1,diagram_source2,distance,relationship_id
0,Relational Investment,Firm Performance,11,1,3,1
6,Appraisal,Firm Performance,27,1,3,2
8,4 Customer Value,Firm Performance,4,1,3,3
9,4 Customer Value,Firm Performance,4,1,4,3
10,4 Customer Value,Firm Performance,4,1,5,3
...,...,...,...,...,...,...
1178,Perceived Website Socialness,Intention to Use,98,82,2,146
1237,Fit,Behavioral Intent,92,85,3,225
1239,Involvement in Virtual Community,Attitude,99,85,3,226
1419,IT Quality Triad,Perceived Helpfulness of the eWoM Message,96,87,4,227


In [None]:
#cross_diagram_relationships_replaced_df.to_pickle("../resources/cross_diagram_relationships_replaced_df.pkl")

In [41]:
cross_diagram_relationships_replaced_df.loc[cross_diagram_relationships_replaced_df['relationship_id'] == 3]

Unnamed: 0,construct1,construct2,diagram_source1,diagram_source2,distance,relationship_id
8,4 Customer Value,Firm Performance,4,1,3,3
9,4 Customer Value,Firm Performance,4,1,4,3
10,4 Customer Value,Firm Performance,4,1,5,3
11,4 Customer Value,Firm Performance,4,1,6,3


In [50]:
"""Relationships with a minimal occurence of 2"""

# Count the occurrences of each relationship_id
relationship_counts = cross_diagram_relationships_replaced_df['relationship_id'].value_counts()

# Identify relationship_ids that appear more than once
repeated_ids = relationship_counts[relationship_counts > 1].index.tolist()

# Filter the original DataFrame for rows with repeated relationship_ids
repeated_relationships_df = cross_diagram_relationships_replaced_df[cross_diagram_relationships_replaced_df['relationship_id'].isin(repeated_ids)]

# Create a DataFrame with the necessary columns, including the frequency of each repeated relationship_id
aggregated_df = repeated_relationships_df.groupby('relationship_id').apply(lambda x: pd.Series({
    'construct1': x['construct1'].iloc[0], 
    'construct2': x['construct2'].iloc[0], 
    'amount_of_occurrence': len(x)
})).reset_index()

aggregated_df

Unnamed: 0,relationship_id,construct1,construct2,amount_of_occurrence
0,3,4 Customer Value,Firm Performance,4
1,6,Information Quality,Affective commitment High sacrifice Low altern...,2
2,7,System Quality,Affective commitment High sacrifice Low altern...,2
3,13,Source Trustworthiness,System Quality,2
4,14,eWoM Message Credibility,System Quality,3
5,16,eWoM Message Credibility,Perceived external prestige,2
6,20,IT Quality Triad,Information Quality,2
7,21,IT Quality Triad,System Quality,2
8,25,4 Customer Value,Intention of RFID Adoption,4
9,26,Information processing capacity,Relational Investment,2


In [53]:
#aggregated_df.to_pickle("../resources/relationships_multioccurence_df.pkl")

In [49]:
"""Relationships only appearing once"""


# Count the occurrences of each relationship_id
relationship_counts = cross_diagram_relationships_replaced_df['relationship_id'].value_counts()

# Identify relationship_ids that appear only once
unique_ids = relationship_counts[relationship_counts == 1].index.tolist()

# Create a new DataFrame with rows that have relationship_ids occurring only once
unique_relationships_df = cross_diagram_relationships_replaced_df[cross_diagram_relationships_replaced_df['relationship_id'].isin(unique_ids)]

unique_relationships_df


Unnamed: 0,construct1,construct2,diagram_source1,diagram_source2,distance,relationship_id
0,Relational Investment,Firm Performance,11,1,3,1
6,Appraisal,Firm Performance,27,1,3,2
16,Control Variables Gender,Decisional Balance,100,19,2,4
17,Control Variables - Motivation - Community For...,Frequency of visit of charity websites,19,100,8,5
20,Importance of charity's reputation,Affective commitment High sacrifice Low altern...,100,46,10,8
...,...,...,...,...,...,...
1177,Dynamic Interaction,Enjoyment,82,98,3,224
1237,Fit,Behavioral Intent,92,85,3,225
1239,Involvement in Virtual Community,Attitude,99,85,3,226
1419,IT Quality Triad,Perceived Helpfulness of the eWoM Message,96,87,4,227


In [54]:
#unique_relationships_df.to_pickle("../resources/relationships_unique_df.pkl")