In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from transformers import BertTokenizer, BertModel
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml
import torch


  from .autonotebook import tqdm as notebook_tqdm


Load construct, hypotheses data, and the gold standard

In [2]:
def load_yaml_data(folder_path):
    constructs_list = []
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})
                constructs_list.extend(constructs.values())

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    hypotheses_data.append({'cause': cause, 'effect': effect})

    # Remove duplicates from constructs list
    constructs_list = list(set(constructs_list))

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return constructs_list, hypotheses_df

In [3]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
constructs, hypotheses_df = load_yaml_data(folder_path)

In [4]:
# Remove empty elements from constructs list
constructs = [construct for construct in constructs if construct]
print(constructs)
print("Unique constructs: " + str(len(constructs)))

['Infrastructure Human Resource Globalization', 'Change Strategy *Reconfigure Existing Resources *Acquire and Reconfigure Resources *Acquire Resources without Configuring *Business as Usual', 'Organizational Performance', 'FirmPerf', 'Usage Intention', 'Combined Ontological Completeness', 'Performance', "IT Unit's Improvisation support through: a. Knowledge sharing b. Technology standardization", 'Expectations Disconfirmation', 'Age', 'CS1', 'Cost savings', 'Outstanding credibility text comments', 'Enjoyment', 'IS_Integration', 'cse6', 'Enviroment Competitive pressure Expectations of market trends', 'Consumer characteristics Prior purchase', 'IT activity asset specificity', 'CSE', 'cse-C', 'Quality Management Practices Implementation', 'Codified directories', 'PERFORMANCE', 'Task Behavior: Attention Association', 'Positive Feedback', 'Self-Efficacy', 'Intention to Use a Website', 'Firm Factors', 'Actual self- disclosure in', 'Process Capabilities Consistency Leverage Relevance', 'Proje

In [133]:
hypotheses_df

Unnamed: 0,cause,effect
0,IT Mindfulness,Alertness to Distinction
1,IT Mindfulness,Awareness of Multiple Perspectives
2,IT Mindfulness,Openness to Novelty
3,IT Mindfulness,Orientation to the Present
4,IT Mindfulness,Continuance Intention
...,...,...
576,Perceived Usefulness,Attitude
577,Perceived Usefulness,Intention to Use
578,Perceived Ease-of-use,Attitude
579,Attitude,Intention to Use


In [6]:
# Path to your Excel file
excel_file_path = '../LarsenBong2016GoldStandard.xls'

df_gold_standard = pd.read_excel(excel_file_path, sheet_name='Items')

gold_standard_list = df_gold_standard['VariableName'].tolist()

# Remove duplicates from gold standard list
gold_standard_list = list(set(gold_standard_list))
print(len(gold_standard_list))



837


Simple mapping of constructs using Lev distance of 3

In [7]:
# Function to find the closest match with a max Levenshtein distance of 2
def find_closest(term, standard_list):
    closest_match = None
    min_distance = 3 # Set to 3 since we are looking for a distance of max 2
    for standard_term in standard_list:
        distance = lev.distance(term, standard_term)
        if distance < min_distance:
            min_distance = distance
            closest_match = standard_term
    return closest_match


In [8]:
# Mapping constructs to the gold standard list with Lev distance
mappings = []
for construct in constructs:
    mapped_term = find_closest(construct, gold_standard_list)
    mappings.append(mapped_term)

# Creating the DataFrame
mapping = pd.DataFrame({
    'construct': constructs,
    'direct_mapping': mappings
})

In [9]:
mapping

Unnamed: 0,construct,direct_mapping
0,Infrastructure Human Resource Globalization,
1,Change Strategy *Reconfigure Existing Resource...,
2,Organizational Performance,
3,FirmPerf,
4,Usage Intention,
...,...,...
614,Negative Feedback,
615,Information Technology Application *Codify Kno...,
616,Prior domain knowledge,
617,Competencies,Competence


In [10]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]


Unnamed: 0,construct,direct_mapping
34,Fear,


Search for similar terms (synonyms) using BERT

In [11]:
# Cleaning function
def clean_terms(terms):
    cleaned_terms = []
    for term in terms:
        # Lowercasing
        term = term.lower()
        # Removing special characters (preserving spaces)
        term = re.sub(r'[^\w\s]', '', term)
        cleaned_terms.append(term)
    return cleaned_terms

# Cleaning the lists
cleaned_constructs = clean_terms(constructs)
cleaned_gold_standard_list = clean_terms(gold_standard_list)

# Print cleaned data
print("Cleaned Constructs:", cleaned_constructs)
print("Cleaned Gold Standard List:", cleaned_gold_standard_list)


Cleaned Constructs: ['infrastructure human resource globalization', 'change strategy reconfigure existing resources acquire and reconfigure resources acquire resources without configuring business as usual', 'organizational performance', 'firmperf', 'usage intention', 'combined ontological completeness', 'performance', 'it units improvisation support through a knowledge sharing b technology standardization', 'expectations disconfirmation', 'age', 'cs1', 'cost savings', 'outstanding credibility text comments', 'enjoyment', 'is_integration', 'cse6', 'enviroment competitive pressure expectations of market trends', 'consumer characteristics prior purchase', 'it activity asset specificity', 'cse', 'csec', 'quality management practices implementation', 'codified directories', 'performance', 'task behavior attention association', 'positive feedback', 'selfefficacy', 'intention to use a website', 'firm factors', 'actual self disclosure in', 'process capabilities consistency leverage relevance'

In [12]:
# Combine the lists and remove duplicates
#combined_terms = list(set(cleaned_constructs + cleaned_gold_standard_list))

# Function to encode and normalize phrases using BERT
def encode_normalize_phrases(phrases, model, tokenizer):
    encoded_input = tokenizer(phrases, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Normalize embeddings
    return normalize(model_output.last_hidden_state.mean(dim=1))



In [13]:


# Function to find similar terms within the combined list with higher threshold
def find_similar_within_list(terms, encoded_terms, threshold=0.9, top_n=3):
    similar_terms = {}
    similarity_matrix = cosine_similarity(encoded_terms)

    for i, term in enumerate(terms):
        similar = [(terms[j], similarity_matrix[i][j]) for j in range(len(terms)) if i != j and similarity_matrix[i][j] >= threshold]
        similar.sort(key=lambda x: x[1], reverse=True)
        if similar:
            similar_terms[term] = similar[:top_n]

    return similar_terms

In [14]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the terms
encoded_terms = encode_normalize_phrases(cleaned_constructs, model, tokenizer)


# Find similar terms
similar_terms = find_similar_within_list(cleaned_constructs, encoded_terms)

In [15]:
len(similar_terms)

226

In [16]:
# Display similar terms
print("Similar Terms:")
for term, sim_terms in similar_terms.items():
    print(f"{term}: \n{[f'{pair[0]} ({pair[1]:.2f})' for pair in sim_terms]}")

Similar Terms:
organizational performance: 
['organizational readiness (0.94)', 'social performance (0.93)', 'decision quality (0.92)']
performance: 
['performance (1.00)']
cs1: 
['cs2 (0.97)', 'cs12 (0.93)']
is_integration: 
['data_integration (0.91)', 'application_integration (0.90)']
cse6: 
['cse5 (0.96)', 'cse4 (0.95)', 'cse3 (0.94)']
cse: 
['csec (0.91)']
csec: 
['csea (0.92)', 'cse (0.91)']
quality management practices implementation: 
['talent development practices implementation (0.96)', 'opportunities for growth manager practices (0.90)', 'decision quality (0.90)']
firm factors: 
['critical factors (0.90)']
process capabilities consistency leverage relevance: 
['organizational dimension highlevel manager support (0.91)']
medication i: 
['medication k (0.95)']
perceived usefullness: 
['perceived benefit (0.91)', 'perceived usefulness (0.90)', 'perceived usefulness (0.90)']
leadership triad: 
['leadership (0.94)']
knowledge process capability: 
['knowledge integration capability

In [17]:
mapping["construct_lower"] = cleaned_constructs

In [18]:
mapping

Unnamed: 0,construct,direct_mapping,construct_lower
0,Infrastructure Human Resource Globalization,,infrastructure human resource globalization
1,Change Strategy *Reconfigure Existing Resource...,,change strategy reconfigure existing resources...
2,Organizational Performance,,organizational performance
3,FirmPerf,,firmperf
4,Usage Intention,,usage intention
...,...,...,...
614,Negative Feedback,,negative feedback
615,Information Technology Application *Codify Kno...,,information technology application codify know...
616,Prior domain knowledge,,prior domain knowledge
617,Competencies,Competence,competencies


In [67]:
# Convert the dictionary to a list of tuples (term, top_synonym)
term_synonym_pairs = [(term, synonyms[0][0]) for term, synonyms in similar_terms.items() if synonyms]

# Create a DataFrame from this list
synonym_df = pd.DataFrame(term_synonym_pairs, columns=['term', 'cosine_synonym'])

# Map the original terms to the terms and their synonyms
#mapped_df = synonym_df.merge(mapping, left_on='term', right_on='construct_lower', how='left')
mapped_df = mapping.merge(synonym_df, left_on='construct_lower', right_on='term', how='left')

mapped_df.drop(columns=['term'], inplace=True)
mapped_df = mapped_df[['construct', 'construct_lower', 'direct_mapping', 'cosine_synonym']]
mapped_df['cosine_synonym'] = mapped_df['cosine_synonym'].apply(lambda x: None if pd.isna(x) else x)
mapped_df


Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym
0,Infrastructure Human Resource Globalization,infrastructure human resource globalization,,
1,Change Strategy *Reconfigure Existing Resource...,change strategy reconfigure existing resources...,,
2,Organizational Performance,organizational performance,,organizational readiness
3,FirmPerf,firmperf,,
4,Usage Intention,usage intention,,
...,...,...,...,...
614,Negative Feedback,negative feedback,,
615,Information Technology Application *Codify Kno...,information technology application codify know...,,
616,Prior domain knowledge,prior domain knowledge,,explicit information expectations
617,Competencies,competencies,Competence,


Assign the same identifiers for synonyms

In [117]:
# Initialize a dictionary for synonyms identifier mapping
identifier_mapping = {}
current_id = 1  # Starting identifier

# Create a dictionary to link constructs with their synonyms
link_dict = {}
for index, row in mapped_df.iterrows():
    construct = row['construct_lower']
    synonym = row['cosine_synonym']

    if pd.notna(synonym) and synonym in mapped_df['construct_lower'].values:
        # Link the construct with its synonym
        link_dict[construct] = synonym
        link_dict[synonym] = construct
    else:
        # If no valid synonym, link the construct to itself
        link_dict[construct] = construct

# Assign identifiers based on linked constructs
for construct in mapped_df['construct_lower']:
    # Check if an identifier has been assigned
    if construct not in identifier_mapping:
        linked_construct = link_dict[construct]

        # If the linked construct also doesn't have an identifier, assign a new one
        if linked_construct not in identifier_mapping:
            identifier_mapping[construct] = current_id
            identifier_mapping[linked_construct] = current_id
            current_id += 1
        else:
            # If the linked construct has an identifier, use the same for this construct
            identifier_mapping[construct] = identifier_mapping[linked_construct]

# Add the identifier to the DataFrame
mapped_df['identifier'] = mapped_df['construct_lower'].apply(lambda x: identifier_mapping[x])
mapped_df

Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym,identifier
0,Infrastructure Human Resource Globalization,infrastructure human resource globalization,,,1
1,Change Strategy *Reconfigure Existing Resource...,change strategy reconfigure existing resources...,,,2
2,Organizational Performance,organizational performance,,organizational readiness,3
3,FirmPerf,firmperf,,,4
4,Usage Intention,usage intention,,,5
...,...,...,...,...,...
614,Negative Feedback,negative feedback,,,477
615,Information Technology Application *Codify Kno...,information technology application codify know...,,,478
616,Prior domain knowledge,prior domain knowledge,,explicit information expectations,124
617,Competencies,competencies,Competence,,479


In [136]:
mapped_df.loc[mapped_df['construct_lower'] == "it mindfulness"]

Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym,identifier
479,IT Mindfulness,it mindfulness,,,394


In [137]:
mapped_df.loc[mapped_df['identifier'] == 394]

Unnamed: 0,construct,construct_lower,direct_mapping,cosine_synonym,identifier
479,IT Mindfulness,it mindfulness,,,394


In [129]:
nan_count = hypotheses_df.isna().any(axis=1).sum()
nan_count

0

In [130]:
type(mapped_df['identifier'].loc[mapped_df.index[1]])

numpy.int64

In [121]:
# Manual adjustments
mapped_df['identifier'] = mapped_df['identifier'].replace(335, 3)

In [123]:
#mapped_df.to_pickle("../resources/synonyms.pkl")