In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from transformers import BertTokenizer, BertModel
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml
import torch


  from .autonotebook import tqdm as notebook_tqdm


Load construct, hypotheses data, and the gold standard

In [2]:
def load_yaml_data(folder_path):
    constructs_list = []
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})
                constructs_list.extend(constructs.values())

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    hypotheses_data.append({'cause': cause, 'effect': effect})

    # Remove duplicates from constructs list
    constructs_list = list(set(constructs_list))

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return constructs_list, hypotheses_df

In [3]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
constructs, hypotheses_df = load_yaml_data(folder_path)

In [4]:
# Remove empty elements from constructs list
constructs = [construct for construct in constructs if construct]
print(constructs)
print("Unique constructs: " + str(len(constructs)))

['Perceived Attributes of Innovations', 'Socio-economic Changes - Economic Freedom - Unemployment', 'Specialization benefits', 'Perceived efford (R2=0.290)', 'Time priority (reliability, speed)', 'Tie strength', 'Starting Salary', 'Feedback Presence', 'Source Social Connectedness', 'Motivation', 'Firm Performance', 'Dimensions of IT Mindfulness', 'Developement of domain understanding', 'Reactance proneness', 'Perceived Costs of Non-Compliance', 'zeta', 'Willingness to disclose', 'Interaction of Task and Technology (FIT or TTF)', 'IT Capability - IT Business Partner- ships - External IT Linkages - Business Process Integration - IT Management - IT Infrastructure', 'Interface Usage', 'Cost savings', 'Consumption Likelihood', 'Perceived Usefullness', 'Performance: Financial Market Operational', 'Eye tracking and verbal protocol analysis', 'Performance: - Effectiveness - Efficiency', 'Shared understanding', 'Role Values', 'Trust', 'Self-Efficacy', 'Abysmal credibility text comments', 'M-hea

In [5]:
hypotheses_df

Unnamed: 0,cause,effect
0,IT Mindfulness,Alertness to Distinction
1,IT Mindfulness,Awareness of Multiple Perspectives
2,IT Mindfulness,Openness to Novelty
3,IT Mindfulness,Orientation to the Present
4,IT Mindfulness,Continuance Intention
...,...,...
576,Perceived Usefulness,Attitude
577,Perceived Usefulness,Intention to Use
578,Perceived Ease-of-use,Attitude
579,Attitude,Intention to Use


In [6]:
# Path to your Excel file
excel_file_path = '../LarsenBong2016GoldStandard.xls'

df_gold_standard = pd.read_excel(excel_file_path, sheet_name='Items')

gold_standard_list = df_gold_standard['VariableName'].tolist()

# Remove duplicates from gold standard list
gold_standard_list = list(set(gold_standard_list))
print(len(gold_standard_list))



837


Simple mapping of constructs using Lev distance of 3

In [7]:
# Function to find the closest match with a max Levenshtein distance of 2
def find_closest(term, standard_list):
    closest_match = None
    min_distance = 3 # Set to 3 since we are looking for a distance of max 2
    for standard_term in standard_list:
        distance = lev.distance(term, standard_term)
        if distance < min_distance:
            min_distance = distance
            closest_match = standard_term
    return closest_match


In [8]:
# Mapping constructs to the gold standard list with Lev distance
mappings = []
for construct in constructs:
    mapped_term = find_closest(construct, gold_standard_list)
    mappings.append(mapped_term)

# Creating the DataFrame
mapping = pd.DataFrame({
    'construct': constructs,
    'direct_mapping': mappings
})

In [9]:
mapping

Unnamed: 0,construct,direct_mapping
0,Perceived Attributes of Innovations,
1,Socio-economic Changes - Economic Freedom - Un...,
2,Specialization benefits,
3,Perceived efford (R2=0.290),
4,"Time priority (reliability, speed)",
...,...,...
614,Economic Freedom,
615,Abysmal benevolence text comments,
616,Negative Feedback,
617,Impact on Coordination,


In [10]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]


Unnamed: 0,construct,direct_mapping
230,Fear,


Search for similar terms (synonyms) using Word2Vec

In [11]:
# Cleaning function
def clean_terms(terms):
    cleaned_terms = []
    for term in terms:
        # Lowercasing
        term = term.lower()
        # Removing special characters (preserving spaces)
        term = re.sub(r'[^\w\s]', '', term)
        cleaned_terms.append(term)
    return cleaned_terms

# Cleaning the lists
cleaned_constructs = clean_terms(constructs)
cleaned_gold_standard_list = clean_terms(gold_standard_list)

# Print cleaned data
print("Cleaned Constructs:", cleaned_constructs)
print("Cleaned Gold Standard List:", cleaned_gold_standard_list)


Cleaned Constructs: ['perceived attributes of innovations', 'socioeconomic changes  economic freedom  unemployment', 'specialization benefits', 'perceived efford r20290', 'time priority reliability speed', 'tie strength', 'starting salary', 'feedback presence', 'source social connectedness', 'motivation', 'firm performance', 'dimensions of it mindfulness', 'developement of domain understanding', 'reactance proneness', 'perceived costs of noncompliance', 'zeta', 'willingness to disclose', 'interaction of task and technology fit or ttf', 'it capability  it business partner ships  external it linkages  business process integration  it management  it infrastructure', 'interface usage', 'cost savings', 'consumption likelihood', 'perceived usefullness', 'performance financial market operational', 'eye tracking and verbal protocol analysis', 'performance  effectiveness  efficiency', 'shared understanding', 'role values', 'trust', 'selfefficacy', 'abysmal credibility text comments', 'mhealth a

In [12]:
# Combine the lists and remove duplicates
#combined_terms = list(set(cleaned_constructs + cleaned_gold_standard_list))

# Function to encode and normalize phrases using BERT
def encode_normalize_phrases(phrases, model, tokenizer):
    encoded_input = tokenizer(phrases, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Normalize embeddings
    return normalize(model_output.last_hidden_state.mean(dim=1))



In [13]:


# Function to find similar terms within the combined list with higher threshold
def find_similar_within_list(terms, encoded_terms, threshold=0.9, top_n=3):
    similar_terms = {}
    similarity_matrix = cosine_similarity(encoded_terms)

    for i, term in enumerate(terms):
        similar = [(terms[j], similarity_matrix[i][j]) for j in range(len(terms)) if i != j and similarity_matrix[i][j] >= threshold]
        similar.sort(key=lambda x: x[1], reverse=True)
        if similar:
            similar_terms[term] = similar[:top_n]

    return similar_terms

In [14]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the terms
encoded_terms = encode_normalize_phrases(cleaned_constructs, model, tokenizer)


# Find similar terms
similar_terms = find_similar_within_list(cleaned_constructs, encoded_terms)

In [15]:
len(similar_terms)

226

In [16]:
# Display similar terms
print("Similar Terms:")
for term, sim_terms in similar_terms.items():
    print(f"{term}: \n{[f'{pair[0]} ({pair[1]:.2f})' for pair in sim_terms]}")

Similar Terms:
socioeconomic changes  economic freedom  unemployment: 
['sociopolitical changes  wealth inequality  political freedom (0.93)', 'sociopolitical change (0.92)']
motivation: 
['incentives (0.91)', 'interpretation (0.90)']
perceived costs of noncompliance: 
['perceived costs of compliance (0.94)', 'perceived benefits of compliance (0.93)']
interaction of task and technology fit or ttf: 
['interaction of technology and cse cset fit (0.90)']
interface usage: 
['interface preference (0.93)']
perceived usefullness: 
['perceived benefit (0.91)', 'perceived usefulness (0.90)', 'perceived usefulness (0.90)']
performance  effectiveness  efficiency: 
['product promotion effectiveness (0.92)', 'product search effectiveness (0.92)', 'perceived effectiveness (0.91)']
business and technology investment: 
['ict investment (0.92)']
perceived usefulness r2055r2057: 
['perceived accuracy r20461 (0.92)']
perceived reputation: 
['perceived benefit (0.92)', 'perceived external prestige (0.91)'

In [18]:
mapping["construct_lower"] = cleaned_constructs

In [19]:
mapping

Unnamed: 0,construct,direct_mapping,construct_lower
0,Perceived Attributes of Innovations,,perceived attributes of innovations
1,Socio-economic Changes - Economic Freedom - Un...,,socioeconomic changes economic freedom unemp...
2,Specialization benefits,,specialization benefits
3,Perceived efford (R2=0.290),,perceived efford r20290
4,"Time priority (reliability, speed)",,time priority reliability speed
...,...,...,...
614,Economic Freedom,,economic freedom
615,Abysmal benevolence text comments,,abysmal benevolence text comments
616,Negative Feedback,,negative feedback
617,Impact on Coordination,,impact on coordination


In [25]:
# Convert the dictionary to a list of tuples (term, top_synonym)
term_synonym_pairs = [(term, synonyms[0][0]) for term, synonyms in similar_terms.items() if synonyms]

# Create a DataFrame from this list
synonym_df = pd.DataFrame(term_synonym_pairs, columns=['term', 'top_synonym'])

# Map the original terms to the terms and their synonyms
mapped_df = synonym_df.merge(mapping, left_on='term', right_on='construct_lower', how='left')

mapped_df.drop(columns=['term','construct_lower'], inplace=True)
mapped_df.rename(columns={'top_synonym': 'consine_synonym'}, inplace=True)
mapped_df = mapped_df[['construct', 'direct_mapping', 'consine_synonym']]
mapped_df


Unnamed: 0,construct,direct_mapping,consine_synonym
0,Socio-economic Changes - Economic Freedom - Un...,,sociopolitical changes wealth inequality pol...
1,Motivation,Motivation,incentives
2,Perceived Costs of Non-Compliance,,perceived costs of compliance
3,Interaction of Task and Technology (FIT or TTF),,interaction of technology and cse cset fit
4,Interface Usage,,interface preference
...,...,...,...
226,Relational Closeness with peer IT Employees,,relational closeness with peer nonit employees
227,Syntactic boundary,,semantic boundary
228,Public Self-Awareness,,private selfawareness
229,International Marketing Orientation,,international entrepreneurial orientation
