In [1]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml

c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


Load construct, hypotheses data, and the gold standard

In [2]:
def load_yaml_data(folder_path):
    constructs_list = []
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})
                constructs_list.extend(constructs.values())

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    hypotheses_data.append({'cause': cause, 'effect': effect})

    # Remove duplicates from constructs list
    constructs_list = list(set(constructs_list))

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return constructs_list, hypotheses_df

In [3]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
constructs, hypotheses_df = load_yaml_data(folder_path)

In [4]:
# Remove empty elements from constructs list
constructs = [construct for construct in constructs if construct]
print(constructs)
print("Unique constructs: " + str(len(constructs)))

['Eye tracking and verbal protocol analysis', 'Perceived Medium Function', 'Perceived Threat Vulnerability', 'Existing organizational formalcontrols', 'Applying formal risk managment practices', 'Shared understanding', 'Leadership Triad', 'Opportunities for growth Manager practices', 'Trust of Mobile Platform', 'Power Distance', 'Offline Belief', 'Team Dispersion', 'Mimesis', 'Adoption of PACS', 'Information about the organization Competitive pay', 'Behavioral Skills', 'eWoM Message Adoption', 'Negative Feedback', 'cse1', 'Internal Determinants', 'Perceived Quality', 'Results', 'CTC', 'Openness to Novelty', 'When to respond to consumer reviews', 'Homophily', 'CS1', 'ICT Spending per GDP', 'Organizational Dimension High-Level Manager Support', 'Fairness Effect', 'Demand-side', 'cse5', 'Use of Target IT', 'IT Job Entry', 'Positive Feedback', 'Perceived Attributes of Innovations', 'Alertness to Distinction', 'Multiple Technologies', 'Mobile Protection Settings Awareness', 'Interaction of 

In [5]:
hypotheses_df

Unnamed: 0,cause,effect
0,FITIC,Firm Performance
1,Business Innovation Capability,FITIC
2,IT Innovation Capability,FITIC
3,Social Innovation Capability,FITIC
4,Economic/Financial Performance,Firm Performance
...,...,...
576,Trust between Members,Trust for Vendors
577,Trust for Vendors,Perceived Risks in Electronic Commerce
578,Trust between Members,Perceived Risks in Electronic Commerce
579,Perceived Benefits in Virtual Community,Participation in Electronic Commerce


In [6]:
# Path to your Excel file
excel_file_path = '../LarsenBong2016GoldStandard.xls'

df_gold_standard = pd.read_excel(excel_file_path, sheet_name='Items')

gold_standard_list = df_gold_standard['VariableName'].tolist()

# Remove duplicates from gold standard list
gold_standard_list = list(set(gold_standard_list))
print(len(gold_standard_list))



837


Simple mapping of constructs using Lev distance of 3

In [7]:
# Function to find the closest match with a max Levenshtein distance of 2
def find_closest(term, standard_list):
    closest_match = None
    min_distance = 3 # Set to 3 since we are looking for a distance of max 2
    for standard_term in standard_list:
        distance = lev.distance(term, standard_term)
        if distance < min_distance:
            min_distance = distance
            closest_match = standard_term
    return closest_match


In [8]:
# Mapping constructs to the gold standard list with Lev distance
mappings = []
for construct in constructs:
    mapped_term = find_closest(construct, gold_standard_list)
    mappings.append(mapped_term)

# Creating the DataFrame
mapping = pd.DataFrame({
    'construct': constructs,
    'direct_mapping': mappings
})

In [9]:
mapping

Unnamed: 0,construct,direct_mapping
0,Eye tracking and verbal protocol analysis,
1,Perceived Medium Function,
2,Perceived Threat Vulnerability,
3,Existing organizational formalcontrols,
4,Applying formal risk managment practices,
...,...,...
614,Performance Query performance * accuracy * tim...,
615,Review manupulation,
616,Trust for Vendors,
617,Involvement in Virtual Community,


In [10]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]


Unnamed: 0,construct,direct_mapping
412,Fear,


Search for similar terms (synonyms) using Word2Vec

In [11]:
# Cleaning function
def clean_terms(terms):
    cleaned_terms = []
    for term in terms:
        # Lowercasing
        term = term.lower()
        # Removing special characters (preserving spaces)
        term = re.sub(r'[^\w\s]', '', term)
        cleaned_terms.append(term)
    return cleaned_terms

# Cleaning the lists
cleaned_constructs = clean_terms(constructs)
cleaned_gold_standard_list = clean_terms(gold_standard_list)

# Print cleaned data
print("Cleaned Constructs:", cleaned_constructs)
print("Cleaned Gold Standard List:", cleaned_gold_standard_list)


Cleaned Constructs: ['eye tracking and verbal protocol analysis', 'perceived medium function', 'perceived threat vulnerability', 'existing organizational formalcontrols', 'applying formal risk managment practices', 'shared understanding', 'leadership triad', 'opportunities for growth manager practices', 'trust of mobile platform', 'power distance', 'offline belief', 'team dispersion', 'mimesis', 'adoption of pacs', 'information about the organization competitive pay', 'behavioral skills', 'ewom message adoption', 'negative feedback', 'cse1', 'internal determinants', 'perceived quality', 'results', 'ctc', 'openness to novelty', 'when to respond to consumer reviews', 'homophily', 'cs1', 'ict spending per gdp', 'organizational dimension highlevel manager support', 'fairness effect', 'demandside', 'cse5', 'use of target it', 'it job entry', 'positive feedback', 'perceived attributes of innovations', 'alertness to distinction', 'multiple technologies', 'mobile protection settings awareness'

In [12]:
# Combine the lists and remove duplicates
combined_terms = list(set(cleaned_constructs + cleaned_gold_standard_list))

#model = Word2Vec.load('path')
# load a pre-trained model
"""maybe train a model on the data we have?"""
model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

# Function to find similar terms within the combined list
def find_similar_within_list(model, terms):
    similar_terms = {}
    for term in terms:
        try:
            # Find top N similar terms in the model
            all_similar = model.most_similar(term, topn=100)
            # Filter to keep only those in our combined list with a similarity score of at least 0.5
            similar_in_list = [sim for sim in all_similar if sim[0] in terms and sim[1] >= 0.5]
            if similar_in_list:  # Only add if the list is not empty
                similar_terms[term] = similar_in_list[:5]  # Top 5 similar terms
        except KeyError:
            # Term not in model's vocabulary
            continue  # Skip this term
    return similar_terms

# Find similar terms
similar_terms = find_similar_within_list(model, combined_terms)

In [13]:
# Print similar terms, only if the list is not empty
for term, similar in similar_terms.items():
    if similar:  # Check if the list of similar terms is not empty
        print(f"{term} \n {similar}\n")

reliability 
 [('efficiency', 0.5621459484100342), ('responsiveness', 0.5531690716743469), ('accuracy', 0.5387457013130188), ('compatibility', 0.5341913104057312), ('usability', 0.5201964378356934)]

fear 
 [('anxiety', 0.578746497631073)]

credibility 
 [('reputation', 0.610754132270813), ('integrity', 0.5867679715156555)]

accuracy 
 [('reliability', 0.5387457013130188), ('timeliness', 0.520466685295105)]

expectation 
 [('expectations', 0.7265158891677856)]

arousal 
 [('cognition', 0.5378036499023438)]

compatibility 
 [('reliability', 0.5341913104057312), ('usability', 0.5166759490966797), ('integration', 0.5010477304458618)]

hostility 
 [('defensiveness', 0.5586022138595581)]

control 
 [('controls', 0.6387393474578857)]

efficiency 
 [('reliability', 0.5621459484100342), ('productivity', 0.5531652569770813)]

democratization 
 [('pluralism', 0.6295616626739502)]

convenience 
 [('accessibility', 0.5054651498794556)]

responsiveness 
 [('reliability', 0.5531690716743469), ('usab

In [14]:
# Synonyms from word2vec
synonyms = pd.DataFrame([(term, synonym) for term, synonyms in similar_terms.items() for synonym, score in synonyms if score >= 0.55],
                  columns=['Term', 'Synonym'])

synonyms.head()

Unnamed: 0,Term,Synonym
0,reliability,efficiency
1,reliability,responsiveness
2,fear,anxiety
3,credibility,reputation
4,credibility,integrity


In [15]:
mapping['synonym'] = None
mapping

Unnamed: 0,construct,direct_mapping,synonym
0,Eye tracking and verbal protocol analysis,,
1,Perceived Medium Function,,
2,Perceived Threat Vulnerability,,
3,Existing organizational formalcontrols,,
4,Applying formal risk managment practices,,
...,...,...,...
614,Performance Query performance * accuracy * tim...,,
615,Review manupulation,,
616,Trust for Vendors,,
617,Involvement in Virtual Community,,


In [16]:
# Create a new column 'construct_lower' with lowercase values of 'construct'
mapping['construct_lower'] = mapping['construct'].str.lower()

# Convert 'Term' column to lowercase in the synonyms DataFrame
synonyms['Term'] = synonyms['Term'].str.lower()

# Rename columns in the synonyms DataFrame for the merge
synonyms.rename(columns={'Term': 'construct_lower', 'Synonym': 'synonym_new'}, inplace=True)

# Merge the DataFrames
mapping = mapping.merge(synonyms, on='construct_lower', how='left')

# If you want to keep the original synonym if new synonym is not found
mapping['synonym'] = mapping['synonym'].combine_first(mapping['synonym_new'])

# Drop the extra synonym column and the temporary construct_lower column
mapping.drop(columns=['synonym_new'], inplace=True)


In [17]:
mapping

Unnamed: 0,construct,direct_mapping,synonym,construct_lower
0,Eye tracking and verbal protocol analysis,,,eye tracking and verbal protocol analysis
1,Perceived Medium Function,,,perceived medium function
2,Perceived Threat Vulnerability,,,perceived threat vulnerability
3,Existing organizational formalcontrols,,,existing organizational formalcontrols
4,Applying formal risk managment practices,,,applying formal risk managment practices
...,...,...,...,...
615,Performance Query performance * accuracy * tim...,,,performance query performance * accuracy * tim...
616,Review manupulation,,,review manupulation
617,Trust for Vendors,,,trust for vendors
618,Involvement in Virtual Community,,,involvement in virtual community


In [18]:
# Rename columns
mapping = mapping.rename(columns={
    'Term': 'construct_lower',
})

# Reposition columns
new_order = ['construct', 'construct_lower', 'direct_mapping', 'synonym']
mapping = mapping[new_order]


In [24]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]

Unnamed: 0,construct,construct_lower,direct_mapping,synonym
413,Fear,fear,,anxiety


In [20]:
mapping

Unnamed: 0,construct,construct_lower,direct_mapping,synonym
0,Eye tracking and verbal protocol analysis,eye tracking and verbal protocol analysis,,
1,Perceived Medium Function,perceived medium function,,
2,Perceived Threat Vulnerability,perceived threat vulnerability,,
3,Existing organizational formalcontrols,existing organizational formalcontrols,,
4,Applying formal risk managment practices,applying formal risk managment practices,,
...,...,...,...,...
615,Performance Query performance * accuracy * tim...,performance query performance * accuracy * tim...,,
616,Review manupulation,review manupulation,,
617,Trust for Vendors,trust for vendors,,
618,Involvement in Virtual Community,involvement in virtual community,,


In [26]:
# Filter the DataFrame to include only rows where there is a direct mapping or a synonym
filtered_mapping = mapping[(mapping['direct_mapping'].notna()) | (mapping['synonym'].notna())]

# Print the filtered DataFrame
filtered_mapping


Unnamed: 0,construct,construct_lower,direct_mapping,synonym
5,Shared understanding,shared understanding,Shared understanding,
9,Power Distance,power distance,Power distance,
18,cse1,cse1,Use,
31,cse5,cse5,Use,
40,Turnover intentions,turnover intentions,Turnover intention,
...,...,...,...,...
563,Trust,trust,Trust,
566,Job Satisfaction,job satisfaction,Job satisfaction,
572,Team Performance,team performance,Team performance,
584,Order,order,Order,
