In [2]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import Levenshtein as lev
import os
import pandas as pd
import re
import yaml

c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


Load construct, hypotheses data, and the gold standard

In [3]:
def load_yaml_data(folder_path):
    constructs_list = []
    hypotheses_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r') as file:
                yaml_content = yaml.safe_load(file)

                # Extract constructs
                constructs = yaml_content.get('constructs', {})
                constructs_list.extend(constructs.values())

                # Extract hypotheses
                for hypothesis in yaml_content.get('hypotheses', {}).values():
                    cause = constructs.get(hypothesis['cause'], '')
                    effect = constructs.get(hypothesis['effect'], '')
                    hypotheses_data.append({'cause': cause, 'effect': effect})

    # Remove duplicates from constructs list
    constructs_list = list(set(constructs_list))

    # Create DataFrame for hypotheses
    hypotheses_df = pd.DataFrame(hypotheses_data)

    return constructs_list, hypotheses_df

In [4]:
# Specify the folder path
folder_path = '../true_results'

# Load the constructs and hypotheses
constructs, hypotheses_df = load_yaml_data(folder_path)

In [5]:
# Remove empty elements from constructs list
constructs = [construct for construct in constructs if construct]
print(constructs)
print("Unique constructs: " + str(len(constructs)))

['Perceived Quality', 'Attitude Toward Specific Information Security Policy', 'Negative Feedback', 'Privacy Concern', 'Socioeconomic Level', 'Location Disclosure on LB-SNA', 'Functional-Congruity Perspectives', 'Spreadsheet Self-efficacy', 'Internal CSE', '1: Agency Controls - Prior Pref. (+) - Agency Size (-) 2. FISMA Controls (+)', 'Security', 'Preventive Behavior', 'Staff productivity increased', 'Comprehensiveness of Usage (Past Behavior)', 'Cognition*', 'Threat to freedom from new ISP', 'Intention', 'Awareness Signals', 'Willingness to disclose', 'Socio-economic Changes - Economic Freedom - Unemployment', 'Deep Learning Caffe', 'Likelihood of repaying investment', 'Environment: Production Turbulence', 'Crowdsourcing Mturk', 'Theory of Planned Behavior (TPB)', 'Donation information', 'PERFORMANCE', 'eWoM Message Consistency', 'BusFlex', 'Risk beliefs', 'Post-Adoptive Usage', 'Health Promoting Behavior', 'Overall Satisfaction', 'Innovation Compatibility Complexity Trialability Relat

In [6]:
hypotheses_df

Unnamed: 0,cause,effect
0,FITIC,Firm Performance
1,Business Innovation Capability,FITIC
2,IT Innovation Capability,FITIC
3,Social Innovation Capability,FITIC
4,Economic/Financial Performance,Firm Performance
...,...,...
576,Trust between Members,Trust for Vendors
577,Trust for Vendors,Perceived Risks in Electronic Commerce
578,Trust between Members,Perceived Risks in Electronic Commerce
579,Perceived Benefits in Virtual Community,Participation in Electronic Commerce


In [7]:
# Path to your Excel file
excel_file_path = '../LarsenBong2016GoldStandard.xls'

df_gold_standard = pd.read_excel(excel_file_path, sheet_name='Items')

gold_standard_list = df_gold_standard['VariableName'].tolist()

# Remove duplicates from gold standard list
gold_standard_list = list(set(gold_standard_list))
print(len(gold_standard_list))



837


Simple mapping of constructs using Lev distance of 3

In [8]:
# Function to find the closest match with a max Levenshtein distance of 2
def find_closest(term, standard_list):
    closest_match = None
    min_distance = 3 # Set to 3 since we are looking for a distance of max 2
    for standard_term in standard_list:
        distance = lev.distance(term, standard_term)
        if distance < min_distance:
            min_distance = distance
            closest_match = standard_term
    return closest_match


In [9]:
# Mapping constructs to the gold standard list with Lev distance
mappings = []
for construct in constructs:
    mapped_term = find_closest(construct, gold_standard_list)
    mappings.append(mapped_term)

# Creating the DataFrame
mapping = pd.DataFrame({
    'construct': constructs,
    'direct_mapping': mappings
})

In [10]:
mapping

Unnamed: 0,construct,direct_mapping
0,Perceived Quality,
1,Attitude Toward Specific Information Security ...,
2,Negative Feedback,
3,Privacy Concern,Privacy concerns
4,Socioeconomic Level,
...,...,...
614,Trust for Vendors,
615,Personalized directories,
616,IT work experience,
617,Performance,Performance


In [11]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]


Unnamed: 0,construct,direct_mapping
59,Fear,


Search for similar terms (synonyms) using Word2Vec

In [12]:
# Cleaning function
def clean_terms(terms):
    cleaned_terms = []
    for term in terms:
        # Lowercasing
        term = term.lower()
        # Removing special characters (preserving spaces)
        term = re.sub(r'[^\w\s]', '', term)
        cleaned_terms.append(term)
    return cleaned_terms

# Cleaning the lists
cleaned_constructs = clean_terms(constructs)
cleaned_gold_standard_list = clean_terms(gold_standard_list)

# Print cleaned data
print("Cleaned Constructs:", cleaned_constructs)
print("Cleaned Gold Standard List:", cleaned_gold_standard_list)


Cleaned Constructs: ['perceived quality', 'attitude toward specific information security policy', 'negative feedback', 'privacy concern', 'socioeconomic level', 'location disclosure on lbsna', 'functionalcongruity perspectives', 'spreadsheet selfefficacy', 'internal cse', '1 agency controls  prior pref   agency size  2 fisma controls ', 'security', 'preventive behavior', 'staff productivity increased', 'comprehensiveness of usage past behavior', 'cognition', 'threat to freedom from new isp', 'intention', 'awareness signals', 'willingness to disclose', 'socioeconomic changes  economic freedom  unemployment', 'deep learning caffe', 'likelihood of repaying investment', 'environment production turbulence', 'crowdsourcing mturk', 'theory of planned behavior tpb', 'donation information', 'performance', 'ewom message consistency', 'busflex', 'risk beliefs', 'postadoptive usage', 'health promoting behavior', 'overall satisfaction', 'innovation compatibility complexity trialability relative adv

In [13]:
# Combine the lists and remove duplicates
combined_terms = list(set(cleaned_constructs + cleaned_gold_standard_list))

#model = Word2Vec.load('path')
# load a pre-trained model
"""maybe train a model on the data we have?"""
model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

# Function to find similar terms within the combined list
def find_similar_within_list(model, terms):
    similar_terms = {}
    for term in terms:
        try:
            # Find top N similar terms in the model
            all_similar = model.most_similar(term, topn=100)
            # Filter to keep only those in our combined list with a similarity score of at least 0.5
            similar_in_list = [sim for sim in all_similar if sim[0] in terms and sim[1] >= 0.5]
            if similar_in_list:  # Only add if the list is not empty
                similar_terms[term] = similar_in_list[:5]  # Top 5 similar terms
        except KeyError:
            # Term not in model's vocabulary
            continue  # Skip this term
    return similar_terms

# Find similar terms
similar_terms = find_similar_within_list(model, combined_terms)

In [14]:
# Print similar terms, only if the list is not empty
for term, similar in similar_terms.items():
    if similar:  # Check if the list of similar terms is not empty
        print(f"{term} \n {similar}\n")

fairness 
 [('integrity', 0.5615886449813843)]

usefulness 
 [('effectiveness', 0.6225259900093079)]

accessibility 
 [('usability', 0.5853327512741089), ('access', 0.5285529494285583), ('convenience', 0.5054651498794556)]

scope 
 [('breadth', 0.5058391690254211)]

monitor 
 [('evaluate', 0.5757278800010681)]

hostility 
 [('defensiveness', 0.5586022138595581)]

effectiveness 
 [('usefulness', 0.6225259900093079)]

understandability 
 [('comprehensiveness', 0.5265683531761169), ('usability', 0.5200535655021667)]

reputation 
 [('credibility', 0.6107540726661682)]

arousal 
 [('cognition', 0.5378036499023438)]

compatibility 
 [('reliability', 0.5341913104057312), ('usability', 0.5166759490966797), ('integration', 0.5010477304458618)]

access 
 [('accessibility', 0.5285529494285583)]

attitude 
 [('aggressiveness', 0.530514121055603)]

credibility 
 [('reputation', 0.610754132270813), ('integrity', 0.5867679715156555)]

integrity 
 [('credibility', 0.5867679119110107), ('fairness', 0.5

In [15]:
# Synonyms from word2vec
synonyms = pd.DataFrame([(term, synonym) for term, synonyms in similar_terms.items() for synonym, score in synonyms if score >= 0.55],
                  columns=['Term', 'Synonym'])

synonyms.head()

Unnamed: 0,Term,Synonym
0,fairness,integrity
1,usefulness,effectiveness
2,accessibility,usability
3,monitor,evaluate
4,hostility,defensiveness


In [16]:
mapping['synonym'] = None
mapping

Unnamed: 0,construct,direct_mapping,synonym
0,Perceived Quality,,
1,Attitude Toward Specific Information Security ...,,
2,Negative Feedback,,
3,Privacy Concern,Privacy concerns,
4,Socioeconomic Level,,
...,...,...,...
614,Trust for Vendors,,
615,Personalized directories,,
616,IT work experience,,
617,Performance,Performance,


In [17]:
# Create a new column 'construct_lower' with lowercase values of 'construct'
mapping['construct_lower'] = mapping['construct'].str.lower()

# Convert 'Term' column to lowercase in the synonyms DataFrame
synonyms['Term'] = synonyms['Term'].str.lower()

# Rename columns in the synonyms DataFrame for the merge
synonyms.rename(columns={'Term': 'construct_lower', 'Synonym': 'synonym_new'}, inplace=True)

# Merge the DataFrames
mapping = mapping.merge(synonyms, on='construct_lower', how='left')

# If you want to keep the original synonym if new synonym is not found
mapping['synonym'] = mapping['synonym'].combine_first(mapping['synonym_new'])

# Drop the extra synonym column and the temporary construct_lower column
mapping.drop(columns=['synonym_new'], inplace=True)


In [18]:
mapping

Unnamed: 0,construct,direct_mapping,synonym,construct_lower
0,Perceived Quality,,,perceived quality
1,Attitude Toward Specific Information Security ...,,,attitude toward specific information security ...
2,Negative Feedback,,,negative feedback
3,Privacy Concern,Privacy concerns,,privacy concern
4,Socioeconomic Level,,,socioeconomic level
...,...,...,...,...
615,Trust for Vendors,,,trust for vendors
616,Personalized directories,,,personalized directories
617,IT work experience,,,it work experience
618,Performance,Performance,,performance


In [19]:
# Rename columns
mapping = mapping.rename(columns={
    'Term': 'construct_lower',
})

# Reposition columns
new_order = ['construct', 'construct_lower', 'direct_mapping', 'synonym']
mapping = mapping[new_order]


In [20]:
matching_rows = mapping['construct'].str.contains("fear", case=False)
mapping[matching_rows]

Unnamed: 0,construct,construct_lower,direct_mapping,synonym
59,Fear,fear,,anxiety


In [21]:
mapping

Unnamed: 0,construct,construct_lower,direct_mapping,synonym
0,Perceived Quality,perceived quality,,
1,Attitude Toward Specific Information Security ...,attitude toward specific information security ...,,
2,Negative Feedback,negative feedback,,
3,Privacy Concern,privacy concern,Privacy concerns,
4,Socioeconomic Level,socioeconomic level,,
...,...,...,...,...
615,Trust for Vendors,trust for vendors,,
616,Personalized directories,personalized directories,,
617,IT work experience,it work experience,,
618,Performance,performance,Performance,


In [22]:
# Filter the DataFrame to include only rows where there is a direct mapping or a synonym
filtered_mapping = mapping[(mapping['direct_mapping'].notna()) | (mapping['synonym'].notna())]

# Print the filtered DataFrame
filtered_mapping


Unnamed: 0,construct,construct_lower,direct_mapping,synonym
3,Privacy Concern,privacy concern,Privacy concerns,
16,Intention,intention,Attention,
29,Risk beliefs,risk beliefs,Risk beliefs,
32,Overall Satisfaction,overall satisfaction,Overall satisfaction,
39,Power Distance,power distance,Power distance,
...,...,...,...,...
554,Facilitating Conditions,facilitating conditions,Facilitating Conditions,
567,Perceived Effectiveness,perceived effectiveness,Perceived effectiveness,
593,Enjoyment,enjoyment,Enjoyment,
596,Behavioral Intention,behavioral intention,Behavioral intention,


In [23]:
# Create a new DataFrame with specified columns
new_df = mapping.assign(
    combined_mapping=lambda df: df['direct_mapping'].combine_first(df['synonym'])
)[['construct', 'combined_mapping']]

# Display the new DataFrame
print(new_df)

                                             construct  combined_mapping
0                                    Perceived Quality               NaN
1    Attitude Toward Specific Information Security ...               NaN
2                                    Negative Feedback               NaN
3                                      Privacy Concern  Privacy concerns
4                                  Socioeconomic Level               NaN
..                                                 ...               ...
615                                  Trust for Vendors               NaN
616                           Personalized directories               NaN
617                                 IT work experience               NaN
618                                        Performance       Performance
619                                       Memorability               NaN

[620 rows x 2 columns]
