In [2]:
%%capture
!pip install lbl2vec

In [3]:
%%capture
import numpy as np
import pandas as pd
import torch
import io
from lbl2vec import Lbl2TransformerVec
from sentence_transformers import SentenceTransformer
from google.colab import files
import matplotlib.pyplot as plt
from google.colab import files

**Upload training and test data**

In [4]:
# Upload the training data.
# The CSV-loading cell further down looks this upload up under the key
# 'studies_training.csv' and wraps the stored value in io.BytesIO.
uploaded_training_data = files.upload()

Saving studies_training.csv to studies_training.csv


In [5]:
# Upload the test data.
# The CSV-loading cell further down looks this upload up under the key
# 'studies_test.csv' and wraps the stored value in io.BytesIO.
uploaded_test_data = files.upload()

Saving studies_test.csv to studies_test.csv


In [6]:
# Read the uploaded CSV files into DataFrames
def _read_uploaded_csv(uploaded, expected_name):
  """Parse one uploaded CSV into a DataFrame.

  Args:
    uploaded: mapping of file name -> raw file content, as returned by
      ``files.upload()``.
    expected_name: file name the upload is expected to be stored under.

  Returns:
    A pandas DataFrame parsed from the selected file.

  Raises:
    KeyError: if ``expected_name`` is absent and the upload does not consist
      of exactly one file, so the file to use cannot be determined.
  """
  if expected_name in uploaded:
    payload = uploaded[expected_name]
  elif len(uploaded) == 1:
    # The file was stored under another name (e.g. a suffixed re-upload);
    # with a single upload there is no ambiguity about which file is meant.
    payload = next(iter(uploaded.values()))
  else:
    raise KeyError(
        "Expected an upload named {!r}, got: {}".format(expected_name, sorted(uploaded)))
  return pd.read_csv(io.BytesIO(payload))


studies_df_file = _read_uploaded_csv(uploaded_training_data, 'studies_training.csv')
studies_df_test_file = _read_uploaded_csv(uploaded_test_data, 'studies_test.csv')

**Define methods**

In [7]:
# Processing the training dataset
def process_dataset(studies_df):
  """Clean a studies table and add a ``combined`` free-text column.

  The optional ``index_kw`` column is removed, rows without an ``abstract``
  or without ``auth_kw`` are dropped, and ``title``, ``auth_kw`` and
  ``abstract`` are joined with single spaces into ``combined``.

  Args:
    studies_df: DataFrame with at least the columns ``title``, ``auth_kw``
      and ``abstract``. It is not modified.

  Returns:
    A new DataFrame containing the remaining rows plus the ``combined`` column.
  """
  # `index_kw` is optional, so ignore it when it is not there.
  cleaned_df = studies_df.drop(columns=['index_kw'], errors='ignore')

  # Missing abstracts must be dropped regardless of whether `index_kw` was
  # present; otherwise the text "None"/"nan" ends up in `combined`.
  cleaned_df = cleaned_df.dropna(subset=['abstract', 'auth_kw']).copy()

  cleaned_df['combined'] = cleaned_df[['title', 'auth_kw', 'abstract']].apply(
      lambda row: ' '.join(map(str, row)), axis=1)
  return cleaned_df

In [8]:
# Training the lbl2vec model
def train_model(studies_df, n_documents, random_state=None, use_gpu=True):
  """Fit an Lbl2TransformerVec model on a random sample of study texts.

  Two keyword lists are passed to the model, quantitative first and
  qualitative second. The evaluation cell reads the resulting scores as
  ``label_0`` (quantitative) and ``label_1`` (qualitative).

  Args:
    studies_df: DataFrame with a ``combined`` text column.
    n_documents: number of rows to sample from ``combined`` for training.
      pandas raises ValueError if this exceeds the number of rows.
    random_state: optional seed forwarded to ``Series.sample`` so the sample
      can be reproduced. ``None`` keeps the sampling unseeded.
    use_gpu: request ``cuda:0``. Falls back to the CPU when CUDA is not
      available instead of failing on a CPU-only runtime.

  Returns:
    The fitted Lbl2TransformerVec model.
  """
  documents = studies_df['combined'].sample(n_documents, random_state=random_state).to_list()

  # select sentence-transformers model
  transformer_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  # Quantitative Research Approach Keywords:
  quantitative_keywords = [
      "numerical data",
      "quantifiable",
      "statistical analysis",
      "surveys",
      "questionnaires",
      "measure outcomes",
      "correlations",
      "differences between variables",
      "structured methods",
      "(semi-)structured methods",
      "tables",
      "graphs",
      "measurements",
      "statistical methods",
      "causal relationship",
      "correlational relationship",
      "hypotheses testing",
      "experiment",
      "random assignment",
      "independent variable",
      "dependent variable",
      "validity",
      "deductive logic",
      "random sample",
      "large sample",
      "tests",
      "experimental design",
      "quasi-experimental design",
      "descriptive design",
      "methodological design",
      "exploratory design",
      "comparative design",
      "correlational design",
      "developmental design",
      "cross-sectional design",
      "longitudinal design",
      "prospective design",
      "cohort design",
      "retrospective design",
      "ex post facto design",
      "case control design",
      "systematic reviews",
      "meta-analyses",
      "integrative reviews",
      "controlled conditions",
      "numerical output",
      "variable manipulation",
      "sampling error",
      "probability sampling",
      "standard deviation",
      "data modeling",
      "regression analysis",
      "ANOVA",
      "T-test",
      "chi-square",
      "data mining",
      "big data"
  ]

  # Qualitative Research Approach Keywords:
  qualitative_keywords = [
      "phenomenon",
      "interpret",
      "explore",
      "generate understanding",
      "non-numerical data",
      "text data",
      "video data",
      "audio data",
      "interpretive methodologies",
      "subjective methodologies",
      "interviews",
      "observations",
      "analysis of documents",
      "unstructured methods",
      "semi-structured methods",
      "narrative findings",
      "descriptive findings",
      "complexity",
      "contextual",
      "inductive logic",
      "discovery",
      "purposive sample",
      "small sample",
      "focus groups",
      "field observation",
      "phenomenological design",
      "grounded theory design",
      "ethnographic design",
      "case study design",
      "historical research",
      "narrative research",
      "participatory research",
      "clinical research",
      "contextual understanding",
      "thematic analysis",
      "discourse analysis",
      "interpretative phenomenological analysis",
      "content analysis",
      "participant observation",
      "open-ended questions",
      "subjective",
      "reflexivity",
      "naturalistic observation",
      "diary studies",
      "coding",
      "thematic saturation",
      "grounded theory",
      "ethnography",
      "phenomenology"
  ]

  # Order matters: the first list is the quantitative class, the second the qualitative one.
  keywords = [quantitative_keywords, qualitative_keywords]

  # Only ask for the GPU when one is actually present.
  if use_gpu and torch.cuda.is_available():
    torch_device = torch.device('cuda:0')
  else:
    torch_device = torch.device('cpu')

  # init model
  model = Lbl2TransformerVec(transformer_model=transformer_model, keywords_list=keywords,
                            documents=documents, min_num_docs=100, clean_outliers=False,
                            device=torch_device)

  # train model
  model.fit()
  return model

**Train and evaluate model**

In [9]:
# Train and evaluate the model
from sklearn.metrics import f1_score

n_docs = 300
studies_df = process_dataset(studies_df_file)
model = train_model(studies_df, n_docs)

# Get similarity scores from trained model
docs_df_training = model.predict_model_docs()

# Combine and predict test data.
# Work on a copy so the uploaded test frame is not modified by this cell.
studies_df_test = studies_df_test_file.copy()
studies_df_test['combined'] = studies_df_test[['title', 'auth_kw', 'abstract']].apply(
    lambda row: ' '.join(map(str, row)), axis=1)
test_documents = studies_df_test['combined'].to_list()

# Use the GPU only when one is available, so the cell also runs on a CPU runtime.
predict_device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
docs_df_test = model.predict_new_docs(documents=test_documents, device=predict_device)

# Relative difference between the quantitative (label_0) and qualitative
# (label_1) similarity; its 33rd and 66th percentiles on the training
# documents are the class thresholds.
docs_df_training['dif'] = (
    (docs_df_training['label_0'] - docs_df_training['label_1']) / docs_df_training['label_1'])

q1 = np.percentile(docs_df_training['dif'], 33.33)
q2 = np.percentile(docs_df_training['dif'], 66.67)
print(q1, q2)

# Same relative difference for the test documents, computed once.
test_dif = docs_df_test['label_0'].sub(docs_df_test['label_1']).div(docs_df_test['label_1'])

# Below q1 -> Qualitative, from q1 up to (not including) q2 -> Mixed, otherwise Quantitative.
# np.select takes the first matching condition, so the second one only applies when dif >= q1.
docs_df_test['predicted'] = np.select(
    [test_dif < q1, test_dif < q2],
    ['Qualitative', 'Mixed'],
    default='Quantitative')
docs_df_test['actual'] = studies_df_test['method']

# Calculate the accuracy
accuracy = (docs_df_test['actual'] == docs_df_test['predicted']).mean() * 100

# Display the accuracy
print("Accuracy: {:.2f}%".format(accuracy))

# Calculate the F1 score
f1 = f1_score(docs_df_test['actual'], docs_df_test['predicted'], average='weighted')

# Display the F1 score
print("F1 Score: {:.4f}".format(f1))

docs_df_test


2023-06-30 08:07:58,692 - Lbl2TransformerVec - INFO - Compute keyword embeddings
INFO:Lbl2TransformerVec:Compute keyword embeddings
2023-06-30 08:08:07,450 - Lbl2TransformerVec - INFO - Compute document embeddings
INFO:Lbl2TransformerVec:Compute document embeddings
2023-06-30 08:08:11,355 - Lbl2TransformerVec - INFO - Train label embeddings
INFO:Lbl2TransformerVec:Train label embeddings
2023-06-30 08:08:11,414 - Lbl2TransformerVec - INFO - Get document embeddings from model
INFO:Lbl2TransformerVec:Get document embeddings from model
2023-06-30 08:08:11,416 - Lbl2TransformerVec - INFO - Calculate document<->label similarities
INFO:Lbl2TransformerVec:Calculate document<->label similarities
2023-06-30 08:08:11,437 - Lbl2TransformerVec - INFO - Compute document embeddings
INFO:Lbl2TransformerVec:Compute document embeddings


-0.01631615913696587 0.016846495363302558
Accuracy: 43.48%
F1 Score: 0.4493


Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,label_0,label_1,predicted,actual
0,0,label_1,0.691211,0.671495,0.691211,Qualitative,Qualitative
1,1,label_0,0.640454,0.640454,0.6081,Quantitative,Quantitative
2,2,label_1,0.637987,0.632321,0.637987,Mixed,Quantitative
3,3,label_0,0.615387,0.615387,0.589434,Quantitative,Quantitative
4,4,label_0,0.76296,0.76296,0.75977,Mixed,Qualitative
5,5,label_0,0.659517,0.659517,0.626487,Quantitative,Quantitative
6,6,label_0,0.783042,0.783042,0.766825,Quantitative,Mixed
7,7,label_1,0.592694,0.582163,0.592694,Qualitative,Qualitative
8,8,label_0,0.559661,0.559661,0.555651,Mixed,Qualitative
9,9,label_0,0.661113,0.661113,0.629402,Quantitative,Mixed
