# Meta-Questions Detection

This notebook demonstrates the process of training the `distil-BERT` model to classify texts into meta and no-meta questions.

Our main evaluation metric is **F1-Score** but we also calculate accuracy.

This notebook runs in the Google Colab enviroment.

# Install the required packages.

In [None]:
!pip install transformers # supports Transformer-based models
!pip install datasets # datasets for experiments
!pip install evaluate # evaluation metrics for experiments
!pip install transformers[torch] # backend for training

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m840.1 kB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m938.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [None]:
from transformers.utils import logging

logging.set_verbosity_error()

Next, import pandas to manipulate data and fix the random seed abnd tqdm to control exec time .

In [None]:
import pandas as pd # data manipulation & storage
from tqdm.auto import tqdm

In [None]:
from transformers import set_seed # fix random seed
set_seed(0)

## Create the dataset


In [2]:
import gdown

In [None]:
train_dataset_url = "https://drive.google.com/uc?export=download&id=1h_V3uZua-x8oeHR9rxlhj_r_gWVqUZqI"

train_dataset_name = "nometa_questions.json"

gdown.download(train_dataset_url, train_dataset_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1h_V3uZua-x8oeHR9rxlhj_r_gWVqUZqI
To: /content/nometa_questions.json
100%|██████████| 242k/242k [00:00<00:00, 77.1MB/s]


'nometa_questions.json'

In [None]:
test_dataset_url = "https://drive.google.com/uc?export=download&id=1Tkw0XbJsYCu4NLgjcdrS3oaKkF4pE4Nb"

test_dataset_name = "test_nometa_questions.json"

gdown.download(test_dataset_url, test_dataset_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1Tkw0XbJsYCu4NLgjcdrS3oaKkF4pE4Nb
To: /content/test_nometa_questions.json
100%|██████████| 9.06k/9.06k [00:00<00:00, 20.5MB/s]


'test_nometa_questions.json'

In [None]:
df_train = pd.read_json("nometa_questions.json")
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_train.head()

Unnamed: 0,question,label
0,When is it better to choose native app develop...,N
1,Why is data privacy considered crucial in mobi...,N
2,Is there someone who can outline the steps for...,M
3,Can someone share experiences with building re...,M
4,Who sets the standards for web accessibility c...,N


In [None]:
df_test = pd.read_json("test_nometa_questions.json")
df_test.head()

Unnamed: 0,question,label
0,Who can answer a couple of questions about int...,M
1,Anybody out there who got a summer internship ...,M
2,Has anyone already received an offer from ciip...,M
3,Has anyone interned at Yandex in speech techno...,M
4,Has anyone used HireRight for background check...,M


# Add interview software questions

In [3]:
test_dataset_software_questions_url = "https://drive.google.com/uc?export=download&id=1cEgXc-WIcLp0z6cCM_iH7v2zL7WbJXT1"

test_dataset_software_questions_name = "Software Questions.csv"

gdown.download(test_dataset_software_questions_url, test_dataset_software_questions_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1cEgXc-WIcLp0z6cCM_iH7v2zL7WbJXT1
To: /content/Software Questions.csv
100%|██████████| 41.6k/41.6k [00:00<00:00, 38.1MB/s]


'Software Questions.csv'

In [None]:
sq = pd.read_csv('Software Questions.csv', encoding='unicode_escape')

In [None]:
nometa_test = pd.DataFrame([{'question': q, 'label': 'N'} for q in sq.loc[sq['Question'].str.contains('\?', regex=True)][:len(df_test)]['Question']])

In [None]:
df_test = pd.concat([df_test, nometa_test], ignore_index=True)

In [None]:
len(df_test.loc[df_test['label'] == 'M']) / len(df_test)

0.5

## Add real nometa questions to the training dataset

In [4]:
train_dataset_real_url = "https://drive.google.com/uc?export=download&id=1eexPpIC-hsfOclAc2-PrQNCRgWsjRf5y"

train_dataset_real_name = "nometa_questions.json"

gdown.download(train_dataset_real_url, train_dataset_real_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1eexPpIC-hsfOclAc2-PrQNCRgWsjRf5y
To: /content/nometa_questions.json
100%|██████████| 21.8k/21.8k [00:00<00:00, 43.0MB/s]


'nometa_questions.json'

In [None]:
with open("translated_output_nometa.txt", "r", encoding="utf-8") as f:
  nometa_train = pd.DataFrame([{'question': q, 'label': 'M'} for q in f])

In [None]:
print(nometa_train.head())

                                            question label
0                                    Can you help?\n     M
1    A very easy task, anyone willing to solve it?\n     M
2  Hello! Has anyone worked with the Ozon API in ...     M
3  There are kind people who can explain what he ...     M
4  Good evening. Is there anyone who can help wit...     M


In [None]:
df_train = pd.concat([df_train, nometa_train], ignore_index=True)

In [None]:
print(len(df_train))

2354


# Use almost all data from train dataset for the better training performance

In [None]:
from sklearn.model_selection import train_test_split # import the train_test_split function from the sklearn library


# train dataset is just df_train
train, val = train_test_split(df_train, test_size=0.05)

# reset the index of the dataframes after splitting
train.reset_index(inplace=True)
val.reset_index(inplace=True)


In [None]:
from datasets import Dataset, DatasetDict # import necessary modules for creating datasets

# create an empty DatasetDict object
ds = DatasetDict()

# add  datasets to the DatasetDict with specified keys
# each dataset is created from a pandas dataframe (train, val)
ds['train'] = Dataset.from_pandas(train)
ds['validation'] = Dataset.from_pandas(val)

print(ds)


DatasetDict({
    train: Dataset({
        features: ['index', 'question', 'label'],
        num_rows: 2236
    })
    validation: Dataset({
        features: ['index', 'question', 'label'],
        num_rows: 118
    })
})


We define the label convertores.

In [None]:
# map class IDs to labels
id2label = {0: 'N', 1: 'M'}

# map labels to class IDs
label2id = {'N': 0, 'M': 1}


Let start building the model! The first step is to preprocess the texts.

We import the `AutoTokenizer` class from the transformers library.
Then we load a pre-trained tokenizer for the `distilbert-base-uncased` model. A tokenizer is necessary to convert text data into a format that can be fed into the model for processing.

In [None]:
from transformers import AutoTokenizer # import  the AutoTokenizer class from the transformers library

# load a pre-trained tokenizer for the 'distilbert-base-uncased' model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# preprocess the texts by tokenizring them using the tokenizer's dictionary and mapping the labels to their respective ids
def preprocess(batch):

    #tokenize and truncate texts to have 128 tokens and pad, when necessary
    tokenized_batch = tokenizer(batch['question'], padding=True, truncation=True, max_length=128)

    # convert labels
    tokenized_batch['label'] = [label2id[label] for label in batch['label']]

    # return processed data
    return tokenized_batch

This code applies the preprocess function to the dataset ds using batch processing. This means that the function will be applied to the data in chunks or batches, rather than one entry at a time. This can be more memory-efficient and faster.

In [None]:
tokenized_ds = ds.map(preprocess, batched=True)
tokenized_ds

Map:   0%|          | 0/2236 [00:00<?, ? examples/s]

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'question', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2236
    })
    validation: Dataset({
        features: ['index', 'question', 'label', 'input_ids', 'attention_mask'],
        num_rows: 118
    })
})

We create data collator which  responsible for processing the data before feeding it to the model during training.  This data collator will use the provided tokenizer for padding sequences, which is important for making sure all sequences in a batch have the same length.

In [None]:
from transformers import DataCollatorWithPadding

# create an instance of DataCollatorWithPadding
# it takes 'tokenizer' as an argument, which will be used for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
from datasets import load_metric

accuracy = evaluate.load('accuracy') # we will use the accuracy metric as the main one
f1_metric = load_metric("f1")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  f1_metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
import numpy as np # import the numpy package

# this function hets the predictions (e.g. the probilities of each class, takes the most probable precition and compares it to the gold label)
def compute_metrics(eval_pred):

    # get the prediction probabilities and the gold labels
    predictions, labels = eval_pred

    # get the most likely prediction
    predictions = np.argmax(predictions, axis=1)

    accuracy_score =  accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    # compute and return the accuracy value
    return {"f1": f1_score["f1"], "accuracy": accuracy_score["accuracy"]}

Let us define the model architecure. We will use the `distilbert-base-uncased` model as a backbone for binary predicitions.

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer # import necessary components from the transformers library

# initialize a model for sequence classification (e.g. for text classification)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# define the training arguments for the model
training_args = TrainingArguments(
    output_dir=f'tmp/',                           # directory to save the model and results
    learning_rate=2e-5,                            # learning rate for optimization
    per_device_train_batch_size=32,              # batch size per GPU for training
    per_device_eval_batch_size=32,               # batch size per GPU for evaluation
    num_train_epochs=4,                           # number of training epochs
    weight_decay=0.01,                            # weight decay for regularization
    evaluation_strategy='epoch',                  # evaluation strategy during training (per epoch)
    save_strategy='epoch',                        # saving strategy during training (per epoch)
    load_best_model_at_end=True,                  # load the best model at the end of training
)

# intialize the Trainer with necessary components and settings
trainer = Trainer(
    model=model,                                  # model to be trained
    args=training_args,                           # training arguments defined above
    train_dataset=tokenized_ds['train'],          # training dataset
    eval_dataset=tokenized_ds['validation'],      # validation dataset
    tokenizer=tokenizer,                          # tokenizer for data processing
    data_collator=data_collator,                  # data collator for padding
    compute_metrics=compute_metrics               # function to compute evaluation metrics
)




Finally let's train the model!

In [None]:
# train the model
trainer.train()

{'eval_loss': 0.012705561704933643, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.2176, 'eval_samples_per_second': 542.315, 'eval_steps_per_second': 18.384, 'epoch': 1.0}
{'eval_loss': 0.0044427355751395226, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.2045, 'eval_samples_per_second': 577.124, 'eval_steps_per_second': 19.564, 'epoch': 2.0}
{'eval_loss': 0.002521964255720377, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.2064, 'eval_samples_per_second': 571.58, 'eval_steps_per_second': 19.376, 'epoch': 3.0}
{'eval_loss': 0.0022320242132991552, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.2096, 'eval_samples_per_second': 562.936, 'eval_steps_per_second': 19.083, 'epoch': 4.0}
{'train_runtime': 78.6631, 'train_samples_per_second': 113.7, 'train_steps_per_second': 3.559, 'train_loss': 0.06568066052028111, 'epoch': 4.0}


TrainOutput(global_step=280, training_loss=0.06568066052028111, metrics={'train_runtime': 78.6631, 'train_samples_per_second': 113.7, 'train_steps_per_second': 3.559, 'train_loss': 0.06568066052028111, 'epoch': 4.0})

In [None]:
def test_preprocess(batch):
  tokenized_batch = tokenizer(batch['question'], padding=True, truncation=True, max_length=128)
  tokenized_batch['label'] = [label2id[label] for label in batch['label']]
  return tokenized_batch

In [None]:
test_data = Dataset.from_pandas(df_test)
tokenized_test_data = test_data.map(test_preprocess, batched=True)
tokenized_test_data

NameError: name 'test_preprocess' is not defined

In [None]:
# predict on test set
prediction = trainer.predict(tokenized_test_data)

In [None]:
print(f"Accuracy: {prediction.metrics['test_accuracy']}, F1-Score: {prediction.metrics['test_f1']}")

Accuracy: 0.9642857142857143, F1-Score: 0.9642401021711368


## Zero-shot learning

In [None]:
def zero_shot_prompt_template(batch_questions):
   return [f"""
    Task: Detect meta-questions.
    Meta-question - this is a question that implies other questions like:

    May I ask a question?
    Is there anyone experienced in N?
    Has anyone here taken a course from Google on Coursera? I have a text analysis question...
    Guys, I have a question about %framework_name%

    Is the question {question} meta?
  """ for question in batch_questions]

In [None]:
def test_preprocess_zero_shot(batch):
  tokenized_batch = tokenizer(zero_shot_prompt_template(batch_questions=batch['question']), padding=True, truncation=True, max_length=128)
  tokenized_batch['label'] = [label2id[label] for label in batch['label']]
  return tokenized_batch

In [None]:
def preprocess_zero_shot(batch):

    #tokenize and truncate texts to have 128 tokens and pad, when necessary
    tokenized_batch = tokenizer(zero_shot_prompt_template(batch['question']), padding=True, truncation=True, max_length=128)

    # convert labels
    tokenized_batch['label'] = [label2id[label] for label in batch['label']]

    # return processed data
    return tokenized_batch

In [None]:
tokenized_ds_zero_shot = ds.map(preprocess_zero_shot, batched=True)

tokenized_ds_zero_shot

Map:   0%|          | 0/2236 [00:00<?, ? examples/s]

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'question', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2236
    })
    validation: Dataset({
        features: ['index', 'question', 'label', 'input_ids', 'attention_mask'],
        num_rows: 118
    })
})

In [None]:
test_data = Dataset.from_pandas(df_test)
tokenized_test_data_zero_shot = test_data.map(test_preprocess_zero_shot, batched=True)
tokenized_test_data_zero_shot

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'label', 'input_ids', 'attention_mask'],
    num_rows: 168
})

In [None]:
model_zero_shot = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# define the training arguments for the model
training_args = TrainingArguments(
    output_dir=f'tmp/',                           # directory to save the model and results
    learning_rate=2e-5,                            # learning rate for optimization
    per_device_train_batch_size=32,              # batch size per GPU for training
    per_device_eval_batch_size=32,               # batch size per GPU for evaluation
    num_train_epochs=2,                           # number of training epochs
    weight_decay=0.01,                            # weight decay for regularization
    evaluation_strategy='epoch',                  # evaluation strategy during training (per epoch)
    save_strategy='epoch',                        # saving strategy during training (per epoch)
    load_best_model_at_end=True,                  # load the best model at the end of training
)

# intialize the Trainer with necessary components and settings
trainer_zero_shot = Trainer(
    model=model_zero_shot,                        # model to be trained
    args=training_args,                           # training arguments defined above
    train_dataset=tokenized_ds_zero_shot['train'],          # training dataset
    eval_dataset=tokenized_ds_zero_shot['validation'],      # validation dataset
    tokenizer=tokenizer,                          # tokenizer for data processing
    data_collator=data_collator,                  # data collator for padding
    compute_metrics=compute_metrics               # function to compute evaluation metrics
)




In [None]:
trainer_zero_shot.train()

{'eval_loss': 0.06436988711357117, 'eval_f1': 0.983065492462704, 'eval_accuracy': 0.9830508474576272, 'eval_runtime': 0.4297, 'eval_samples_per_second': 274.623, 'eval_steps_per_second': 9.309, 'epoch': 1.0}
{'eval_loss': 0.017726168036460876, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.4198, 'eval_samples_per_second': 281.059, 'eval_steps_per_second': 9.527, 'epoch': 2.0}
{'train_runtime': 58.1127, 'train_samples_per_second': 76.954, 'train_steps_per_second': 2.409, 'train_loss': 0.18976215635027205, 'epoch': 2.0}


TrainOutput(global_step=140, training_loss=0.18976215635027205, metrics={'train_runtime': 58.1127, 'train_samples_per_second': 76.954, 'train_steps_per_second': 2.409, 'train_loss': 0.18976215635027205, 'epoch': 2.0})

In [None]:
prediction = trainer_zero_shot.predict(tokenized_test_data_zero_shot)
print(f"Accuracy: {prediction.metrics['test_accuracy']}, F1-Score: {prediction.metrics['test_f1']}")

Accuracy: 0.9821428571428571, F1-Score: 0.9821422244268858


### Translate the real meta-questions collected from Russian-speaking open-access Telegram chats

In [None]:
!pip install openai==0.28



In [None]:
import openai

# Set your OpenAI API key here
with open('.my_openai_key.txt', 'r') as key_file:
    openai.api_key = key_file.read().strip()

def translate_to_english(text):
    """
    Function to translate a given text to English using OpenAI API.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a translator."},
                {"role": "user", "content": f"Translate the following text to English:\n\n{text}"}
            ],
            max_tokens=1000,
            n=1,
            stop=None,
            temperature=0.5
        )
        translation = response['choices'][0]['message']['content'].strip()
        return translation
    except Exception as e:
        print(f"Error translating text: {e}")
        return None

def translate_file(input_file, output_file):
    """
    Function to read each line from the input file, translate it to English,
    and write the translated text to the output file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                translated_line = translate_to_english(line)
                if translated_line:
                    outfile.write(translated_line + "\n")
                else:
                    outfile.write(line + "\n")
        print(f"Translation completed. Translated text saved to {output_file}.")
    except Exception as e:
        print(f"Error processing files: {e}")

input_file_path = 'nometa.txt'  # Path to the input file
output_file_path = 'translated_output_nometa.txt'  # Path to save the translated output
translate_file(input_file_path, output_file_path)


Translation completed. Translated text saved to translated_output_nometa.txt.
