In [1]:
import os

# Switch tokenizer parallelism off through its environment variable to avoid warnings
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import re
import ast
from tqdm import tqdm
# import ollama
from langchain_ollama import OllamaLLM

In [3]:
# Load the input table; the labelling loop later reads its 'Prompt' column.
df = pd.read_csv(filepath_or_buffer='data/sorted_data.csv')

In [4]:
def extract_user_text(code):
    """Extract the human-written text fragments from a piece of source code.

    When ``code`` parses as Python, the result is built from:

    * identifiers referenced in the code (``ast.Name``),
    * class and function names (including ``async def``),
    * string literals (non-string constants such as numbers are skipped),
    * every ``#...`` run found by a plain regex. The regex also matches ``#``
      inside string literals, but duplicates are removed afterwards.

    Fragments of four characters or fewer are dropped. The remaining ones are
    de-duplicated in first-seen order and joined with single spaces, so the
    output is stable between runs and word boundaries survive.

    When ``code`` does not parse, it is returned as text with newlines turned
    into ``'. '`` and four-space indents collapsed to one space.

    Args:
        code: Source text to analyse. Non-string values (e.g. ``None`` or a
            NaN from a pandas column) yield an empty string.

    Returns:
        str: The extracted text, or the flattened input if parsing failed.
    """
    if not isinstance(code, str):
        # Missing values have no user text; previously this crashed inside
        # the fallback with AttributeError.
        return ''

    try:
        tree = ast.parse(code)
    except Exception:
        # Deliberate best effort: the input is frequently not valid Python,
        # so fall back to the lightly flattened raw text.
        return code.replace('\n', '. ').replace('    ', ' ')

    fragments = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            fragments.append(node.id)
        elif isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
            fragments.append(node.name)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            # Only text constants: len() of an int/None would raise and used to
            # push every snippet containing a number onto the fallback path.
            fragments.append(node.value)
    fragments.extend(re.findall(r'#.*', code))

    # dict.fromkeys de-duplicates while keeping first-seen order (a set would
    # make the output order depend on string hash randomisation).
    unique_fragments = dict.fromkeys(fragments)
    return ' '.join(text for text in unique_fragments if len(text) > 4)


In [None]:
import os

# Switch tokenizer parallelism off through its environment variable to avoid warnings
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import re
import ast
from tqdm import tqdm
# import ollama
from langchain_ollama import OllamaLLM

# Load the input table; the labelling loop later reads its 'Prompt' column.
df = pd.read_csv(filepath_or_buffer='data/sorted_data.csv')

# Reduce source code to its human-written text using the standard-library `ast` module
def extract_user_text(code):
    """Extract the human-written text fragments from a piece of source code.

    When ``code`` parses as Python, the result is built from:

    * identifiers referenced in the code (``ast.Name``),
    * class and function names (including ``async def``),
    * string literals (non-string constants such as numbers are skipped),
    * every ``#...`` run found by a plain regex. The regex also matches ``#``
      inside string literals, but duplicates are removed afterwards.

    Fragments of four characters or fewer are dropped. The remaining ones are
    de-duplicated in first-seen order and joined with single spaces, so the
    output is stable between runs and word boundaries survive.

    When ``code`` does not parse, it is returned as text with newlines turned
    into ``'. '`` and four-space indents collapsed to one space.

    Args:
        code: Source text to analyse. Non-string values (e.g. ``None`` or a
            NaN from a pandas column) yield an empty string.

    Returns:
        str: The extracted text, or the flattened input if parsing failed.
    """
    if not isinstance(code, str):
        # Missing values have no user text; previously this crashed inside
        # the fallback with AttributeError.
        return ''

    try:
        tree = ast.parse(code)
    except Exception:
        # Deliberate best effort: the input is frequently not valid Python,
        # so fall back to the lightly flattened raw text.
        return code.replace('\n', '. ').replace('    ', ' ')

    fragments = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            fragments.append(node.id)
        elif isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
            fragments.append(node.name)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            # Only text constants: len() of an int/None would raise and used to
            # push every snippet containing a number onto the fallback path.
            fragments.append(node.value)
    fragments.extend(re.findall(r'#.*', code))

    # dict.fromkeys de-duplicates while keeping first-seen order (a set would
    # make the output order depend on string hash randomisation).
    unique_fragments = dict.fromkeys(fragments)
    return ' '.join(text for text in unique_fragments if len(text) > 4)

# Character classes used to spot Latin and Cyrillic (Russian) letters in a word.
english_regex = re.compile(r'[a-zA-Z]')
russian_regex = re.compile(r'[а-яА-ЯёЁ]')

# Tokenizer and seq2seq model are loaded from the same checkpoint name;
# translate_russian_to_english uses the pair to translate individual words.
translate_tokenizer = AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-ru-en"
)
translate_model = AutoModelForSeq2SeqLM.from_pretrained(
    "Helsinki-NLP/opus-mt-ru-en"
)

def translate_russian_to_english(text_to_translate):
    """Translate the Russian words of a text to English, leaving other words as they are.

    The text is split on whitespace. Every word that contains at least one
    character matched by ``russian_regex`` is passed on its own through
    ``translate_tokenizer`` / ``translate_model``; all other words are copied
    unchanged. The words are re-joined with single spaces, so the original
    whitespace layout is not preserved.

    Args:
        text_to_translate: The text to process. Any ``str`` (including ``str``
            subclasses) takes the normal path. For other objects the function
            tries ``''.join(obj.split())``; if that fails, the error is printed
            and the object is returned unchanged.

    Returns:
        str: The text with Russian words replaced by their translations, or
        the original object when it is not text and cannot be split.
    """
    # isinstance() rather than an exact type comparison: str subclasses used to
    # fall into the non-string branch, which removed all of their whitespace.
    if not isinstance(text_to_translate, str):
        try:
            text_to_translate = ''.join(text_to_translate.split())
        except Exception as e:
            print(f"Error while translating Russian to English: {e}")
            return text_to_translate

    # Per-call memo so a word that occurs several times is only run through the
    # model once. Assumes generation is deterministic (no sampling) - TODO
    # confirm against the checkpoint's generation config.
    translations = {}
    translated_words = []
    for word in text_to_translate.split():
        if not russian_regex.search(word):
            translated_words.append(word)
            continue
        if word not in translations:
            input_tokens = translate_tokenizer.encode(word, return_tensors="pt")
            output_tokens = translate_model.generate(input_tokens)
            translations[word] = translate_tokenizer.decode(
                output_tokens[0], skip_special_tokens=True
            )
        translated_words.append(translations[word])
    return ' '.join(translated_words)

In [6]:
# LangChain Ollama wrapper configured for the "xe/llamaguard3:latest" model;
# pipe_full calls guard_model.invoke() once per processed prompt.
guard_model = OllamaLLM(model="xe/llamaguard3:latest")


In [7]:
def pipe_full(prompts):
    """Run every prompt through extraction, translation and the guard model.

    All prompts are first reduced with ``extract_user_text`` and passed through
    ``translate_russian_to_english``. Only after the whole batch has been
    prepared is ``guard_model.invoke`` called, once per prepared prompt.

    Args:
        prompts: Iterable of raw prompts.

    Returns:
        list: The ``guard_model.invoke`` results, in the order of ``prompts``.
    """
    prepared_prompts = []
    for raw_prompt in prompts:
        user_text = extract_user_text(raw_prompt)
        prepared_prompts.append(translate_russian_to_english(user_text))

    verdicts = []
    for prepared in prepared_prompts:
        verdicts.append(guard_model.invoke(prepared))
    return verdicts

In [None]:
batch_size = 100  # Adjust the batch size based on your memory and performance requirements
labels = []

# Label the prompts batch by batch and checkpoint the partial result to CSV
# after every fifth batch (every 500 rows with the batch size above).
for batch_number, start in enumerate(tqdm(range(0, len(df), batch_size)), start=1):
    end = start + batch_size
    labels.extend(pipe_full(df['Prompt'][start:end]))

    if batch_number % 5 != 0:
        continue  # not a checkpoint batch

    # .loc slices by label and includes the end label, hence the -1;
    # this relies on df having the default 0..n-1 index.
    df.loc[:len(labels)-1, 'Label'] = labels
    df.to_csv('data/result_with_labels.csv', index=False)
    print(f"Data saved to 'data/result_with_labels.csv' after processing {len(labels)} rows")


In [None]:
# Attach the full list of collected labels as the 'Label' column and write the final CSV.
df['Label'] = labels
df.to_csv(path_or_buf='data/result_with_labels.csv', index=False)
print("Final data saved to 'data/result_with_labels.csv'")