In [6]:
import transformers
from transformers import BitsAndBytesConfig
import torch
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from json import loads, dumps
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import scan_cache_dir
MODEL_PATH = "lzw1008/ConspEmoLLM-v2"


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, device_map="auto")
quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto"  # lets HF put it on GPU(s)
)


tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

Fetching 3 files:   0%|          | 0/3 [00:37<?, ?it/s]


In [None]:
df = pd.read_json('train_rehydrated.jsonl',lines=True)
print(df['text'])

for x in df['text']:
    print(type(df['_id']))
    #print(df['text'])

0       A great article on what's taking place in Boli...
1       Chris Lehto interviews Ashton Forbes about his...
2       Germany has upset other EU member states by se...
3       Redditors are, just like most social media use...
4       u/DLWzll shared a couple days ago how the Virg...
                              ...                        
4311    I’ll never understand why Feminists say it’s b...
4312    Biden crime family at it again. Caroline Biden...
4313    Gab Comment text:  \n ReclaimTheNet.org @recla...
4314    I made for geopolitics:  \n While the American...
4315    This article is a fascinating look back at the...
Name: text, Length: 4316, dtype: object
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'

In [None]:
class customdataset(Dataset):
    def __init__(self, pandasframe):
        self.dataframe = pandasframe


    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self,idx):
        ids = self.dataframe["_id"].iloc[idx] 
        text = self.dataframe["text"].iloc[idx]
        return ids ,text




In [None]:
training_data = DataLoader( customdataset( pandasframe=df),batch_size=4)
torch.cuda.empty_cache()
model.eval()
rows = []
label = re.compile(r'Label\s*[:\-]?\s*([01])\b')
with torch.no_grad():
    for ids,texts in training_data:
        prompts = [
            f"""Classify the following text strictly as:
0 = non-conspiracy
1 = conspiracy

Text: {t}

Label:
"""
            for t in texts
        ]
    
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length = 384
        ).to(device)

        generate_ids = model.generate(**inputs, max_new_tokens=16,num_beams=1,do_sample=False)

        decoded = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)
        for id, text in zip(ids, decoded):
            m = label.search(text)
            label_id = int(m.group(1)) if m else None
            label_str = {0: "No", 1: "Yes"}.get(label_id, "No")


            rows.append({"_id": id, "conspiracy": label_str, "rawtext": text})

results = pd.DataFrame(rows,columns=["_id","conspiracy","rawtext"])
results.to_json("test.jsonl", orient="records", lines=True)






Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
newdata = pd.read_json("test.jsonl", lines=True)
newdata = newdata.drop(columns=["rawtext"])
newdata.to_json("newtest.jsonl", orient="records", lines=True)
