In [None]:
# Install libraries
import torch
# Query the CUDA compute capability of the current GPU as a (major, minor) pair.
major_version, minor_version = torch.cuda.get_device_capability()
# Install Unsloth straight from its GitHub repository (no version pin).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Major capability 8 or higher: additionally install packaging, ninja, einops and flash-attn.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Lower capability: the same package set without packaging/ninja/einops/flash-attn.
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
# Select and load the language model
from unsloth import FastLanguageModel
import torch
max_seq_length = 7000  # also passed to SFTTrainer later in the notebook
dtype = None  # NOTE(review): presumably lets Unsloth pick the dtype automatically — confirm
load_in_4bit = True  # request 4-bit loading of the weights


# Load the checkpoint and its tokenizer using the settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-3-4b-it",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Wrap the loaded model with LoRA adapters through Unsloth's get_peft_model helper.
# The result replaces `model`, so re-running this cell wraps the already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # LoRA rank; suggested 8, 16, 32, 64, 128
    # Names of the modules that receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,  # same value as the trainer's `seed`
    use_rslora = False,
    loftq_config = None,
)

In [None]:
# Dataset initialization
# Prompt template with three positional `{}` slots, filled in order with the
# instruction, the input and the response of each training example.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string from the tokenizer; appended to every training text.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into complete training strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output" (one entry per example).

    Returns:
        A dict with a single key "text" holding one string per example:
        ``alpaca_prompt`` filled with the example's three fields, followed
        by ``EOS_TOKEN``.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    # The EOS token must close every sample; without it generation goes on forever.
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in columns
    ]
    return {"text": rendered}


from datasets import load_dataset

# Alternative English dataset, currently disabled:
#dataset = load_dataset('json', data_files="/content/english_dataset_train.jsonl", split = "train")
#test_dataset = load_dataset('json', data_files="/content/english_dataset_test.jsonl", split = "train")

# Load the Russian train and test JSONL files from /content.
# Both calls request split = "train", including the one for the test file.
dataset = load_dataset('json', data_files="/content/russian_dataset_rdf_train.jsonl", split = "train")
test_dataset = load_dataset('json', data_files="/content/russian_dataset_rdf_test.jsonl", split = "train")

# Apply formatting_prompts_func in batched mode; it returns the "text" field
# that the trainer is later told to read (dataset_text_field = "text").
dataset = dataset.map(formatting_prompts_func, batched = True,)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# Configure the fine-tuning parameters
from trl import SFTTrainer
from transformers import TrainingArguments

# NOTE(review): eval_dataset is supplied, but no evaluation schedule is set in
# TrainingArguments below, so it may never be evaluated during training — confirm.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",  # field produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        # Batch size 1 with gradients accumulated over 2 steps.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        warmup_steps = 3,
        #max_steps = 7,
        num_train_epochs=2,
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled: bf16 when torch reports support, fp16 otherwise.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 3,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        gradient_checkpointing = True
    ),
)

In [None]:
# Run fine-tuning; loss values are reported while training runs.
# The object returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# English-language prompt definitions
# Detailed instruction; this is the one prepended to the message in the inference cell.
instruction_for_llm = "extract the detailed rdf from this message in N-triple format. Use detailed predicates. Don't use predicate \"<schema:text>\". For example, <person:george_richards> <schema:jobTitle> \"President\", <msg:25> <prop:earlyLandCost> \"4000 USD per month\"^^<unit:USDollarPerMonth>."
# Shorter instruction; not referenced by any code visible in this notebook.
# NOTE(review): presumably mirrors the instruction text stored in the dataset (spelling included) — confirm before editing.
instruction_for_dataset = "extract the rdf from this message in N-tripple format"
message1 = r""""""  # empty placeholder

message2 = r"""--- Message 1418 ---    Date: Sun, 18 Nov 2001 17:56:21 -0800 (PST)    From: technology.enron@enron.com    To: rudy.acevedo@enron.com, dipak.agarwalla@enron.com,     Welcome to Enron Center South. As you have probably already noticed, things look a little different here. We wanted to orientate you to your new environment and inform you about the future technology coming on-line in the coming weeks.     Occupancy Guidebook To Enron Center South    You will find this handy guide on your desk; it contains information concerning the enhanced printer environment, the new easy access keyboards and telephony procedures.    NEC Monitors    The NEC monitors have an internal anti-glare screen to help reduce eye strain and eliminate the need to install an additional screen cover.  Additionally, an integrated speaker has been installed on one monitor for each desk.    Turret Users    An "Alliance MX turret quick reference guide" has been placed on your desk; this includes important information about the new features such as "Caller ID" and "Voicemail Indication".    Increased
Information Access    There are many NEC 50" Plasma screens installed on the 5th and 6th floors.  While many of these will be displaying information designed for the specific unit, others will be displaying various television channels.  The audio to these television channels can be accessed via any of your telephony equipment.  Here's how:    Ten audio channels have been set up for access from your either your Avaya telephone, Stentophon, or IPC Turret.  The channels have been defined on Page 15 of your Turret.  To access the audio from either your Avaya or Stentophon, simply dial the extensions shown in the table below:    Channel     Avaya   Stentophon          Weather Channel     12401   801         CNN Headlines News  12402   802         CNN Financial News  12403      803         CNBC        12404   804         MSNBC       12405   805         Bloomberg   12406   806         Financial News Network      12407   807    CNN 12408   808         Fox Sports  12409   809         ESPN1       12410   810         Wireless Telephony    You may notice the cellular phone coverage is not consistent across the floor, and some areas have virtually no coverage at all.  We are implementing a multi-network "in-building" system to provide consistent high quality service for the campus, keeping you in touch while you are on the move.      Wireless LAN    The infrastructure to support Wireless LAN technology is in place and is being tested. We will be implementing multi level encryption and security to keep our intellectual property safe from
eavesdroppers or hackers.    Amtel Replacement    As a move to provide a more flexible "Plug n Play" environment and to help with the reduced desk footprint, we have replaced the Amtel message boxes with Microsoft Exchange Instant Messaging. The Global Messaging Team is testing additional software products to provide some of the features not available with Exchange Instant Messaging; these include one-touch response keys, external LED display, and printing.  Updates will be provided as they become available.    Keeping an Open forum    We will keep you informed of the changes and developments as the migration to
the building continues, please feel free to respond with any comments, queries or suggestions to  mailto:Enron.Center.South.Technology@enron.com"""

message3 = r""" --- Message 1421 ---    Date: Sat, 17 Nov 2001 07:19:05 -0800 (PST)    From: lisa@techxans.org    To: k..allen@enron.com    Please note, this is a one
time ONLY email invitation to selected Power Mart Conference attendees.  If you are available and in Houston in December, please join us at our Techxans Energy  Holiday Mixer.    >> >>>>>>>>>>>>>>>>>  *  <<<<<<<<<<<<<<<<<<<<<<<<<<    Our December Holiday Party is hosted by:     > Association of Information Technology Professional (www.aitphouston.org)     > Association for Women In Computing ( www.awchouston.org )     > Digital Eve ( www.digitaleve.com/houston/index.php )     > Greater Houston Partnership ( www.houston.org )     > Houston Java User Group ( www.hjug.org )     > MIT Enterprise Forum of Texas ( www.mitforumtexas.org)     > Techxans (www.techxans.org)     > Women In Technology International ( http://www.witi.org )    >> >>>>>>>>>>>>>>>>>  *  <<<<<<<<<<<<<<<<<<<<<<<<<<    From: Lisa Hoot, Coordinator of Techxans Networking Social    You and your guests  are cordially invited to attend Techxans Holiday Party    =====================================     IT'S ALL ABOUT PERSONAL RELATIONSHIPS    =====================================    WHAT: Techxans Holiday Party            Happy Holiday!  Come join us to celebrate this Holiday Season!            Don't miss this great opportunity to network with members from
        leading Houston technology associations.  Have fun, meet old            friends, and make new ones.            >> RSVP ONLINE: www.techxans.org/signup/holidaypartysignup.htm    WHEN: Thursday, December 13, 2001 from 6:00pm - 9:00pm    WHERE:  Gatsby Social Club                        2540 University Blvd.   (713) 874-1310    WHO:  Techxans welcome all Technology and Industry           Executives, CEO, CIO, VP, Directors, Analysts, Consultants,
 and Business Professionals!!  Dates Are Welcome!!           >> INVITE YOUR FRIENDS & COWORKERS <<           >> RSVP ONLINE: www.techxans.org/signup/holidaypartysignup.htm    DRESS:  Cocktail or Business    ADMISSION:  $10.00 (Cash or Check)  - a percentage of the admission                             will be donated to selected charity    GET:    One (1) Free Drink / Appetizer / Win Cool Door Prizes    CORPORATE SPONSOR:     > Applied Computer Research  ( http://www.acrhq.com)    EVENT SPONSOR:     > Southwest Bio Conference - December 4-5 @Hyatt Downtown           ( http://www.biosouthwest.com)    MEDIA SPONSOR:     > Houston Business Journal           ( http://bizjournals.bcentral.com/houston/ )    DONATION BENEFITING:     > "Variety - The Children's Charity
        ( http://www.usvariety.org/main.html )    >> About Techxans (Networking Social Host)    The purpose of Techxans is to promote networking and community    among the professionals in Houston by providing an entertaining    environments for business, technology professionals, and    entrepreneurs to build new friendships, alliances and resources.    Each month we host "Happy Hours" that further promote networking    and social activities.  Join us by registering for our mailing    & invite list. This will allow us to notify you about our events.    Click to www.Techxans.org    >> Applied Computer Research  ( Major Corporate Sponsor )    Increase Your Sales And Marketing Assets!    Since 1972, the Directory of Top Computer Executives has been    the source that computer industry marketers turn to for up-to-date IT    management information. The Directory gives you immediate access to    more than 48,000 key decision makers at over 24,000 of the largest    U.S. and Canadian IT organizations. Plus, over 1,200 new sites are    added each year.    For more information, please visit - www.acrhq.com    >> Association of Information Technology Professional ( Co-Host )    AITP offers opportunities for Information Technology
(IT) leadership and    education through partnerships with industry, government and academia. AITP    provides quality IT related education, information on relevant IT issues and    forums for networking with experienced peers and other IT professionals.    For more information, please visit - www.aitphouston.org    >> Association For Women In Computing ( Co-Host )    AWC's mission is to provide for the technical professional development of    computing specialists and to provide a formidable network, which is a source    of education, expert information and career opportunities for its members.    For more information, please visit - www.awchouston.org    >> Digital Eve ( Co-Host )    Houston's resource for women interested in technology and living a digital    lifestyle! We are part of an international, non-profit women's networking    group who are here to encourage, educate and empower women of all ages,    education levels and various interests in technology and new media.    For more information, please visit - www.digitalevehouston.org    >> Greater Houston Partnership ( Co-Host )    Do you want to be part of the tech community's voice by serving    as the eyes and ears of the local technology community?    Get involved with the Emerging Business Council by contacting    Linda Flores Olson at lfloresolson@houston.org or 713-844-3682.    For more information, please
visit - www.houston.org    >> Houston Java User Group ( Co-Host )    HJUG is dedicated to the use of the Java(TM) Technology and Lifestyle.    We are one of the many Java User Groups worldwide. All of the HJUG    events, study groups and meetings are FREE to all Java enthusiasts. HJUG    has been created to satisfy all the educational needs about Java for all    levels. Therefore, HJUG proposes technical meetings, business meetings,    and study groups about Java.    For more information, please visit - www.hjug.org    >> MIT Enterprise Forum of Texas ( Co-Host )    Since 1984, the MIT Enterprise Forum of Texas,
based in Houston,    Texas, has offered a basic group of services, which includes professional    seminars, start-up clinics, business plan workshops, case presentations,    and networking opportunities with peers, business specialists and venture    capitalists. Most of the local events are held at the Houston Engineering    and Scientific Society (HESS) building.    For more information, please visit - www.mitforumtexas.org    >> Women In Technology International ( Co-Host )    For more than a decade, WITI has successfully provided women in    technology inspiration, education, conferences, on-line services,
 publications and an exceptional worldwide network of resources.    WITI is the first and only international organization solely dedicated to    advancing
women through technology.  WITI's expansion includes the    development of web-based tools, products and services, innovative    support to women entrepreneurs, early-stage ventures, education,    technology centers, and media networks.    For more information, please visit - www.witi.org    >> About Southwesatisfy all the educational needs about Java for all    levels. Therefore, HJUG proposes technical meetings, business meetings,    and study groups about Java.    For more information, please visit - www.hjug.org    >> MIT Enterprise Forum of Texas ( Co-Host )    Since 1984, the MIT Enterprise Forum of Texas,
based in Houston,    Texas, has offered a basic group of services, which includes professional    seminars, start-up clinics, business plan workshops, case presentations,    and networking opportunities with peers, business specialists and venture    capitalists. Most of the local events are held at the Houston Engineering    and Scientific Society (HESS) building.    For more information, please visit - www.mitforumtexas.org    >> Women In Technology International ( Co-Host )    For more than a decade, WITI has successfully provided women in    technology inspiration, education, conferences, on-line services,
 publications and an exceptional worldwide network of resources.    WITI is the first and only international organization solely dedicated to    advancing
women through technology.  WITI's expansion includes the    development of web-based tools, products and services, innovative    support to women entrepreneurs, early-stage ventures, education,    technology centers, and media networks.    For more information, please visit - www.witi.org    >> About Southwesonal ( Co-Host )    For more than a decade, WITI has successfully provided women in    technology inspiration, education, conferences, on-line services,
 publications and an exceptional worldwide network of resources.    WITI is the first and only international organization solely dedicated to    advancing
women through technology.  WITI's expansion includes the    development of web-based tools, products and services, innovative    support to women entrepreneurs, early-stage ventures, education,    technology centers, and media networks.    For more information, please visit - www.witi.org    >> About Southwes publications and an exceptional worldwide network of resources.    WITI is the first and only international organization solely dedicated to    advancing
women through technology.  WITI's expansion includes the    development of web-based tools, products and services, innovative    support to women entrepreneurs, early-stage ventures, education,    technology centers, and media networks.    For more information, please visit - www.witi.org    >> About Southweseurs, early-stage ventures, education,    technology centers, and media networks.    For more information, please visit - www.witi.org    >> About Southwest Bio conference (Event Sponsor)    December 4 - 5, 2001  Hyatt Regency Downtown Houston    The Southwest BIO Venture Conference and Symposium is one of the    nation's premier biotech and health science venture events.  This two-day    event will combine a venture educational symposium with an opportunity
 for linking investors and related financial organizations with select    ventures in biotechnology, healthcare services, medical devices and    life sciences.  If you have an interest in emerging companies in the    biotechnology and health science industry, you need to attend.    Visit - www.biosouthwest.com    *** If you would like to be added to future event mailings,         please click to www.Techxans.org or send an email to         Lisa@Techxans.org
*** If you would like to be removed from future event mailings,         please send an email request to REMOVE@Techxans.org .    *** PLEASE FORWARD THIS EMAIL TO ALL OF YOUR FRIENDS!!"""

In [None]:
# Russian-language prompt definitions
# This cell rebinds the same names (instruction_for_llm, instruction_for_dataset,
# message1..message3) as the English-prompt cell, so whichever cell ran last wins.
instruction_for_llm = "extract the detailed rdf from this message in N-triple format. Use detailed predicates. Don't use predicate \"<schema:text>\". For example, <person:george_richards> <schema:jobTitle> \"President\", <msg:25> <prop:earlyLandCost> \"4000 USD per month\"^^<unit:USDollarPerMonth>."
# Not referenced by any code visible in this notebook.
instruction_for_dataset = "extract the rdf from this message in N-tripple format"
message1 = r"""--- Сообщение 929 ---      Дата: Пн, 15 мая 2023 10:30:00 +0300      От: Иван Иванов <ivan.ivanov@example.com>      Кому: Алексей Петров <alexey.petrov@example.com>      Здравствуйте, Алексей,            На прошлой неделе я отправил вам письмо, в котором сообщал, что буду в Москве в пятницу, 19 мая. Однако моя поездка была перенесена. Как я уже говорил, я планирую заказать у наших специалистов подготовку проектных чертежей для нового офиса, и я свяжусь с вами,
как только работа будет завершена.            Что касается реализации проекта по созданию жилого комплекса, я собираюсь сотрудничать с менеджером проекта из Санкт-Петербурга, так как их опыт в получении финансирования от государства для таких объектов гораздо больше. Мы уже работаем с командой архитекторов для финализации планировки и начала составления строительных чертежей. Ваша ценовая предложение по строительству остается конкурентоспособным по сравнению с
другими предложениями.            Я по-прежнему рассматриваю вашу компанию как потенциального подрядчика благодаря вашим хорошим связям в регионе. Как только мы определимся с окончательным составом жилых модулей и планом участка, я вновь свяжусь с вами.            С уважением,      Иван Иванов
"""

message2 = r"""--- Сообщение 929 ---    Дата: Пн, 9 апр 2001 02:34:00 -0700 (PDT)    От: ivan.petrov@enron.com    Кому: alexei.sidorov@aol.com    Алексей,    Суммы, необходимые для заполнения пропусков в Таблице "Б", следующие:    Иванов-Общая сумма контракта составляла 23 600 долларов, из них было оплачено 2 375 долларов,
и осталось выплатить лишь 21 225 долларов.    Кузнецов- 2 150 долларов    Петров- 37 800 долларов    Сергей и Михаил оплатили 3 500 долларов за оценку, и я согласен возместить эти расходы.    Общая сумма наличных, которую я и Иван заплатим продавцам, составляет 5 875 долларов (3 500 за оценку и 2 375 за инженерные работы).     Я не нашёл упоминаний о том, что покупатели должны выплатить эту сумму.    Сообщите, пожалуйста, если нужно что-то еще сделать, прежде чем вы передадите это продавцам для подписания.    Иван
"""

message3 = r"""--- Сообщение 915 ---      Дата: Пн, 11 дек 2000 09:15:00 +0300      От: ivan.petrov@mail.ru      Кому: alexey.sidorov@enron.com          Я поговорю с Лутцем по поводу его доли в оплате юридических расходов.          Основной план продвижения объекта "Касса для экипажей":      1. Рассылать листовки всем владельцам квартир в Сегуин (после этого делать звонки потенциальным покупателям).      2. Рассылать листовки владельцам в Сан-Антонио и Остине (аналогичные объекты).      3. Размещать информацию на различных интернет-платформах.      4. Рекламировать через сеть CIB (отправлять по электронной почте более 2000 брокерам).      5. Включить объект в MLS Остина.      6. Размещать объявления в газетах Сан-Антонио и Остина по воскресеньям.      7. Рассылать по электронной
почте список из примерно 400 потенциальных покупателей и брокеров.      8. Обзванивать наиболее заинтересованных покупателей из этого списка.          > -----Исходное сообщение-----      > От: ivan.petrov@mail.ru      > Кому: alexey.sidorov@enron.com      > Отправлено: Пон, 11 дек 2000 09:45      > Тема: Обновление по проекту      >      > Алексей,      >      > В приложении файл с операционной отчетностью за 2000 год и проформа на 2001. В течение этой недели вышлю актуальный список арендаторов.      >      > (См. вложение: rent_schedule.xls)      >      > По поводу земли в Леандере, я работаю с Ваном по получению кредита и оценки. Отправлю чек на сумму 250 долларов.      >      > Неужели я должен был получить чек от Мэтта Лутца на сумму 333 доллара за часть юридических расходов Брэнды Стоун? Кажется, я его не получал. Можешь уточнить?      >      > Когда освободишься, расскажи, пожалуйста, о стратегии продвижения "Касса для экипажей".      >      > Иван
"""

In [None]:
# выполнение запроса к дообученной модели
def format_prompt(user_input):
    """Wrap ``user_input`` in one user turn and open the model's turn.

    Args:
        user_input: Text to place inside the user turn.

    Returns:
        A string consisting of a ``<start_of_turn>user`` line, the user text
        closed by ``<end_of_turn>``, and a final ``<start_of_turn>model`` line
        (terminated by a newline) after which the model is expected to write.
    """
    user_turn = f"<start_of_turn>user\n{user_input}<end_of_turn>\n"
    model_turn_opening = "<start_of_turn>model\n"
    return user_turn + model_turn_opening

def filter_email_headers(message, allowed_headers=("Date", "From", "To")):
    """Keep selected headers plus the body of an email and cut off any forwarded part.

    The message is split on newlines and scanned line by line:

    * Scanning stops at the first line that contains "Forwarded by".
    * While in the header section, a line containing ":" is kept only when the
      text before its first colon (stripped) is one of ``allowed_headers``.
      A non-blank line without a colon is dropped.
    * The first blank line ends the header section.
    * After the header section every non-blank line is kept as body text.

    Args:
        message: Email text with headers, a blank line, then the body.
        allowed_headers: Header names to keep (case-sensitive). Defaults to
            the English names "Date", "From" and "To"; pass for example
            ("Дата", "От", "Кому") for messages with Russian header names.
            A single string is treated as one header name.

    Returns:
        The kept lines joined with three spaces, or the literal string
        "Forwarded message" when no body line was found before the forwarded
        part (or the end of the message).
    """
    if isinstance(allowed_headers, str):
        # Guard against a bare string being iterated character by character.
        allowed_headers = (allowed_headers,)
    allowed = frozenset(allowed_headers)

    kept_lines = []
    in_headers = True
    has_body = False

    for line in message.split('\n'):
        # Everything from the forwarding marker onwards is discarded.
        if "Forwarded by" in line:
            break

        is_blank = not line.strip()
        if in_headers:
            if ':' in line:
                if line.split(':', 1)[0].strip() in allowed:
                    kept_lines.append(line)
            elif is_blank:
                in_headers = False
        elif not is_blank:
            kept_lines.append(line)
            has_body = True

    if not has_body:
        return "Forwarded message"
    return "   ".join(kept_lines)

# Message sent to the model. The previous code first stored
# filter_email_headers(message3) and then immediately overwrote it with
# message2, so only message2 was ever used; that dead store is removed.
# A dedicated name is used instead of `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
# To query a raw multi-line email instead, use filter_email_headers(<message>) here.
message_text = message2
prompt = instruction_for_llm + ": " + message_text

# NOTE(review): training texts are rendered with alpaca_prompt, while this
# prompt is wrapped by format_prompt in chat-turn markers — confirm the
# difference between the training and inference formats is intended.
inputs = tokenizer(
    [format_prompt(prompt)],
    return_tensors="pt",
    padding=False
).to("cuda")


# Generate up to 6000 new tokens and print the decoded first sequence.
outputs = model.generate(**inputs, max_new_tokens=6000)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Save the fine-tuned weights

# Writes to the local "lora_model" directory.
# NOTE(review): presumably this stores only the LoRA adapter, not merged weights — confirm.
model.save_pretrained("lora_model")
# Disabled alternative: save a merged 16-bit model into "outputs".
#model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit")