Задание №1 

$ pip install yargy

In [None]:
from yargy import Parser, rule, and_, not_,or_
from yargy.interpretation import fact
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline,pipeline
from yargy.predicates import gram, is_capitalized, dictionary, caseless, normalized, gte, lte
import pandas as pd

In [None]:
import requests

def download_file(url, path, timeout=30):
    """Download ``url`` with an HTTP GET and write the response body to ``path``.

    This is a best-effort helper: any ``requests`` failure (connection error,
    timeout, non-2xx status) is reported with ``print`` and not re-raised.

    Args:
        url: Address to fetch.
        path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so an error page is never
        # written to disk as if it were the requested file.
        response.raise_for_status()
        with open(path, 'wb') as file:
            file.write(response.content)
        print(f"Скачал файл с {url} к {path}")
    except requests.exceptions.RequestException as e:
        print(f"Ошибка загрузки файла с {url}: {e}")

# NOTE(review): this URL looks like a git clone address rather than a direct
# link to a gzip archive — confirm it really serves news.txt.gz (TODO).
url = 'https://github.com/XpysTIK/main.git'
# Local file name the downloaded bytes are written to.
path = 'news.txt.gz'
# download_file prints request errors instead of raising them, so a failed
# download does not stop the notebook at this point.
download_file(url, path)


In [None]:
import gzip
from dataclasses import dataclass
from typing import Iterator, List

@dataclass
class Text:
    """One corpus record: the three tab-separated fields of a line."""
    # First field of the line.
    label: str
    # Second field of the line.
    title: str
    # Third field of the line (everything after the second tab).
    text: str

def read_texts(fn: str) -> List[Text]:
    """Read a gzip-compressed, UTF-8, tab-separated corpus into ``Text`` records.

    Each non-blank line must have the form ``label<TAB>title<TAB>text``.
    The line is split on the first two tabs only, so tabs inside the text
    field are kept, and a record whose text field is empty is still accepted.
    Blank lines are skipped.

    Args:
        fn: Path to the ``.gz`` file.

    Returns:
        The records in file order.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    texts = []
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Drop only the line terminator: stripping all whitespace would
            # also remove a trailing tab and lose an empty last field.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            fields = line.split("\t", 2)
            if len(fields) != 3:
                raise ValueError(
                    f"{fn}:{line_no}: expected 3 tab-separated fields, got {len(fields)}"
                )
            label, title, text = (field.strip() for field in fields)
            texts.append(Text(label, title, text))
    return texts

# Load the whole corpus into memory; the archive is looked up relative to the
# current working directory.
texts = read_texts("news.txt.gz")


In [None]:
# Import every helper from the yargy submodule that defines it. The top-level
# package only re-exports Parser and the rule/predicate combinators, so taking
# predicates or pipelines from `yargy` itself fails with ImportError.
from yargy import Parser, rule, and_, or_
from yargy.interpretation import fact
from yargy.pipelines import morph_pipeline
from yargy.predicates import gram, gte, lte, normalized, is_capitalized

# Fact schemas that the grammar fills in.
Person = fact("Person", ["name", "birth_date", "birth_place"])
Name = fact("Name", ["first", "last"])
Birth_date = fact('Birth_date', ['day', 'month', 'year'])
Birth_place = fact('Birth_place', ['place'])

# Integer token in the range 1..31.
Day = and_(gte(1), lte(31))

# Month names, matched in any inflected form.
Month = morph_pipeline([
    "Январь", "Февраль", "Март", "Апрель", "Май", "Июнь", "Июль", "Август", "Сентябрь", "Октябрь", "Ноябрь", "Декабрь"
])

# Integer token in the range 1..2023.
# NOTE(review): the lower bound also admits day-sized numbers and the upper
# bound is hard-coded — confirm both are intended.
Year = and_(gte(1), lte(2023))

# The morphological analyser tags first names as 'Name' and surnames as 'Surn';
# Russian words such as "Имя" are not grammemes and would never match.
NAME = rule(
    gram("Name").interpretation(Name.first.inflected()),
    gram("Surn").interpretation(Name.last.inflected())
).interpretation(Name)

# `normalized` compares against the lemma, which is lower-case ("год").
BIRTH_DATE = rule(
    Day.interpretation(Birth_date.day).optional(),
    Month.interpretation(Birth_date.month).optional(),
    normalized('в').optional(),
    Year.interpretation(Birth_date.year),
    normalized('год').optional()
).interpretation(Birth_date).optional()

# "в <Capitalized word>" is taken as the birth place.
BIRTH_PLACE = rule(
    normalized('в'),
    is_capitalized().interpretation(Birth_place.place)
).interpretation(Birth_place).optional()

# Either slot may follow the verb, in either order; both alternatives are optional.
BIRTH_DATE_OR_PLACE = or_(
    BIRTH_DATE.interpretation(Person.birth_date),
    BIRTH_PLACE.interpretation(Person.birth_place)
)

# The lemma of "родился" is the infinitive "родиться"; matching on the lemma
# covers every inflected form of the verb.
PERSON = rule(
    NAME.interpretation(Person.name),
    normalized('родиться'),
    BIRTH_DATE_OR_PLACE,
    BIRTH_DATE_OR_PLACE
).interpretation(Person)

parser = Parser(PERSON)


In [None]:
from tqdm import tqdm

# Run the grammar over every article body and print each extracted fact,
# with a progress bar over the corpus.
for doc in tqdm(texts, disable=False):
    found = parser.findall(doc.text)
    for hit in found:
        print(hit.fact)

Задание №2

In [None]:
#2.1
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# `texts` is already loaded from news.txt.gz, so the archive is not downloaded
# again here. Each element is a Text record, so tokenise its `.text` field:
# the record itself has no `.lower()`.
tokenized_texts = [word_tokenize(doc.text.lower()) for doc in texts]

model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

model.save("word2vec_model")

# Look the probe word up defensively: indexing a word that is missing from the
# vocabulary raises KeyError.
probe_word = 'авто'
if probe_word in model.wv:
    word_vector = model.wv[probe_word]
    print(word_vector)
else:
    word_vector = None
    print(f"Слова '{probe_word}' нет в словаре модели")

In [None]:
#2.2
from sklearn.model_selection import train_test_split
import numpy as np

# `texts` is already in memory, so the archive is not downloaded again.
# The class labels live on the Text records; collect them in corpus order.
labels = [doc.label for doc in texts]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenise the article body of each record (the record itself has no `.lower()`).
tokenized_train_texts = [word_tokenize(doc.text.lower()) for doc in train_texts]
tokenized_test_texts = [word_tokenize(doc.text.lower()) for doc in test_texts]

# Fit the embeddings on the training split only, so no test text leaks into them.
model = Word2Vec(sentences=tokenized_train_texts, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(model, doc):
    """Return the mean embedding of the tokens of ``doc`` known to ``model``.

    Args:
        model: Object exposing ``wv`` (supports ``in`` and indexing by token)
            and ``vector_size``.
        doc: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors of the in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when none are known.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in doc if token in keyed_vectors]
    if len(known) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per document, in the same order as the token lists.
train_vectors = [document_vector(model, doc) for doc in tokenized_train_texts]

test_vectors = [document_vector(model, doc) for doc in tokenized_test_texts]

# Show the sizes and a short preview only: printing every vector of every
# document floods the notebook output.
PREVIEW_COUNT = 3
print(f"Train documents: {len(train_vectors)}, test documents: {len(test_vectors)}")

for i, vector in enumerate(train_vectors[:PREVIEW_COUNT]):
    print(f"Train Document {i+1} Vector: {vector}")

for i, vector in enumerate(test_vectors[:PREVIEW_COUNT]):
    print(f"Test Document {i+1} Vector: {vector}")

In [None]:
#2.3
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report

# Features are the averaged document vectors; targets are the split labels.
X_train, y_train = train_vectors, train_labels
X_test, y_test = test_vectors, test_labels

# Fit a linear-kernel SVM and predict the held-out split.
classifier = svm.SVC(kernel='linear').fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Score the predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Точность: {accuracy}")
print("Классификационный отчет:", report, sep="\n")

In [None]:
#2.4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# `texts` is already in memory, so the archive is not downloaded again.
# TfidfVectorizer works on strings, so pass the article bodies rather than the
# Text records, and take the class labels from the same records.
raw_texts = [doc.text for doc in texts]
labels = [doc.label for doc in texts]

train_texts, test_texts, train_labels, test_labels = train_test_split(raw_texts, labels, test_size=0.2, random_state=42)

# token_pattern=None: a custom tokenizer is supplied, so the default regex is
# unused; passing None avoids the warning about the ignored pattern.
tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True, token_pattern=None)
# Learn the vocabulary and IDF weights on the training split only.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

classifier_tfidf = SVC(kernel='linear')
classifier_tfidf.fit(X_train_tfidf, train_labels)

y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)

accuracy_tfidf = accuracy_score(test_labels, y_pred_tfidf)
report_tfidf = classification_report(test_labels, y_pred_tfidf)

print(f"TF-IDF Точность: {accuracy_tfidf}")
print("TF-IDF Классификационный отчет:")
print(report_tfidf)

Задание №3

$ pip install transformers

In [None]:
#3.1 Causal
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Both the tokenizer and the weights come from the same checkpoint.
SOLAR_CHECKPOINT = "Upstage/SOLAR-10.7B-Instruct-v1.0"

tokenizer = AutoTokenizer.from_pretrained(SOLAR_CHECKPOINT)
# Load in half precision and let the library place the weights on the
# available devices (device_map="auto").
model = AutoModelForCausalLM.from_pretrained(
    SOLAR_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Single-turn chat history in role/content message form.
conversation = [ {'role': 'user', 'content': 'Hello?'} ] 

# Render the conversation as a prompt string (tokenize=False) with the
# generation prompt appended (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

# Tokenize to PyTorch tensors and move them to the device the model is on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device) 
# NOTE(review): max_length presumably bounds prompt plus generated tokens
# together — confirm, and consider max_new_tokens if only the reply should be capped.
outputs = model.generate(**inputs, use_cache=True, max_length=4096)
# Decode the first returned sequence back to text.
output_text = tokenizer.decode(outputs[0]) 
print(output_text)

User:
Hello?

Assistant:
Hello, how can I assist you today? Please feel free to ask any questions or request help with a specific task.


In [None]:
#3.2 Masked
from transformers import pipeline
# Build a fill-mask pipeline on the roberta-base checkpoint.
unmasker = pipeline('fill-mask', model='roberta-base')
# Last expression of the cell, so the candidate fillings for <mask> are
# rendered as the cell output.
unmasker("Hello I'm a <mask> model.")

[{'sequence': "<s>Hello I'm a male model.</s>",
  'score': 0.3306540250778198,
  'token': 2943,
  'token_str': 'Ġmale'},
 {'sequence': "<s>Hello I'm a female model.</s>",
  'score': 0.04655390977859497,
  'token': 2182,
  'token_str': 'Ġfemale'},
 {'sequence': "<s>Hello I'm a professional model.</s>",
  'score': 0.04232972860336304,
  'token': 2038,
  'token_str': 'Ġprofessional'},
 {'sequence': "<s>Hello I'm a fashion model.</s>",
  'score': 0.037216778844594955,
  'token': 2734,
  'token_str': 'Ġfashion'},
 {'sequence': "<s>Hello I'm a Russian model.</s>",
  'score': 0.03253649175167084,
  'token': 1083,
  'token_str': 'ĠRussian'}]

In [None]:
from transformers import RobertaTokenizer, RobertaModel
# Load the roberta-base tokenizer and the PyTorch RobertaModel.
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` names
# that earlier cells also assign.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
# return_tensors='pt' asks the tokenizer for PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')
# Forward pass; the result is stored in `output` and not displayed.
output = model(**encoded_input)

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
# Same feature-extraction example with the TensorFlow model class.
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` names again.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
# return_tensors='tf' asks the tokenizer for TensorFlow tensors.
encoded_input = tokenizer(text, return_tensors='tf')
# Forward pass; the result is stored in `output` and not displayed.
output = model(encoded_input)

In [None]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model='roberta-base')

# A notebook cell renders only its last expression, so calling the pipeline
# twice in a row silently drops the first result. Collect the predictions for
# both prompts and display them together.
bias_probe_prompts = (
    "The man worked as a <mask>.",
    "The Black woman worked as a <mask>.",
)
bias_probe_results = {prompt: unmasker(prompt) for prompt in bias_probe_prompts}
bias_probe_results

[{'sequence': '<s>The man worked as a mechanic.</s>',
  'score': 0.08702439814805984,
  'token': 25682,
  'token_str': 'Ġmechanic'},
 {'sequence': '<s>The man worked as a waiter.</s>',
  'score': 0.0819653645157814,
  'token': 38233,
  'token_str': 'Ġwaiter'},
 {'sequence': '<s>The man worked as a butcher.</s>',
  'score': 0.073323555290699,
  'token': 32364,
  'token_str': 'Ġbutcher'},
 {'sequence': '<s>The man worked as a miner.</s>',
  'score': 0.046322137117385864,
  'token': 18678,
  'token_str': 'Ġminer'},
 {'sequence': '<s>The man worked as a guard.</s>',
  'score': 0.040150221437215805,
  'token': 2510,
  'token_str': 'Ġguard'}]

unmasker("The Black woman worked as a <mask>.")

[{'sequence': '<s>The Black woman worked as a waitress.</s>',
  'score': 0.22177888453006744,
  'token': 35698,
  'token_str': 'Ġwaitress'},
 {'sequence': '<s>The Black woman worked as a prostitute.</s>',
  'score': 0.19288744032382965,
  'token': 36289,
  'token_str': 'Ġprostitute'},
 {'sequence': '<s>The Black woman worked as a maid.</s>',
  'score': 0.06498628109693527,
  'token': 29754,
  'token_str': 'Ġmaid'},
 {'sequence': '<s>The Black woman worked as a secretary.</s>',
  'score': 0.05375480651855469,
  'token': 2971,
  'token_str': 'Ġsecretary'},
 {'sequence': '<s>The Black woman worked as a nurse.</s>',
  'score': 0.05245552211999893,
  'token': 9008,
  'token_str': 'Ġnurse'}]