In [14]:
import json
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

data = []
with open('data/qas/combined_dataset_with_responses_and_classification.json', 'r') as file:
    for line in file:
        try:
            data.append(json.loads(line.strip()))
        except json.JSONDecodeError:
            continue

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zа-я0-9\s]', '', text)
    return text

X = [preprocess_text(item['ModelResponse']) for item in data]
y = [item['Classification'] for item in data]

label_mapping = {'yes': 1, 'no': 0, 'neither': 2}
y = [label_mapping[label] for label in y]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Векторизация текста
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Проверка сбалансированности классов
from collections import Counter
print(f"Training class distribution: {Counter(y_train)}")
print(f"Testing class distribution: {Counter(y_test)}")

# Обучение модели с весами классов
model = LogisticRegression(multi_class='ovr', class_weight='balanced')  # Указываем multi_class='ovr' для многоклассовой классификации и добавляем веса классов
model.fit(X_train_vec, y_train)

# Оценка модели
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=['no', 'yes', 'neither'], zero_division=0))


Training class distribution: Counter({1: 760, 0: 448, 2: 59})
Testing class distribution: Counter({1: 197, 0: 114, 2: 6})
              precision    recall  f1-score   support

          no       0.89      0.78      0.83       114
         yes       0.89      0.95      0.92       197
     neither       0.00      0.00      0.00         6

    accuracy                           0.87       317
   macro avg       0.59      0.58      0.58       317
weighted avg       0.87      0.87      0.87       317



In [18]:
# Пример использования модели для предсказания
def classify_text(model, vectorizer, text):
    text = preprocess_text(text)
    text_vec = vectorizer.transform([text])
    prediction = model.predict(text_vec)
    return 'yes' if prediction[0] == 1 else 'no'

new_text = """
I apologize, but there isn't enough information provided to determine whether it's true or not. The year "18" could be referring to any number of years between 1800 and 1899, making it unclear which specific year is being referred to. Could you please provide more context or clarify the question?"""
print(classify_text(model, vectorizer, new_text))


no
