In [34]:
import nltk
nltk.download('punkt')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Загрузка данных
data = pd.read_csv('train.csv')
train_data = data[:5000]
# Предварительная обработка текста
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    words = nltk.word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

train_data['processed_text'] = train_data['text'].apply(preprocess_text)

# Создание TF-IDF модели
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_data['processed_text'])
y = train_data['y']

# Обучение классификатора
classifier = make_pipeline(StandardScaler(with_mean=False), SVC())
classifier.fit(X, y)

# Функция для предсказания необходимости выхода в интернет для нового вопроса
def predict_internet_access(question):
    processed_question = preprocess_text(question)
    X_new = vectorizer.transform([processed_question])
    prediction = classifier.predict(X_new)
    if prediction[0] == 1:
        return 1, "Для ответа на этот вопрос требуется выход в интернет."
    else:
        return 0, "Для ответа на этот вопрос выход в интернет не требуется."

# Ввод нового вопроса с клавиатуры
new_question = input("Введите ваш вопрос: ")
print(predict_internet_access(new_question))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['processed_text'] = train_data['text'].apply(preprocess_text)


Введите ваш вопрос: how many fingers does a person have on one hand?
(0, 'Для ответа на этот вопрос выход в интернет не требуется.')


In [25]:
data = pd.read_csv('train.csv')

train_data = data[:5000]
test_data = data[5000:]

In [27]:
test_data

Unnamed: 0,text,y
5000,Who discovered tetanus?,0
5001,Who is the father of modern chemistry?,0
5002,What is the current status of MEA Flight ME 6?,1
5003,What is Linus Pauling famous for?,0
5004,Who was the Roman Emperor during the peak of t...,0
...,...,...
5596,What are the places to visit in Spain?,1
5597,How do you avoid pickpockets when traveling?,0
5598,Who voiced Basil in 'The Great Mouse Detective'?,0
5599,What is the axis of the Earth?,0


In [39]:
true_counter = 0

for t in test_data.itertuples():
    result, text = predict_internet_access(t.text)

    print(f'Question: {t.text}\nResult: {result}\n----------------\n')

    if result == t.y:
      true_counter += 1

true_counter / len(test_data)

Question: Who discovered tetanus?
Result: 0
----------------

Question: Who is the father of modern chemistry?
Result: 0
----------------

Question: What is the current status of MEA Flight ME 6?
Result: 1
----------------

Question: What is Linus Pauling famous for?
Result: 0
----------------

Question: Who was the Roman Emperor during the peak of the Roman Empire?
Result: 1
----------------

Question: What is an example of a commodity?
Result: 0
----------------

Question: What's the best walking route from Akihabara Station to the Ueno Zoo in Tokyo?
Result: 1
----------------

Question: What are the typical foods in Chile?
Result: 1
----------------

Question: Who are the most known Baroque painters?
Result: 0
----------------

Question: Which actor played 'Terminator' in the series?
Result: 0
----------------

Question: What is the Kuiper belt?
Result: 0
----------------

Question: What is the latest research on cancer therapy?
Result: 1
----------------

Question: Who discovered t

0.9168053244592346

In [42]:
from sklearn.metrics import precision_score, recall_score

y_true = test_data['y']
y_pred = test_data['y_pred']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

f1 = (2 * precision * recall) / (precision + recall)

print("F1 равен:", f1)

F1 равен: 0.912280701754386
