In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('toxic_comments.csv')

In [8]:
df

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\r\nWhy the edits made under my use...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\r\nMore\r\nI can't make any real suggestions...",0
4,4,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...
159287,159446,""":::::And for the second time of asking, when ...",0
159288,159447,You should be ashamed of yourself \r\n\r\nThat...,0
159289,159448,"Spitzer \r\n\r\nUmm, theres no actual article ...",0
159290,159449,And it looks like it was actually you who put ...,0


------

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['toxic'], test_size=0.2, random_state=42)

In [22]:
model_tfidf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
model_tfidf.fit(X_train, y_train)

In [23]:
y_pred = model_tfidf.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1*100}')

F1 Score: 73.35470488402791


-------

In [24]:
tfidf_vectorizer = TfidfVectorizer()

In [25]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df)

In [27]:
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()[0]

In [30]:
sorted_keywords = [word for _, word in sorted(zip(tfidf_scores, feature_names), reverse=True)]

In [31]:
print("Ключевые слова:", sorted_keywords)

Ключевые слова: ['unnamed', 'toxic', 'text']


In [32]:
documents = [
    "Машинное обучение - это интересная область.",
    "Обучение с учителем - ключевой аспект машинного обучения.",
    "Область NLP также связана с машинным обучением."
]

# Создание объекта TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Применение TF-IDF к текстовым данным
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Получение списка ключевых слов и их значения TF-IDF для первого документа
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()[0]

# Сортировка слов по значениям TF-IDF
sorted_keywords = [word for _, word in sorted(zip(tfidf_scores, feature_names), reverse=True)]

print("Ключевые слова:", sorted_keywords)


Ключевые слова: ['это', 'машинное', 'интересная', 'обучение', 'область', 'учителем', 'также', 'связана', 'обучения', 'обучением', 'машинным', 'машинного', 'ключевой', 'аспект', 'nlp']
