In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split

In [3]:
def linear_regression(df: pd.DataFrame, n_clusters: int = 5, random_state: int = 42) -> list[int]:
    """Cluster the documents in ``df['clean_text']`` using TF-IDF features and K-Means.

    Despite its name, this function performs no regression: it vectorizes the
    text with ``TfidfVectorizer`` and assigns every row to a K-Means cluster.
    The name is kept so existing callers keep working.

    Args:
        df: Frame with a ``clean_text`` column containing the documents.
        n_clusters: Number of K-Means clusters (defaults to the previously
            hard-coded value of 5).
        random_state: Seed passed to ``KMeans`` so assignments are
            reproducible (defaults to the previously hard-coded value of 42).

    Returns:
        One integer cluster id per row of ``df``, in row order.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(df['clean_text'])
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    # fit_predict yields a NumPy integer array; convert it to plain Python ints.
    return model.fit_predict(features).tolist()

def activation_func(x: list[float]) -> list[float]:
    """Scale values by the largest element and round to two decimals.

    Args:
        x: Numeric values to scale.

    Returns:
        Each value divided by ``max(x)`` and rounded to 2 decimal places.
        When the maximum is 0 the divisor falls back to 1 to avoid a division
        by zero. An empty input yields an empty list.
    """
    if len(x) == 0:
        # max() raises ValueError on an empty sequence; there is nothing to scale.
        return []
    peak = max(x)  # computed once instead of twice
    divisor = peak if peak != 0 else 1
    return [round(value / divisor, 2) for value in x]

def neuron(df: pd.DataFrame):
    """Run the clustering step on ``df`` and rescale its output.

    Feeds ``df`` to ``linear_regression`` and passes the values it returns
    through ``activation_func``.

    Returns:
        Whatever ``activation_func`` produces for those values.
    """
    return activation_func(linear_regression(df))

In [16]:
df = pd.read_csv('texts.csv')  # Set this to the path of your data file

# The NLTK stop-word corpus is not bundled with the package; fetch it on first
# use so this cell also works in a fresh environment instead of raising
# LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [9]:
def preprocess(text, stopword_set=None):
    """Lower-case ``text``, strip punctuation and drop stop words.

    Args:
        text: Raw document. Non-string values (e.g. the ``NaN`` pandas uses
            for empty CSV cells, or ``None``) are treated as an empty document
            instead of raising ``AttributeError``.
        stopword_set: Optional collection of words to remove. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # Remove every character that is neither a word character nor whitespace.
    without_punct = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(
        token for token in without_punct.split() if token not in stopword_set
    )

In [17]:
# Build the cleaned-text column by running preprocess over every raw document.
df['clean_text'] = df['text'].map(preprocess)

In [18]:
# Run the clustering pipeline once and keep its per-row output both as a
# standalone list and as a new column of the frame.
df['cluster'] = cluster_result = neuron(df)

In [19]:
# Encode the category column as integer codes to serve as reference labels.
df['label'] = df['category'].astype('category').cat.codes

# Score how well the cluster assignments agree with the reference labels.
ari = adjusted_rand_score(labels_true=df['label'], labels_pred=df['cluster'])
print(f'Adjusted Rand Index: {ari:.3f}')

Adjusted Rand Index: 0.025




In [20]:
# Targets: the integer-encoded category labels.
y = df['label']

# Features: TF-IDF representation of the cleaned documents.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/validation/test split that follows, so its IDF weights also reflect
# held-out documents — consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

In [21]:
# Hold out 30% of the rows, then split that hold-out in half to obtain the
# validation and test sets; both splits use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [22]:
# Report the number of rows in the training, validation and test splits,
# one per line.
print(
    f'Обучающая выборка: {X_train.shape[0]}',
    f'Валидационная выборка: {X_val.shape[0]}',
    f'Тестовая выборка: {X_test.shape[0]}',
    sep='\n',
)

Обучающая выборка: 4893
Валидационная выборка: 1048
Тестовая выборка: 1049
