## Импорт библиотек

In [16]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from collections import defaultdict
import math
from tqdm.notebook import tqdm

## Параметры

In [17]:
MAX_FEATURES = 1500
NGRAM_RANGE = (1, 2)
MAX_ITER = 500
SAMPLE_SIZE = 30000

## TF-IDF

In [18]:
class tf_idf:
    def __init__(self, max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE):
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocab = {}
        self.idf = {}
    
    def get_ngrams(self, text):
        words = re.findall(r'\w{3,}', text.lower())
        ngrams = []
        for n in range(self.ngram_range[0], min(self.ngram_range[1], 2) + 1):
            ngrams.extend([' '.join(words[i:i+n]) for i in range(len(words)-n+1)])
        return ngrams
    
    def fit(self, texts):
        doc_count = len(texts)
        term_doc_freq = defaultdict(int)
        term_freq = defaultdict(int)
        
        for text in tqdm(texts[:SAMPLE_SIZE], desc="Building vocabulary"):
            ngrams = self.get_ngrams(text)
            seen = set()
            for gram in ngrams:
                term_freq[gram] += 1
                if gram not in seen:
                    term_doc_freq[gram] += 1
                    seen.add(gram)
                    top_terms = sorted(term_freq.items(), key=lambda x: -x[1])[:self.max_features]
        self.vocab = {term: idx for idx, (term, _) in enumerate(top_terms)}
        
        for term in self.vocab:
            self.idf[term] = math.log((doc_count + 1) / (term_doc_freq[term] + 1)) + 1
    
    def transform(self, texts):
        rows = np.zeros((len(texts), len(self.vocab)), dtype=np.float32)
        
        for i, text in enumerate(tqdm(texts, desc="Vectorizing")):
            ngrams = self.get_ngrams(text)
            for gram in ngrams:
                if gram in self.vocab:
                    rows[i, self.vocab[gram]] += 1
        
        return rows * np.array(list(self.idf.values()))

## Логистическая регрессия

In [19]:
class Logistic_regression:
    def __init__(self, learning_rate=0.1, n_iter=MAX_ITER):
        self.lr = learning_rate
        self.n_iter = n_iter
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-np.clip(z, -20, 20)))
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features, dtype=np.float32)
        self.bias = 0
        
        pos_weight = np.sum(y == 0) / np.sum(y == 1)
        indices = np.arange(n_samples)
        batch_size = min(1000, n_samples)
        
        for _ in tqdm(range(self.n_iter), desc="Training"):
            np.random.shuffle(indices)
            for i in range(0, n_samples, batch_size):
                batch_idx = indices[i:i+batch_size]
                X_batch = X[batch_idx]
                y_batch = y[batch_idx]
                
                linear = np.dot(X_batch, self.weights) + self.bias
                preds = self.sigmoid(linear)
                error = preds - y_batch
                
                grad_w = np.dot(X_batch.T, error * np.where(y_batch == 1, pos_weight, 1)) / len(batch_idx)
                grad_b = np.sum(error * np.where(y_batch == 1, pos_weight, 1)) / len(batch_idx)
                
                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b
    
    def predict(self, X, threshold=0.5):
        return (self.sigmoid(np.dot(X, self.weights) + self.bias) >= threshold).astype(int)


### Функции предобработки

In [20]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]|\d|_', ' ', text)
    return ' '.join(text.split())

def extract_url_features(url):
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        return {
            'url_len': min(len(url), 300),
            'domain_len': len(domain),
            'num_dots': domain.count('.'),
            'is_https': int(parsed.scheme == 'https'),
            'has_adult': int(any(kw in url.lower() for kw in ['porn', 'sex', 'xxx', 'adult']))
        }
    except:
        return {'url_len': 0, 'domain_len': 0, 'num_dots': 0, 'is_https': 0, 'has_adult': 0}

def calculate_f1(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1, precision, recall

In [21]:
def run_pipeline():
    # Загрузка данных
    try:
        train = pd.read_csv('/kaggle/input/ml-2025-spring-porn-detection/train.csv', nrows=SAMPLE_SIZE)
        test = pd.read_csv('/kaggle/input/ml-2025-spring-porn-detection/test.csv')
    except:
        train = pd.read_csv('train.csv', nrows=SAMPLE_SIZE)
        test = pd.read_csv('test.csv')
    
    # Балансировка классов
    pos = train[train['label'] == 1]
    neg = train[train['label'] == 0].sample(min(len(pos)*2, len(pos)+50000), random_state=42)
    train = pd.concat([pos, neg]).sample(frac=1, random_state=42)
    
    # Разделение данных
    split_idx = int(0.9 * len(train))
    train_df = train.iloc[:split_idx]
    val_df = train.iloc[split_idx:]
    
    # Предобработка
    print("Preprocessing...")
    for df in [train_df, val_df, test]:
        df['clean_title'] = df['title'].apply(preprocess_text)
    
    # Извлечение признаков
    print("Extracting URL features...")
    url_train = np.array([list(extract_url_features(url).values()) for url in tqdm(train_df['url'])], dtype=np.float32)
    url_val = np.array([list(extract_url_features(url).values()) for url in tqdm(val_df['url'])], dtype=np.float32)
    url_test = np.array([list(extract_url_features(url).values()) for url in tqdm(test['url'])], dtype=np.float32)
    
    # Векторизация текста
    print("Vectorizing text...")
    vectorizer = tf_idf()
    vectorizer.fit(train_df['clean_title'])
    X_text_train = vectorizer.transform(train_df['clean_title'])
    X_text_val = vectorizer.transform(val_df['clean_title'])
    X_text_test = vectorizer.transform(test['clean_title'])
    
    # Объединение признаков
    X_train = np.hstack([url_train, X_text_train])
    X_val = np.hstack([url_val, X_text_val])
    y_train = train_df['label'].values
    y_val = val_df['label'].values
    X_test = np.hstack([url_test, X_text_test])
    # Обучение модели
    print("Training model...")
    model = Logistic_regression()
    model.fit(X_train, y_train)
    
    # Подбор порога
    print("Finding best threshold...")
    thresholds = np.linspace(0.3, 0.7, 11)
    best_thresh = 0.5
    best_f1 = 0
    
    for thresh in thresholds:
        preds = (model.sigmoid(np.dot(X_val, model.weights) + model.bias) >= thresh).astype(int)
        f1, _, _ = calculate_f1(y_val, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    
    # Оценка
    val_preds = (model.sigmoid(np.dot(X_val, model.weights) + model.bias)) >= best_thresh
    f1, prec, rec = calculate_f1(y_val, val_preds)
    
    print(f"\nBest F1: {best_f1:.4f} at threshold {best_thresh:.2f}")
    print(f"Precision: {prec:.4f}, Recall: {rec:.4f}")
    
    # Предсказание
    test_preds = (model.sigmoid(np.dot(X_test, model.weights) + model.bias) >= best_thresh).astype(int)
    submission = pd.DataFrame({'ID': test['ID'], 'label': test_preds})
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission saved!")
    
if __name__ == "__main__":
    run_pipeline()

Preprocessing...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = df['title'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = df['title'].apply(preprocess_text)


Extracting URL features...


  0%|          | 0/10003 [00:00<?, ?it/s]

  0%|          | 0/1112 [00:00<?, ?it/s]

  0%|          | 0/165378 [00:00<?, ?it/s]

Vectorizing text...


Building vocabulary:   0%|          | 0/10003 [00:00<?, ?it/s]

Vectorizing:   0%|          | 0/10003 [00:00<?, ?it/s]

Vectorizing:   0%|          | 0/1112 [00:00<?, ?it/s]

Vectorizing:   0%|          | 0/165378 [00:00<?, ?it/s]

Training model...


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Finding best threshold...

Best F1: 0.9617 at threshold 0.50
Precision: 0.9686, Recall: 0.9549

Submission saved!
