# Machine Learning Baseline (With Retriever)

In [None]:
import pandas as pd
# Forward-slash paths are accepted on Windows as well as Linux/macOS; the
# previous backslash-separated raw strings only resolved on Windows.
DATA_DIR = "../data/Dataset"

train = pd.read_csv(f"{DATA_DIR}/train_final.csv")
test = pd.read_csv(f"{DATA_DIR}/test1_final.csv")
test2 = pd.read_csv(f"{DATA_DIR}/test2_final.csv")

# Same sanity check for every split: dimensions, then the first and last rows.
for split_df in (train, test, test2):
    print("shape: ", split_df.shape)
    display(split_df.head())
    display(split_df.tail())

In [None]:
import gc
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizers import normalizers, pre_tokenizers, trainers, Tokenizer, models
from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from tqdm.auto import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import scipy.sparse as sp

# Name of the column that holds the text in every frame used below.
target_col = 'text'

# Normalise the text column: cast to str, then trim surrounding whitespace.
train[target_col] = train[target_col].astype(str).str.strip()
test[target_col] = test[target_col].astype(str).str.strip()

# Tokenizer settings.
LOWERCASE = False
VOCAB_SIZE = 30522

# Byte-level BPE tokenizer: NFC normalisation always, lowercasing only when
# LOWERCASE is enabled.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Extra tokenizer-training text: rows with holistic_essay_score > 4, with the
# essay column renamed to match `target_col`. Built as one chain so no frame is
# mutated in place.
# NOTE(review): this is an absolute Kaggle input path while the other data
# files in this notebook use relative paths — confirm the intended environment.
hq_pers = (
    pd.read_csv('/kaggle/input/persuade-2-0/persuade_2.0_human_scores_demo_id_github.csv')
    .loc[lambda frame: frame['holistic_essay_score'] > 4]
    .rename(columns={'full_text': target_col})
)

# ignore_index gives the combined frame a clean 0..n-1 index instead of the
# overlapping labels of its two sources; preserve_index=False keeps the pandas
# index out of the Dataset so it holds only the text column.
tokenizer_df = pd.concat([test, hq_pers], ignore_index=True)
dataset = Dataset.from_pandas(tokenizer_df[[target_col]], preserve_index=False)

def train_corp_iter(batch_size=1000):
    """Yield the tokenizer-training texts in consecutive batches.

    Reads the module-level ``dataset`` and ``target_col``.

    Args:
        batch_size: Number of rows per yielded batch (default 1000, the
            previously hard-coded value). The final batch may be shorter.

    Yields:
        The ``target_col`` values of ``dataset[start : start + batch_size]``
        for each successive ``start``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # A negative step would silently yield nothing, so reject it explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size][target_col]

# Learn the BPE merges from the batched corpus produced by train_corp_iter().
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# registering the same special tokens that were given to the trainer.
_special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(tokenizer_object=raw_tokenizer, **_special_token_roles)
tokenizer.save_pretrained('persuade_tokenizer')

# One list of token strings per document, test split first and then train.
tokenized_texts_test = list(map(tokenizer.tokenize, test[target_col].tolist()))
tokenized_texts_train = list(map(tokenizer.tokenize, train[target_col].tolist()))

def dummy(text):
    """Return ``text`` unchanged.

    Passed as both the ``tokenizer`` and the ``preprocessor`` callback of the
    ``TfidfVectorizer`` instances in this notebook, which are fed documents
    that have already been split into token lists.
    """
    return text

# Keyword arguments shared by both TF-IDF passes: 3- to 5-grams over the
# pre-tokenised documents, with `dummy` as a pass-through tokenizer and
# preprocessor.
_tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test documents only, to collect their n-gram vocabulary.
vectorizer = TfidfVectorizer(**_tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_

# Pass 2: pin that vocabulary, fit on the train documents, and transform both
# splits with the resulting vectorizer.
vectorizer = TfidfVectorizer(vocabulary=vocab, **_tfidf_kwargs)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# Report (n_documents, n_features) for train and then test.
for _matrix in (tf_train, tf_test):
    print(_matrix.shape)

In [None]:
# Forward-slash paths resolve on Windows and POSIX alike; the previous
# backslash-separated raw strings only worked on Windows.
RETRIEVER_DIR = "../data/Retriever Dataset"
train_features = pd.read_parquet(f"{RETRIEVER_DIR}/RAG_results_train_features.parquet")
test_features = pd.read_parquet(f"{RETRIEVER_DIR}/RAG_results_test1_features.parquet")

train_features_sparse = sp.csr_matrix(train_features.values)
test_features_sparse = sp.csr_matrix(test_features.values)

# Slice back to the TF-IDF columns before appending the retriever features.
# On the first run this is a no-op; on a re-run of this cell it stops the
# retriever columns from being stacked onto tf_train / tf_test a second time.
n_tfidf = len(vectorizer.vocabulary_)
tf_train = sp.hstack([tf_train.tocsr()[:, :n_tfidf], train_features_sparse], format="csr")
tf_test = sp.hstack([tf_test.tocsr()[:, :n_tfidf], test_features_sparse], format="csr")

print(tf_train.shape)
print(tf_test.shape)

y_train = train['generated'].values

# Fixed seed so the stochastic learners give the same model on every run.
SEED = 42

# NOTE(review): MultinomialNB rejects negative feature values — confirm the
# retriever feature columns are all non-negative.
estimators = [
    ('mnb', MultinomialNB(alpha=0.02)),
    ('sgd', SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", random_state=SEED)),
    ('lgb', LGBMClassifier(learning_rate=0.05, random_state=SEED))
]
# Soft-voting weights, listed in the same order as `estimators`.
weights = [0.05, 0.225, 0.5]

ensemble = VotingClassifier(
    estimators=estimators,
    weights=weights, voting='soft', n_jobs=-1
)
ensemble.fit(tf_train, y_train)
_ = gc.collect()

In [None]:
# Bare expression: the notebook renders the fitted VotingClassifier as this cell's output.
ensemble

### Predict

In [None]:
# Keep the test matrix sparse (CSR) instead of calling .toarray(): the ensemble
# was fitted on a sparse matrix, and densifying the full n-gram feature space
# can exhaust memory.
tf_vectors = tf_test.tocsr()
y_true = test['generated'].values

# Column 1 of predict_proba is the second entry of ensemble.classes_
# (presumably label 1, i.e. "generated" — confirm the labels are 0/1).
final_preds_proba = ensemble.predict_proba(tf_vectors)[:, 1]

# Hard labels: 1 when the probability reaches 0.5, else 0.
y_pred = (final_preds_proba >= 0.5).astype(int)

In [None]:
import pandas as pd

# One row per test document: reference label, thresholded prediction, and the
# probability the prediction was derived from.
_prediction_columns = {
    'y_true': y_true,
    'y_pred': y_pred,
    'predicted_proba': final_preds_proba,
}
df = pd.DataFrame(_prediction_columns)

# Write without the pandas index so the CSV holds exactly the three columns.
df.to_csv('ml_prediction_test1.csv', index=False)

# The message below is Chinese for "saved successfully!".
print("ml_prediction_test1.csv 已成功保存！")

### Metrics

In [None]:
# Threshold the probabilities in final_preds_proba at 0.5 to get 0/1 labels.
_is_positive = final_preds_proba >= 0.5
y_pred = _is_positive.astype(int)

# Accuracy is the fraction of predictions that equal the reference labels.
_matches = y_pred == y_true
accuracy = _matches.mean()
print(f"Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.metrics import recall_score

# Recall of the thresholded predictions against the reference labels, using
# recall_score's default settings.
recall = recall_score(y_true=y_true, y_pred=y_pred)
print(f"Recall: {recall:.4f}")