In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.svm import LinearSVC

In [36]:

# Read the pre-built train and validation splits from disk in one pass;
# the tuple order (train, val) matches the unpacking order on the left.
df_train, df_val = map(
    pd.read_csv,
    ("../data/train_model.csv", "../data/val_model.csv"),
)


In [37]:
def _text_and_label(frame):
    """Return the ``(text, label)`` pair of columns taken from ``frame``.

    The text is the ``clean_body`` column with missing values replaced by
    the empty string and every entry cast to ``str``; the label is the
    ``rule_violation`` column, returned unchanged.
    """
    text = frame["clean_body"].fillna("").astype(str)
    return text, frame["rule_violation"]


# Same extraction for both splits, so train and validation stay consistent.
X_train_text, y_train = _text_and_label(df_train)
X_val_text, y_val = _text_and_label(df_val)


In [38]:
# Materialise each text Series as a plain Python list of documents.
docs_train, docs_val = (
    texts.tolist() for texts in (X_train_text, X_val_text)
)

In [39]:
# TF-IDF settings collected in one place so they are easy to read and tune.
tfidf_params = {
    "max_features": 20_000,   # cap on the vocabulary size
    "ngram_range": (1, 2),    # unigrams and bigrams
    "min_df": 2,              # drop terms seen in fewer than 2 documents
    "sublinear_tf": True,     # use 1 + log(tf) instead of raw term counts
}
vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from the training documents
# only; the validation documents are merely projected onto that vocabulary.
X_train_tfidf = vectorizer.fit_transform(docs_train)
X_val_tfidf = vectorizer.transform(docs_val)

In [40]:
# Linear SVM trained on the TF-IDF features and scored on the validation split.
clf = LinearSVC(
    C=0.7,
    class_weight=None,  # classes are left unweighted
    max_iter=5000,
    # LinearSVC's dual coordinate descent shuffles the data with a PRNG, so
    # without a fixed seed the fitted coefficients (and hence the metrics
    # printed below) can differ between runs. Pin it so that
    # Restart & Run All reproduces the same model.
    random_state=42,
)

clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_val_tfidf)

# Per-class precision/recall/F1, followed by the F1 of the positive class
# (label 1), which is what f1_score reports with its default binary averaging.
print(classification_report(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.67      0.73       200
           1       0.72      0.84      0.78       206

    accuracy                           0.76       406
   macro avg       0.77      0.76      0.76       406
weighted avg       0.77      0.76      0.76       406

F1: 0.7802690582959642
