In [None]:
import zipfile
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Load the labelled training e-mails from the archive.
# The class label (0 or 1) is taken from each member's file extension.
emails, labels = [], []

# The context manager closes the archive handle as soon as all members have
# been read, instead of keeping it open for the lifetime of the kernel.
with zipfile.ZipFile("spam1-train.zip") as z:
    for name in z.namelist():
        # Skip the label file
        if name.endswith(".labels"):
            continue
        base = os.path.basename(name)
        # Only members whose extension is ".0" or ".1" are training samples.
        if not base.endswith((".0", ".1")):
            continue
        lab = int(base.rsplit(".", 1)[-1])
        # Undecodable bytes are dropped on purpose; text is lower-cased.
        txt = z.read(name).decode("utf-8", errors="ignore").lower()
        emails.append(txt)
        labels.append(lab)

labels = np.array(labels)

In [None]:
# Pipeline: TF-IDF (word + character n-grams) -> LinearSVC
# Word-level uni- and bigrams; terms occurring in fewer than 2 documents are ignored.
vect_word = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=2)
# Character 3- to 5-grams taken only from text inside word boundaries.
vect_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=2)
# Both vectorizers see the same raw text; their outputs are concatenated.
features = FeatureUnion([("w", vect_word), ("c", vect_char)])
# A fixed seed keeps the fitted model (and therefore the exported predictions)
# reproducible across runs: LinearSVC shuffles the data when its dual solver
# is used.
pipe = make_pipeline(features, LinearSVC(random_state=42))

In [None]:
# models = {
#     "MultinomialNB": MultinomialNB(),
#     "ComplementNB": ComplementNB(),
#     "BernoulliNB": BernoulliNB(),
#     "LogisticRegression": LogisticRegression(max_iter=1000),
#     "SGD (log)": SGDClassifier(loss="log_loss", max_iter=1000),
#     "SGD (hinge)": SGDClassifier(loss="hinge", max_iter=1000),
#     "LinearSVC": LinearSVC()
# }

In [None]:
# # Hold-out validation check
# X_tr, X_val, y_tr, y_val = train_test_split(
#     emails, labels, test_size=0.2, random_state=42, stratify=labels
# )
# pipe.fit(X_tr, y_tr)
# print("BACC:", balanced_accuracy_score(y_val, pipe.predict(X_val)))

In [None]:
# # Data
# X, y = emails, labels

# # Cross-validated comparison of the candidate models
# for name, model in models.items():
#     pipe = make_pipeline(features, model)
#     scores = cross_val_score(pipe, X, y, cv=5, scoring="balanced_accuracy")
#     print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")

In [None]:
# Final training run: fit the pipeline on every loaded training e-mail
# (no hold-out split), so the test predictions use all labelled data.
pipe.fit(X=emails, y=labels)

In [None]:
import os
import glob

# Locate the test archive in the working directory. glob returns matches in
# arbitrary order, so sort them to make the choice deterministic when several
# files match the pattern.
test_zip_candidates = sorted(glob.glob("*test*.zip"))
if not test_zip_candidates:
    # Fail with a clear message instead of a bare IndexError.
    raise FileNotFoundError(
        "No test archive matching '*test*.zip' found in the working directory"
    )
test_zip = test_zip_candidates[0]

# test_files keeps the archive member path of each e-mail, index-aligned with
# test_emails.
test_emails, test_files = [], []

# The context manager closes the archive handle once all members are read.
with zipfile.ZipFile(test_zip) as ztest:
    for name in ztest.namelist():
        if name.endswith(".labels"):
            continue
        base = os.path.basename(name)
        # Only members with the ".x" extension are treated as test e-mails.
        if not base.endswith(".x"):
            continue
        # Decode with undecodable bytes dropped, then lower-case.
        txt = ztest.read(name).decode("utf-8", errors="ignore").lower()
        test_emails.append(txt)
        test_files.append(name)

In [None]:
# Classify the test e-mails with the fitted pipeline.
predictions = pipe.predict(test_emails)

# Write one "<archive member path>;<predicted label>" line per test e-mail.
# The encoding is set explicitly so the output does not depend on the
# platform's default locale (which could fail on non-ASCII member names).
with open("predictions.txt", "w", encoding="utf-8") as f:
    for path, pred in zip(test_files, predictions):
        f.write(f"{path};{pred}\n")