In [1]:
import os, zipfile
import numpy as np

In [2]:
def load_train_zip(zip_path: str):
    """Read the labelled samples stored in a zip archive.

    A member is kept only when its base name ends in ``.0`` or ``.1``; that
    trailing digit is the sample's label. Directory entries and members
    ending in ``.labels`` are ignored.

    Args:
        zip_path: Location of the training archive.

    Returns:
        A ``(names, contents, labels)`` tuple holding the kept member names,
        their raw bytes, and an integer NumPy array of labels, all in
        archive order.
    """
    kept_names, blobs, targets = [], [], []
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.namelist():
            # Directories and label-listing members carry no sample data.
            if member.endswith(("/", ".labels")):
                continue
            stem = os.path.basename(member)
            if stem.endswith((".0", ".1")):
                # The final character is the label digit ("0" or "1").
                targets.append(int(stem[-1]))
                blobs.append(archive.read(member))
                kept_names.append(member)
    return kept_names, blobs, np.asarray(targets, dtype=int)

def load_test_zip(zip_path: str):
    """Read the unlabelled samples stored in a zip archive.

    Every member is returned except directory entries and members ending in
    ``.labels``.

    Args:
        zip_path: Location of the test archive.

    Returns:
        A ``(names, contents)`` tuple holding the kept member names and their
        raw bytes, in archive order.
    """
    with zipfile.ZipFile(zip_path) as archive:
        kept = [
            member
            for member in archive.namelist()
            if not member.endswith(("/", ".labels"))
        ]
        payloads = [archive.read(member) for member in kept]
    return kept, payloads

In [9]:
# Load the labelled training archive and the unlabelled test archive.
names, rtf, labels = load_train_zip("rtf-train.zip")

test_names, test_rtf = load_test_zip("rtf-test.zip")
# names  -> list of archive member names (test_names: same, for the test set)
# rtf    -> list of the raw file contents as bytes (test_rtf: same, for the test set)
# labels -> array of 0s and 1s, one per training file

In [4]:
print(rtf[0][:200])  # peek at the first 200 raw bytes of the first training file


b'{\\rtf1\\ansi\\ansicpg932\\fromhtml1 \\fbidis \\deff0{\\fonttbl\n\r{\\f0\\fswiss\\fcharset128 MS PGothic;}\n\r{\\f1\\fmodern MS Gothic;}\n\r{\\f2\\fnil\\fcharset2 Symbol;}\n\r{\\f3\\fmodern\\fcharset0 Courier New;}}\n\r{\\colortb'


In [5]:
print(labels[:10])  # labels of the first 10 training files

[0 0 1 0 1 0 0 1 0 0]


In [6]:
print(names[:10])  # archive member names of the first 10 training files

['data/rtf-2016-07/yarwdqagtmfctawo.0', 'data/rtf-2017-01/vlapnlwrzhatumep.0', 'data/rtf-2016-07/liajuvhvcowkqiao.1', 'data/rtf-2016-07/tfggpbbsdetozajq.0', 'data/rtf-2016-07/tqeleowzttodffvq.1', 'data/rtf-2016-07/xicsmraumxayvxhp.0', 'data/rtf-2017-01/qiyblrruzbnjtpkm.0', 'data/rtf-2016-07/aziomjeaqdsgjufe.1', 'data/rtf-2016-07/hquhijegnyxykfsv.0', 'data/rtf-2017-01/howwynoywxampjji.0']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Decode the raw bytes to text. latin-1 maps every byte value to a code point,
# so decoding can never fail and no error handler is needed.
X = [raw.decode('latin1') for raw in rtf]
test_texts = [raw.decode('latin1') for raw in test_rtf]

# NOTE: no hold-out split is made in this cell; the vectorizer is fitted on
# every training document. A stratified train_test_split on (X, labels) would
# be needed here to measure validation metrics locally.

# Vectorize: TF-IDF over word unigrams and bigrams, keeping at most 8000
# vocabulary terms.
vectorizer = TfidfVectorizer(max_features=8000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X)
# The decoded test texts keep their own name so that X_test is only ever the
# feature matrix, never first a list of strings and then a sparse matrix.
X_test = vectorizer.transform(test_texts)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by display name. The commented-out entries are
# alternatives that can be re-enabled for comparison.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    # "Linear SVM": LinearSVC(class_weight='balanced'),
    # "Random Forest": RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1),
}

# Key in `models` whose predictions are written to the submission file.
# Selecting it explicitly means enabling more candidates above cannot silently
# change which model's output ends up in output.csv.
SUBMISSION_MODEL = "Logistic Regression"

# Fit every candidate on the full training matrix and keep its test-set
# predictions under the model's name.
predictions = {}
for model_name, model in models.items():
    model.fit(X_train, labels)
    predictions[model_name] = model.predict(X_test)

y_pred = predictions[SUBMISSION_MODEL]

# zip() would silently truncate on a length mismatch, so check it up front.
assert len(test_names) == len(y_pred), "one prediction expected per test file"

# One "<archive member name>;<predicted label>" line per test file.
with open("output.csv", "w") as f:
    for test_name, pred in zip(test_names, y_pred):
        f.write(f"{test_name};{pred}\n")