In [1]:
import autorootcwd

- `CODE_artificial_prediction.txt`: posterior probabilities for the validation data of the artificial dataset.
- `CODE_artificial_features.txt`: selected features for the artificial dataset.
- `CODE_spam_prediction.txt`: posterior probabilities for the validation data of the spam dataset.
- `CODE_spam_features.txt`: selected features for the spam dataset.

In [2]:
import numpy as np
from pathlib import Path

In [3]:
from src.data import artificial, spam
from src.evaluation import evaluate, search_knn, search_rf, search_xgboost

In [4]:
# Folder that receives the prediction and feature files written further down.
# The path is relative, so it resolves against the current working directory.
# NOTE(review): presumably `autorootcwd` has moved the cwd to the project root — confirm.
output_folder = Path("results/")
# exist_ok=True keeps a re-run from failing when the folder is already there.
output_folder.mkdir(exist_ok=True)

## artificial dataset

In [5]:
# Load the artificial training split; called without arguments, prepare_data()
# yields a pair that is unpacked into the feature matrix and the label vector.
X_train_artificial, y_train_artificial = artificial.prepare_data()
# Sanity check: the recorded run displays ((2000, 500), (2000,)).
X_train_artificial.shape, y_train_artificial.shape

((2000, 500), (2000,))

In [6]:
# With test=True a single object is bound (no labels are unpacked).
# NOTE(review): presumably the unlabeled split to predict on — confirm in src.data.artificial.
X_test_artificial = artificial.prepare_data(test=True)
# Sanity check: the recorded run displays (600, 500), i.e. the same 500 columns as training.
X_test_artificial.shape

(600, 500)

In [7]:
# Column indices of the features kept for the artificial dataset; they are
# used to slice the feature matrices and are written to the features file.
artificial_best_features = (
    48, 241, 318, 338,
    378, 433, 455, 472,
)

# Keyword arguments forwarded verbatim to KNeighborsClassifier.
best_params_knn = dict(n_neighbors=5, p=2, weights='distance')

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Restrict both splits to the selected feature columns. The subsets get their
# own names instead of overwriting X_train_artificial / X_test_artificial, so
# this cell can be re-run without indexing an already-reduced matrix.
artificial_columns = list(artificial_best_features)
X_train_artificial_selected = X_train_artificial[:, artificial_columns]
X_test_artificial_selected = X_test_artificial[:, artificial_columns]

knn = KNeighborsClassifier(**best_params_knn)
knn.fit(X_train_artificial_selected, y_train_artificial)

# The prediction file is specified as posterior probabilities, so emit
# predict_proba rather than the hard labels returned by predict.
# predict_proba orders its columns like knn.classes_ (sorted labels); the last
# column therefore belongs to the largest label.
# NOTE(review): assumes the largest label is the positive class — confirm.
artificial_test_proba = knn.predict_proba(X_test_artificial_selected)[:, -1]

# save to artificial_prediction.txt (one probability per row)
# NOTE(review): the notebook's file list names the files with a "CODE" prefix,
# which this file name omits — confirm the expected name.
np.savetxt(output_folder / "artificial_prediction.txt", artificial_test_proba, fmt="%.6f")

# save artificial_features.txt (one column index per row)
np.savetxt(output_folder / "artificial_features.txt", artificial_columns, fmt="%d")

## spam dataset

In [5]:
# Load the spam training split; called without arguments, prepare_data()
# yields a pair that is unpacked into the feature matrix and the label vector.
X_train_spam, y_train_spam = spam.prepare_data()
# Sanity check: the recorded run displays ((4572, 7911), (4572,)).
X_train_spam.shape, y_train_spam.shape

((4572, 7911), (4572,))

In [6]:
# With test=True a single object is bound (no labels are unpacked).
# NOTE(review): presumably the unlabeled split to predict on — confirm in src.data.spam.
X_test_spam = spam.prepare_data(test=True)
# Sanity check: the recorded run displays (1000, 7911), i.e. the same 7911 columns as training.
X_test_spam.shape

(1000, 7911)

In [7]:
# Column indices of the features kept for the spam dataset (75 entries, in
# ascending order); they are used to slice the feature matrices and are
# written to the features file.
spam_best_features = (
    255, 302, 318, 533, 942, 1130, 1181, 1602, 1632, 1655,
    1700, 1769, 1851, 1909, 1932, 1937, 2129, 2428, 3005, 3047,
    3082, 3269, 3330, 3431, 3439, 3590, 3605, 3671, 3712, 3821,
    3833, 3840, 4113, 4252, 4340, 4500, 4548, 4574, 4635, 4706,
    4758, 4857, 4945, 5017, 5034, 5076, 5110, 5357, 5535, 5748,
    5845, 5919, 6149, 6171, 6406, 6426, 6641, 6943, 6969, 7014,
    7093, 7120, 7256, 7284, 7357, 7360, 7555, 7583, 7664, 7691,
    7718, 7747, 7811, 7872, 7877,
)

# Keyword arguments forwarded verbatim to RandomForestClassifier.
best_params_rf = dict(n_estimators=200)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Restrict both splits to the selected feature columns. The subsets get their
# own names instead of overwriting X_train_spam / X_test_spam, so this cell can
# be re-run without indexing an already-reduced matrix.
spam_columns = list(spam_best_features)
X_train_spam_selected = X_train_spam[:, spam_columns]
X_test_spam_selected = X_test_spam[:, spam_columns]

# Pin the seed so that refitting reproduces the same forest and therefore the
# same output file. The default is listed first, so a random_state supplied in
# best_params_rf still takes precedence.
rf = RandomForestClassifier(**{"random_state": 42, **best_params_rf})
rf.fit(X_train_spam_selected, y_train_spam)

# The prediction file is specified as posterior probabilities, so emit
# predict_proba rather than the hard labels returned by predict.
# predict_proba orders its columns like rf.classes_ (sorted labels); the last
# column therefore belongs to the largest label.
# NOTE(review): assumes the largest label is the positive class — confirm.
spam_test_proba = rf.predict_proba(X_test_spam_selected)[:, -1]

# save to spam_prediction.txt (one probability per row)
# NOTE(review): the notebook's file list names the files with a "CODE" prefix,
# which this file name omits — confirm the expected name.
np.savetxt(output_folder / "spam_prediction.txt", spam_test_proba, fmt="%.6f")

# save spam_features.txt (one column index per row)
np.savetxt(output_folder / "spam_features.txt", spam_columns, fmt="%d")
