# Baseline
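Let's use TF-IDF features + dimensionality reduction (TruncatedSVD rather than plain PCA) + class weights, then compare logistic regression, random forest, extra trees, and a Faiss k-nearest-neighbours classifier.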

In [7]:
import importlib
import modules.preprocess as preprocess
from tqdm import tqdm
import numpy as np
import faiss
importlib.reload(preprocess)
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from multiprocessing import Pool
from joblib import dump, load
random_seed = 42


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_dataset(dataset_path, n_processes=14):
    """Load the preprocessed dataset and return texts and integer-encoded labels.

    The file is parsed with ``preprocess.parse_lemmatized``, its ``text``
    column is replaced by the output of ``preprocess.clean_further``, and
    every entry of that column is then passed through
    ``preprocess.unite_string`` in a worker pool.

    Args:
        dataset_path: Path handed to ``preprocess.parse_lemmatized``.
        n_processes: Number of worker processes for the pool (default 14,
            the value that was previously hard-coded).

    Returns:
        A tuple ``(X, y)``: ``X`` is a NumPy array holding the results of
        ``preprocess.unite_string`` and ``y`` is the ``label`` column encoded
        with ``LabelEncoder``.

    Note:
        The fitted ``LabelEncoder`` is not returned, so callers cannot map the
        encoded labels back to the original label values.
    """
    dataset = preprocess.parse_lemmatized(dataset_path)
    dataset.loc[:, "text"] = preprocess.clean_further(dataset)
    texts = dataset.loc[:, "text"]
    n_rows = texts.shape[0]

    # Pool.imap rejects a chunksize below 1, which ``n_rows // 100000``
    # produces for any dataset smaller than 100000 rows.
    chunksize = max(1, n_rows // 100000)
    with Pool(processes=n_processes) as pool:
        united = pool.imap(preprocess.unite_string, texts, chunksize=chunksize)
        X = np.asarray(list(tqdm(united, total=n_rows)))

    y = preprocessing.LabelEncoder().fit_transform(dataset.loc[:, "label"])
    return X, y

def count_metrics(y_pred, y_test):
    """Print accuracy, then macro-averaged F1, precision and recall.

    Note the argument order: predictions come first, ground truth second.
    One value is printed per line, in the order listed above.
    """
    print(accuracy_score(y_test, y_pred))
    for macro_metric in (f1_score, precision_score, recall_score):
        print(macro_metric(y_test, y_pred, average="macro"))

In [3]:
# Load the texts (X) and integer-encoded labels (y) from the preprocessed CSV.
X, y = load_dataset("preprocessed_serialised/dataset_cleaned_all_title_data_controversial.csv")

  return array(a, dtype, copy=False, order=order)
100%|████████████████████████████████████████████████████████████████████| 1000156/1000156 [00:08<00:00, 113178.60it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000156/1000156 [00:14<00:00, 70496.34it/s]


In [4]:
# Turn the texts into TF-IDF vectors, then compress them to 500 dense
# components with truncated SVD. X is overwritten with the reduced matrix, so
# this cell cannot be re-run without reloading the dataset first.
# NOTE(review): both transformers are fit on the full dataset before the
# train/test split, so the held-out rows influence the features.
tfidfvectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    min_df=5,
    max_df=0.8,
)
svd = TruncatedSVD(n_components=500, random_state=random_seed)
X = svd.fit_transform(tfidfvectorizer.fit_transform(X))

In [5]:
# Split features and labels, then drop the unsplit arrays to free memory.
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on the
# remaining 20% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=random_seed,
)
del X, y

In [16]:
# Logistic-regression baseline: fit on the training split, predict the test split.
# NOTE(review): unlike the tree models below, no class_weight is passed here.
logreg = LogisticRegression(
    random_state=random_seed,
    n_jobs=-1,
    verbose=True,
)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [19]:
# Print accuracy and macro F1 / precision / recall for the current y_pred
# (expected to be the logistic-regression predictions).
count_metrics(y_pred, y_test)

0.11578440868614279
0.07410277865417389


  _warn_prf(average, modifier, msg_start, len(result))


0.1121736073316617
0.10088937679984555


In [6]:
# Random-forest baseline: depth-limited trees with balanced class weights.
random_forest = RandomForestClassifier(
    random_state=random_seed,
    min_samples_leaf=15,
    max_depth=10,
    n_jobs=-1,
    class_weight="balanced",
    verbose=True,
)
random_forest.fit(X_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.0min finished


RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=15, n_jobs=-1, random_state=42,
                       verbose=True)

In [7]:
# Persist the fitted forest with joblib so it can be reloaded without retraining.
dump(random_forest, "models/baseline_random_forest.joblib")

['models/baseline_random_forest.joblib']

In [6]:
# Reload the forest from the path the dump cell writes to.
# joblib.load unpickles the file, so only load model files from a trusted source.
random_forest = load("models/baseline_random_forest.joblib")

In [7]:
# Smoke test: predict only the first five test rows and print the encoded labels.
y_pred = random_forest.predict(X_test[:5])
print(y_pred)

[ 627  609 1070  412  660]


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [8]:
# Memory consumption here is extremely high, so predict the test split in
# fixed-size batches instead of in one call.
# The batches are collected and concatenated once: growing a float
# np.array([]) with np.append turned the integer labels into float64 and
# re-copied the accumulated array on every iteration.
batch_size = 100000
y_pred = np.concatenate([
    random_forest.predict(X_test[st:st + batch_size])
    for st in range(0, X_test.shape[0], batch_size)
])
    

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   35.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.9s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   36.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   35.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   36.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9

In [9]:
# Print accuracy and macro F1 / precision / recall for the current y_pred
# (expected to be the batched random-forest predictions).
count_metrics(y_pred, y_test)

0.03596313076081862
0.02524961457057443


  _warn_prf(average, modifier, msg_start, len(result))


0.04017884369515727
0.036735175574796074


In [6]:
# Extra-trees baseline with the same depth/leaf limits and balanced class
# weights as the random forest, with bootstrap sampling enabled.
extra_tree = ExtraTreesClassifier(
    random_state=random_seed,
    min_samples_leaf=15,
    max_depth=10,
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    verbose=True,
)
extra_tree.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.4s finished


ExtraTreesClassifier(bootstrap=True, class_weight='balanced', max_depth=10,
                     min_samples_leaf=15, n_jobs=-1, random_state=42,
                     verbose=True)

In [7]:
# Predict the test split in fixed-size batches to bound peak memory.
# The batches are collected and concatenated once: growing a float
# np.array([]) with np.append turned the integer labels into float64 and
# re-copied the accumulated array on every iteration.
batch_size = 100000
y_pred = np.concatenate([
    extra_tree.predict(X_test[st:st + batch_size])
    for st in range(0, X_test.shape[0], batch_size)
])

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   10.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   37.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.9s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   36.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.6s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   36.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    9.7s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   37.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   10

In [8]:
# Print accuracy and macro F1 / precision / recall for the current y_pred
# (expected to be the batched extra-trees predictions).
count_metrics(y_pred, y_test)

0.06122543352601156
0.03498944680682709


  _warn_prf(average, modifier, msg_start, len(result))


0.05315046376750011
0.0623537719284782


In [8]:
class FaissKNeighbors:
    """k-nearest-neighbours classifier backed by an exact Faiss L2 index.

    Predictions are a majority vote over the labels of the ``k`` nearest
    training vectors. Labels must be non-negative integers, because the vote
    is counted with ``np.bincount``.
    """

    def __init__(self, k=5):
        """Store the neighbour count; the index is built later by ``fit``."""
        self.index = None
        self.y = None
        self.k = k

    def fit(self, X, y):
        """Build a flat L2 index over ``X`` and remember the labels ``y``.

        Args:
            X: 2-D array of training vectors; converted to a C-contiguous
                float32 array before being added to the index.
            y: Non-negative integer labels, one per row of ``X``.
        """
        vectors = np.ascontiguousarray(X, dtype=np.float32)
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(vectors)
        # Coerce to ndarray so fancy indexing in predict() is positional even
        # when a list or pandas Series is passed in.
        self.y = np.asarray(y)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Args:
            X: 2-D array of query vectors with the same width as the
                training data.

        Returns:
            1-D NumPy array with one predicted label per query row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.index is None:
            raise RuntimeError("FaissKNeighbors.predict called before fit")
        queries = np.ascontiguousarray(X, dtype=np.float32)
        _, indices = self.index.search(queries, k=self.k)
        predictions = []
        for neighbours in indices:
            # Faiss pads the result with -1 when fewer than k neighbours
            # exist; drop those slots so they do not vote as self.y[-1].
            found = neighbours[neighbours >= 0]
            predictions.append(np.argmax(np.bincount(self.y[found])))
        return np.array(predictions)

In [11]:
# Faiss-backed k-NN baseline: index the training split, then classify each
# test row by majority vote over its 10 nearest neighbours.
neigh = FaissKNeighbors(k=10)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [12]:
# Print accuracy and macro F1 / precision / recall for the current y_pred
# (expected to be the Faiss k-NN predictions).
count_metrics(y_pred, y_test)

0.09570129667239494
0.08818080435646192


  _warn_prf(average, modifier, msg_start, len(result))


0.12224857218774755
0.09161534555579368
