### About the Dataset:

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: a label that marks whether the news article is real or fake:
           1: Fake news
           0: Real News

In [1]:
import re

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

import itertools
import time

In [3]:
from sklearn.experimental import enable_halving_search_cv

from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, RidgeClassifier, LogisticRegression, Perceptron 

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, StackingClassifier

from sklearn.svm import LinearSVC

from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, RocCurveDisplay

from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import joblib as jl

In [None]:
from sklearn import set_config

set_config(display = 'diagram')

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud, STOPWORDS

nltk.download('punkt')
nltk.download("stopwords")
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pepi_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pepi_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pepi_\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pepi_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
isot_true_news_dataset = pd.read_csv('data/additional_train_test_data/isot_news/True.csv')

In [None]:
isot_fake_news_dataset = pd.read_csv("data/additional_train_test_data/isot_news/Fake.csv")

In [None]:
isot_true_news_dataset["label"] = 0
isot_fake_news_dataset["label"] = 1

In [None]:
source_based_fake_news_classification = pd.read_csv("data/additional_train_test_data/source_based_fake_news_classification/news_articles.csv")

In [None]:
source_based_fake_news_classification.isna().sum()

In [None]:
source_based_fake_news_classification = source_based_fake_news_classification[["title", "text", "label"]]

In [None]:
welfake = pd.read_csv("data/additional_train_test_data/WELFake/WELFake_Dataset.csv")

In [None]:
welfake.head()

In [None]:
welfake = welfake[["title", "text", "label"]]

In [None]:
fake_real_dataset_fake = pd.read_csv("data/additional_train_test_data/fake-and-real/Fake.csv")

In [None]:
fake_real_dataset_fake.shape

In [None]:
fake_real_dataset_fake.head()

In [None]:
fake_real_dataset_fake["label"] = 1

In [None]:
fake_real_dataset_real = pd.read_csv("data/additional_train_test_data/fake-and-real/True.csv")

In [None]:
fake_real_dataset_real.shape

In [None]:
fake_real_dataset_real.head()

In [None]:
fake_real_dataset_real["label"] = 0

In [None]:
additional_data = pd.read_csv("data/fake_or_real_news.csv")


additional_data = additional_data.rename(columns = {"Unnamed: 0": "id"})

additional_data.label = additional_data.label.replace(["REAL", "FAKE"], [0, 1])

additional_data = additional_data[["title", "text", "label"]]

In [None]:
additional_data

In [None]:
data = pd.read_csv("data/additional_train_test_data/data.csv")

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data = data.drop(columns = ["URLs"])

In [None]:
data.columns = ["title", "text", "label"]

In [None]:
news_dataset = pd.concat([isot_true_news_dataset, isot_fake_news_dataset, source_based_fake_news_classification, welfake, fake_real_dataset_fake, fake_real_dataset_real, data])

In [None]:
news_dataset.head()

In [None]:
news_dataset.label = news_dataset.label.replace({"Fake": 1, "Real": 0})

In [None]:
news_dataset.label.value_counts()

In [None]:
news_dataset = news_dataset[~news_dataset.label.isna()]

In [None]:
news_dataset.isna().sum()

In [None]:
news_dataset = news_dataset[["title", "text", "label"]]

In [None]:
news_dataset = news_dataset.fillna('')

In [None]:
news_dataset.isna().sum()

In [None]:
news_dataset.shape

# EDA

In [None]:
news_dataset.shape

In [None]:
news_dataset.isnull().sum()

In [None]:
news_dataset.label.value_counts()

In [None]:
news_dataset = news_dataset.fillna(value = {"title": "", "text": ""})

In [None]:
news_dataset.isna().any()

In [None]:
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [None]:
len(news_dataset)

In [None]:
len(news_dataset[news_dataset.label == 1])

In [None]:
len(news_dataset[news_dataset.label == 0])

In [None]:
print(X,'\n\n\n\n',Y)

# Lemmatization

In [None]:
lemma = WordNetLemmatizer()

In [None]:
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def stemming(data):
    """
    Normalize a single document before vectorization.

    Despite the name, words are lemmatized with the module-level WordNet
    lemmatizer `lemma`, not stemmed. Every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased and split on
    whitespace, English stopwords are dropped, and the remaining words are
    lemmatized and joined back with single spaces.

    Parameters
    ----------
    data : str
        Raw text of one document.

    Returns
    -------
    str
        The cleaned, space-separated text (empty if nothing survives).
    """
    # Fetch the stopwords once per call and test membership against a set;
    # looking the list up again for every token makes this needlessly slow
    # when the function is applied to a whole DataFrame column.
    stop_words = _english_stopwords()
    words = re.sub('[^a-zA-Z]', ' ', data).lower().split()

    return ' '.join(lemma.lemmatize(word) for word in words if word not in stop_words)

In [None]:
news_dataset['title'] = news_dataset['title'].apply(stemming)

# Visualizing

In [None]:
# Feed the word cloud the actual titles. str(Series) only yields the printed
# representation of the column (a handful of head/tail rows, "..." and the
# Name/Length/dtype footer), so almost every title would be ignored.
title_corpus = ' '.join(news_dataset['title'].astype(str))

wordcloud = WordCloud(
    background_color='black',
    stopwords=STOPWORDS,
    max_words=200,
    random_state=42,
).generate(title_corpus)

plt.figure(figsize=(15, 10))
plt.axis("off")
plt.title("Words frequented in text", fontsize=15)
plt.imshow(wordcloud.recolor(colormap='viridis', random_state=42), alpha=0.98)
plt.show()

In [None]:
print(news_dataset['title'])

In [None]:
# Seeded, stratified split of the (already lemmatized) titles so the text
# pipelines fitted on it are reproducible and keep the class balance, like
# the split of the vectorized features further down.
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(
    news_dataset["title"],
    news_dataset["label"],
    stratify=news_dataset["label"],
    random_state=42,
)

# Modelling

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line header and draw the confusion matrix with matplotlib.

    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are plotted as true labels, columns as
        predicted labels.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting. Rows that
        sum to zero are shown as zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the colours, the colour bar and the text
    # threshold all describe the same values that are written in the cells.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Write each value into its cell; switch to white text on dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
def model_report(model, X_test, y_test):
    """
    Print accuracy (in percent) and the classification report of `model` on
    the given data, then plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    X_test : features accepted by `model.predict`
    y_test : true labels for `X_test` (0 = real news, 1 = fake news)
    """
    pred = model.predict(X_test)

    print(accuracy_score(y_test, pred) * 100)
    print(classification_report(y_test, pred))

    cm = confusion_matrix(y_test, pred)

    # confusion_matrix orders rows/columns by sorted label value and this
    # notebook encodes 0 = real, 1 = fake, so index 0 is "Real News".
    plot_confusion_matrix(cm, classes=['Real News', 'Fake News'])

In [None]:
X = news_dataset['title'].values
Y = news_dataset['label'].values

In [None]:
print(X,'\n\n\n\n',Y)

In [None]:
# vectorizer = HashingVectorizer(n_features = 48000, ngram_range = (1, 3))
# TF-IDF features over 1- to 3-grams of the preprocessed titles.
vectorizer = TfidfVectorizer(ngram_range = (1, 3))

# NOTE(review): the vectorizer is fitted on every title here, before the
# train/test split in the next cell, so the vocabulary and IDF weights also
# reflect the rows that later end up in the test set — consider fitting on the
# training portion only and calling transform() on the test portion.
X = vectorizer.fit_transform(X)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, stratify=Y, random_state=42)

In [None]:
labels = additional_data.label

additional_data = additional_data.drop(columns=["label"])

# additional_data["title"] = additional_data["title"].apply(stemming)

# tfidf_v = HashingVectorizer(n_features = 48000, ngram_range = (1, 3))

In [None]:
# Reuse the vectorizer that was fitted on the main corpus. Fitting a second
# TfidfVectorizer here would give a different vocabulary, so column i of
# X_additional would not mean the same n-gram as column i of X_train and the
# models trained on X_train could not be evaluated on it meaningfully.
tfidf_v = vectorizer

# NOTE(review): the main titles were passed through stemming() before being
# vectorized, while these titles are vectorized raw (the apply(stemming) call
# in the previous cell is commented out) — confirm whether that is intended.
X_additional = tfidf_v.transform(additional_data["title"])

X_additional_train, X_additional_test, y_additional_train, y_additional_test = train_test_split(X_additional, labels, random_state=0, test_size = 0.1, stratify = labels)

Logistic Regression

In [None]:
model = LogisticRegression(C = 1e6, max_iter = 10000)
model.fit(X_train, Y_train)

# Accuracy Check

### Training Accuracy 

In [None]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy score of the training data : ', training_data_accuracy)

### Testing Accuracy 

In [None]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy score of the test data : ', test_data_accuracy)

### Model report

In [None]:
model_report(model, X_test, Y_test)

In [None]:
# Evaluate on the whole additional dataset, projected into the feature space
# the model was trained on. Cutting columns off a matrix built with another
# vocabulary does not line the features up with the model's coefficients.
model_report(model, vectorizer.transform(additional_data["title"]), labels)

# Model Prediction

In [None]:
X_new = X_test[1]
X_new_2 = X_test[0]

X_New_3 = X_test[22]

result = model.predict(X_new)
result_1 = model.predict(X_new_2)


def get_type(prediction):
    """Return 'Real' when the first element of `prediction` equals 0, otherwise 'Fake'."""
    return 'Real' if prediction[0] == 0 else 'Fake'
    
print(f"The news is {get_type(result)}")
print(f"The news is {get_type(result_1)}")

In [None]:
model_report(model, X_test, Y_test)

In [None]:
print(Y_test[0])

Decision tree

In [None]:
model2 = DecisionTreeClassifier(random_state = 42)
model2.fit(X_train, Y_train)

In [None]:
X_test_prediction = model2.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Accuracy score of the test data : ', test_data_accuracy)

In [None]:
X_train_prediction = model2.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy score of the training data : ', training_data_accuracy)

In [None]:
model2.score(X_test, Y_test)

In [None]:
pa = PassiveAggressiveClassifier(n_iter_no_change= 50, loss = "squared_hinge")
pa.fit(X_train, Y_train)

In [None]:
model_report(pa, X_test, Y_test)

In [None]:
fig = plt.figure(figsize=(100, 100))

ax = fig.gca()

plot_tree(model2, ax=ax)

plt.show()

In [None]:
plot_tree(model2, max_depth=1)

plt.show()

In [None]:
parameters_PassiveAggressive = {'C': (0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2.),
                                'fit_intercept': (True, False),
                                'n_iter_no_change': (1, 2, 3, 5, 8, 13, 21, 50),
                                'shuffle': (True, False),
                                'loss': ('hinge', 'squared_hinge'),
                                'warm_start': (True, False)}


In [None]:
pa_params = HalvingGridSearchCV(PassiveAggressiveClassifier(n_jobs = -1), parameters_PassiveAggressive, verbose=1)
pa_params.fit(X_train, Y_train)

In [None]:
pa_best = pa_params.best_estimator_

pa_best.fit(X_train, Y_train)

In [None]:
model_report(pa_best, X_test, Y_test)

In [None]:
model_report(pa, X_additional_test[:, :17128], y_additional_test[:17128])

In [None]:
tree_params = {'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15,
                                                               20, 30, 40, 50, 70, 90, 120, 150], 'splitter': ['best', 'random'], 'random_state': [42, 256]}

grid_search = HalvingGridSearchCV(DecisionTreeClassifier(), tree_params)

grid_search.fit(X_train, Y_train)


In [None]:
best_tree_params = grid_search.best_params_

In [None]:
best_tree_params

In [None]:
tree = DecisionTreeClassifier(max_depth=70, random_state=256,splitter='best')

tree.fit(X_train, Y_train)

In [None]:
model_report(tree, X_test, Y_test)

In [None]:
parameters_PassiveAggressive = {'C': (0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2.),
                                'fit_intercept': (True, False),
                                'n_iter_no_change': (1, 2, 3, 5, 8, 13, 21, 50),
                                'shuffle': (True, False),
                                'loss': ('hinge', 'squared_hinge'),
                                'warm_start': (True, False)}


In [None]:
pa_params = HalvingGridSearchCV(PassiveAggressiveClassifier(), parameters_PassiveAggressive, cv = 10)
pa_params.fit(X_train, Y_train)

In [None]:
pa_params.best_params_

In [None]:
# The wrapped estimator is passed positionally (as the other BaggingClassifier
# cells in this notebook do) so the call does not depend on the keyword name,
# which differs between scikit-learn releases.
clf = BaggingClassifier(
    PassiveAggressiveClassifier(n_iter_no_change=50, early_stopping=True, random_state = 42),
    n_estimators = 1,
    random_state = 42,
).fit(X_train, Y_train)

In [None]:
model_report(clf, X_additional_test[:, :17128], y_additional_test)

In [None]:
perc = Perceptron()

perc.fit(X_train, Y_train)

In [None]:
model_report(perc, X_test, Y_test)

In [None]:
sgd = SGDClassifier(loss = 'squared_hinge', random_state = 42, warm_start = True, alpha = .0000000000000001)

sgd.partial_fit(X_train, Y_train, classes = [0, 1])

sgd.partial_fit(X_additional_train[:, :17128], y_additional_train)

In [None]:
model_report(sgd, X_test, Y_test)

In [None]:
model_report(sgd, X_additional_test[:, :17128], y_additional_test)

In [None]:
bag_pa = BaggingClassifier(PassiveAggressiveClassifier(random_state = 42), random_state = 42)

bag_pa.fit(X_train, Y_train)

In [None]:
cal_pa = CalibratedClassifierCV(bag_pa)

cal_pa.fit(X_train, Y_train)

In [None]:
model_report(cal_pa, X_test, Y_test)

In [None]:
model_report(cal_pa, X_additional_test[:, :17128], y_additional_test)

In [None]:
clf1 = BaggingClassifier(PassiveAggressiveClassifier(random_state = 42)).fit(X_train, Y_train)
clf2 = BaggingClassifier(LogisticRegression(random_state = 42)).fit(X_train, Y_train)
clf3 = BaggingClassifier(DecisionTreeClassifier(random_state = 42)).fit(X_train, Y_train)
clf4 = BaggingClassifier(RidgeClassifier(random_state = 42)).fit(X_train, Y_train)
clf5 = BaggingClassifier(SGDClassifier(random_state = 42)).fit(X_train, Y_train)
clf6 = BaggingClassifier(Perceptron(random_state = 42)).fit(X_train, Y_train)
clf7 = BaggingClassifier(RandomForestClassifier(n_estimators = 50, random_state = 42)).fit(X_train, Y_train)

final_clf = BaggingClassifier(AdaBoostClassifier(random_state = 42)).fit(X_train, Y_train)

In [None]:
stack = StackingClassifier(
    estimators = [
        ('pa', clf1),
        ('lgr', clf2),
        ('tree', clf3),
        ('ridge', clf4),
        ('sgd', clf5),
        ('perceptron', clf6),
        ('f', clf7),
    ],
    final_estimator = final_clf,
    stack_method = 'predict'
)


stack.fit(X_train, Y_train)

In [None]:
model_report(stack, X_test, Y_test)

In [None]:
model_report(stack, X_additional_test[:, :17128], y_additional_test)

In [None]:
vote = VotingClassifier(
    estimators = [
        ('lgr', clf2),
        ('tree', clf3),
        ('sgd', clf5),
        ('perceptron', clf6),
        ('f', clf7),
    ],
    voting = 'soft'
)

vote.fit(X_train, Y_train)

In [None]:
model_report(vote, X_test, Y_test)

In [None]:
model_report(vote, X_additional_test[:, :17128], y_additional_test)

In [None]:
ada = BaggingClassifier(
    AdaBoostClassifier(
        LogisticRegression(),
        random_state = 42,
        n_estimators = 100,
        learning_rate = 2
    ),
    random_state = 42
).fit(X_train, Y_train)

In [None]:
ada.score(X_test, Y_test)

In [None]:
model_report(ada, X_additional_test[:, :17128], y_additional_test)

Ridge Classifier Test

In [None]:
ridge = RidgeClassifier(alpha = 1000)

ridge.fit(X_train, Y_train)

In [None]:
model_report(ridge, X_test, Y_test)

In [None]:
model2 = LogisticRegression(penalty = 'elasticnet', C = 10e6, solver = 'saga', l1_ratio = 0.3, max_iter = 1000)

model2.fit(X_train, Y_train)

In [None]:
model_report(model2, X_additional_test[:, :17128], y_additional_test)

Multinomial NB

In [None]:
param_grid = {'alpha': [1e-10, 1e-8, 1e-6, 1e-4, 1e-2, 1e-1, 1, 10, 100, 1e4]}

In [None]:
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid = param_grid, verbose = 3)

In [None]:
grid_search_nb.fit(X_train, Y_train)

In [None]:
grid_search_nb.best_params_

In [None]:
nb_tuned = MultinomialNB(alpha = 0.1)

In [None]:
nb_tuned.fit(X_train, Y_train)

In [None]:
model_report(nb_tuned, X_test, Y_test)

In [None]:
model_report(nb_tuned, X_additional_test, y_additional_test)

In [None]:
neighbours_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [10, 15, 20, 25, 30, 35, 40]
}

search = HalvingGridSearchCV(KNeighborsClassifier(n_jobs = -1), param_grid = neighbours_grid, verbose = 3)

In [None]:
search.fit(X_train, Y_train)

In [None]:
search.best_params_

In [None]:
clf = KNeighborsClassifier(n_neighbors = 9, weights = 'distance')

In [None]:
clf.fit(X_train, Y_train)

In [None]:
model_report(clf, X_test, Y_test)

In [None]:
model_report(clf, X_additional_test, y_additional_test)

In [None]:
model_report(clf, X_additional, labels)

In [None]:
graph = clf.kneighbors_graph().toarray()

In [None]:
graph.shape

## LinearSVC

In [None]:
linsvc = LinearSVC(random_state = 42, tol = 1e-9)

In [None]:
linsvc_grid = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'C': [1e-4, 1e-2, 1e-1, 1, 10, 1e2, 1e4],
    'max_iter': [1000, 1500, 2000, 2500, 3000]
}

linear_svc_tuner = HalvingGridSearchCV(
    LinearSVC(random_state = 42),
    linsvc_grid,
    random_state = 42,
    verbose = 3,
    error_score = 0,
    scoring = 'roc_auc'
)

In [None]:
linear_svc_tuner.fit(X_train, Y_train)

In [None]:
linear_svc_tuner.best_params_

In [None]:
linsvc.fit(X_train, Y_train)

In [None]:
model_report(linsvc, X_additional, labels)

## Best model

In [None]:
linsvc_with_best_params = LinearSVC(
    C = 1,
    class_weight = 'balanced',
    fit_intercept = False,
    loss = 'squared_hinge',
    max_iter = 2000,
    penalty = 'l2',
    random_state = 42
)

In [None]:
linsvc_with_best_params.fit(X_train, Y_train)

In [None]:
linsvc_with_best_params.score(X_test, Y_test)

In [None]:
RocCurveDisplay.from_estimator(linsvc_with_best_params, X_test, Y_test)

In [None]:
linsvc_with_best_params.score(X_additional, labels)

In [None]:
model_report(linsvc_with_best_params, X_additional, labels)

### Creating pipeline for fake news classification

In [None]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Does lemmatization and stopwords removal.

    Stateless transformer: `fit` learns nothing, `transform` maps every
    document to its cleaned form, so it can be the first step of a Pipeline
    that receives raw strings.
    """
    def __init__(self):
        # Kept as a set so the per-word membership test in normalize() does
        # not scan the whole stopword list for every token.
        self.stopwords = set(stopwords.words("english"))

    def normalize(self, document):
        """
        Clean one document.

        Non-letter characters become spaces, the text is lower-cased and split
        on whitespace, English stopwords are removed and the remaining words
        are lemmatized with WordNet.

        Parameters
        ----------
        document : str

        Returns
        -------
        str
            The surviving lemmas joined by single spaces.
        """
        lemma = WordNetLemmatizer()
        # split() without arguments already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        words = re.sub('[^a-zA-Z]', ' ', document).lower().split()

        return ' '.join(lemma.lemmatize(word) for word in words if word not in self.stopwords)

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the transformer works in a Pipeline."""
        return self

    def transform(self, documents):
        """Return a list with the normalized form of every document, in order."""
        return [self.normalize(document) for document in documents]

In [None]:
fake_news_pipeline = Pipeline(
    steps = [
        ('lemmatization', TextNormalizer()),
        ('hashing', HashingVectorizer(n_features = 48000, ngram_range = (1, 3))),
        ('classifier', LinearSVC(class_weight = 'balanced', fit_intercept = False, max_iter = 2000, random_state = 42))
    ]
)

In [None]:
fake_news_pipeline.fit(X_text_train, y_text_train)

In [None]:
sample = news_dataset.sample(50, random_state = 42)

titles = sample["title"]

labels = sample["label"]

predictions = fake_news_pipeline.predict(titles)

model_report(fake_news_pipeline, titles, labels)

In [None]:
model_report(fake_news_pipeline, titles, labels)

### Perceptron


The Perceptron is a simple classification algorithm suitable for large-scale learning. By default:

* It does not require a learning rate.

* It is not regularized (penalized).

* It updates its model only on mistakes.

The last characteristic implies that the Perceptron is slightly faster to train than SGD with the hinge loss and that the resulting models are sparser.

In [None]:
perc_grid = {
    'penalty': [None, 'l1', 'l2', 'elasticnet'],
    'alpha': [1e-2, 1e-1, 1, 10, 100],
    'fit_intercept': [True, False],
    'shuffle': [True, False],
    'class_weight': [None, 'balanced']
}

In [None]:
perc_search = HalvingRandomSearchCV(Perceptron(random_state = 42), perc_grid, error_score = 0, scoring = 'f1', verbose = 3, random_state = 42)

In [None]:
perc_search.fit(X_train_more, y_train_more)

In [None]:
perc_search.best_params_

In [None]:
perc_best = Perceptron(**perc_search.best_params_, random_state = 42)

In [None]:
# RFE is used below but is not among the imports at the top of the notebook.
from sklearn.feature_selection import RFE

# NOTE(review): HASHING_N_FEATURES and FEATURE_SELECTION_MAX_FEATURES are not
# defined in any visible cell — define them before running this cell.
perc_pipe = Pipeline(
    steps = [
        ('lemmatization', TextNormalizer()),
        ('hashing', HashingVectorizer(n_features = HASHING_N_FEATURES, ngram_range = (1, 5))),
        ('selection', RFE(Perceptron(random_state = 42), n_features_to_select = FEATURE_SELECTION_MAX_FEATURES, step = 10000)),
        ('perceptron', perc_best)
    ],
    verbose = 3
)

In [None]:
perc_pipe.fit(X_text_train, y_text_train)

In [None]:
perc_pipe.score(X_additional_text_test, y_additional_text_test)

In [None]:
perc_pipe.score(X_text_test, y_text_test)

In [None]:
model_report(perc_pipe, X_text_test, y_text_test)

### SGD

In [None]:
# Unregularized SGD classifier. `penalty = None` is used instead of the string
# 'none', which newer scikit-learn releases reject during parameter validation.
sgd = SGDClassifier(
    loss = 'modified_huber',
    penalty = None,
    learning_rate = 'adaptive',
    eta0 = 100,
    random_state = 42,
    early_stopping = True
)

In [None]:
sgd_pipe = Pipeline(
    steps = [
        ('lemmatization', TextNormalizer()),
        ('hashing', HashingVectorizer(n_features = HASHING_N_FEATURES, ngram_range = (1, 5), norm = None)),
        ('selection', RFE(sgd, n_features_to_select = FEATURE_SELECTION_MAX_FEATURES, step = 10000)),
        ('sgd', sgd)
    ],
    memory = 'cache',
    verbose = 3
)

In [None]:
sgd_pipe.fit(X_text_train, y_text_train)

In [None]:
sgd_pipe.score(X_additional_text_test, y_additional_text_test)

### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp = MLPClassifier(
    verbose = 3,
    hidden_layer_sizes = (10),
    random_state = 42,
    nesterovs_momentum = True,
    max_iter = 10
)

In [None]:
mlp_pipe = Pipeline(
    steps = [
        ('lemmatization', TextNormalizer()),
        ('hashing', HashingVectorizer(n_features = 100000, ngram_range = (1, 3))),
#         ('selection', RFE(Perceptron(random_state = 42), n_features_to_select = FEATURE_SELECTION_MAX_FEATURES, step = 10000)),
        ('mlp', mlp)
    ],
    memory = 'cache',
    verbose = 3
)

In [None]:
mlp_pipe.fit(X_text_train, y_text_train)

In [None]:
mlp_pipe.score(additional_data["title"], labels)

### Comparison of Perceptron, SGD and MLP

In [None]:
from sklearn.metrics import f1_score, accuracy_score, log_loss

In [None]:
# Predict once per pipeline and reuse the result for every metric; each
# predict() call re-runs text normalization and hashing on the whole test set.
comparison_predictions = {
    'Perc': perc_pipe.predict(X_additional_text_test),
    'SGD': sgd_pipe.predict(X_additional_text_test),
    'MLP': mlp_pipe.predict(X_additional_text_test),
}

# NOTE(review): log_loss is fed hard class predictions here, not predicted
# probabilities, so the "LogLoss" numbers mostly restate the error rate —
# confirm this is what is wanted.
comparison_metrics = [
    ('f1', f1_score),
    ('Acc', accuracy_score),
    ('LogLoss', log_loss),
]

for position, (metric_name, metric) in enumerate(comparison_metrics):
    for model_name, predicted in comparison_predictions.items():
        print(f'{model_name} {metric_name}: ')
        print(metric(y_additional_text_test, predicted))
    # Blank line between metric groups, none after the last one.
    if position < len(comparison_metrics) - 1:
        print('')

## LinearSVC and Naive Bayes with CountVectorizer

### LinearSVC

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [None]:
linsvc_tfidf_pipe = Pipeline(
    steps = [
        ('normalizer', TextNormalizer()),
        ('count_vect', CountVectorizer(ngram_range = (1, 5))),
        ('tf-idf', TfidfTransformer(use_idf = False, sublinear_tf = True)),
#         ('model', MultinomialNB(alpha = 0.001))
        ('model', LinearSVC(random_state = 42, class_weight = 'balanced'))
    ],
    verbose = 3
)

In [None]:
linsvc_tfidf_pipe.fit(X_text_train, y_text_train)

In [None]:
linsvc_tfidf_pipe.score(X_text_test, y_text_test)

In [None]:
linsvc_tfidf_pipe.score(X_additional_text_test, y_additional_text_test)

### Naive Bayes

In [None]:
nb_tfidf_pipe = Pipeline(
    steps = [
        ('normalizer', TextNormalizer()),
        ('count_vect', CountVectorizer(ngram_range = (1, 5))),
        ('tf-idf', TfidfTransformer(use_idf = False, sublinear_tf = True)),
        ('model', MultinomialNB(alpha = 0.001))
    ],
    verbose = 3
)

In [None]:
nb_tfidf_pipe.fit(X_text_train, y_text_train)

In [None]:
nb_tfidf_pipe.score(X_additional_text_test, y_additional_text_test)