In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

import glob
import os
import zipfile
from xml.etree import ElementTree as ET

# Text proccessing
import re

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import stopwords



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Text_Loading

In [None]:
with zipfile.ZipFile('./datasets/pan20-author-profiling-training-2020-02-23.zip', 'r') as zip_ref:
  zip_ref.extractall('./datasets')

In [None]:
data_path = './datasets/pan20-author-profiling-training-2020-02-23/'
lang = 'en'

In [None]:
labels_path = data_path + lang + '/truth.txt'
true_values = {}

file = open(labels_path)
for line in file:
    line_parsed = line.strip().split(':::')
    true_values[line_parsed[0]] = line_parsed[1]

# Text_Proccessing

In [None]:
def get_tweets(file):
  """
  Arguments:
    file - file to proccess
  Returns:
    array of tweets
  """
  tweets = []
  parsed_file = ET.parse(file)
  documents = parsed_file.iter('document')

  for doc in documents:
    tweets.append(clean_text(doc.text))

  return tweets

In [None]:
def clean_text(text):
  """
  Arguments:
    text: text to be parsed
  Returns
    text without special characters
  """
  text = re.sub(r"\n", " ", text)
  text = re.sub(r"sp", " ", text)
  # text = text.lower()
  text = re.sub(r"[^a-zA-Z ]", " ", text) # remove everything expect a-z
  text = re.sub(r"\b\w{1,1}\b", " ",text) # remove everything of length 1
  text = " ".join([x for x in text.split()])
  return text

In [None]:
def get_tweets_representation(tweets):
  """
  Arguments:
    tweets: array of tweets
  Returns:
    array with string concatenation of tweets
  """
  return np.array(np.array2string(np.array(tweets)))

In [None]:
def get_tweets_pos_tags(text):
  """
  Arguments:
    tweets: text
  Returns:
    dictionary pos
  """
  return pos_tag(word_tokenize(text))

In [None]:
def get_tweets_features(tweets):
  """
  Arguments:
    tweets - array of tweets
  Returns:
    mean tweet size
  """
  tweet_lengths = [len(tweet) for tweet in tweets]
  mean_tweet_length = np.mean(tweet_lengths)
  std_tweet_length = np.std(tweet_lengths)
  return [mean_tweet_length, std_tweet_length]

In [None]:
X = []
X_tweets = []
X_pos = []
X_extra = []
y = []

for file in glob.glob(data_path + lang + "/*.xml"):
  user_code = file.split('/')[-1][:-4]
  user_tweets = get_tweets(file)
  user_tweets_representation = get_tweets_representation(user_tweets)
  user_tweets_extra_features = get_tweets_features(user_tweets)

  X.append(user_tweets_representation)
  X_tweets.append(user_tweets)
  X_extra.append(user_tweets_extra_features)
  y.append(true_values[user_code])

X = np.array(X)
X_tweets = np.array(X_tweets)
X_extra = np.array(X_extra)
y = np.array(y)
y = y.astype(np.float32)
print("X shape: {} | X_extra shape: {} | y shape: {}".format(X.shape, X_extra.shape, y.shape))

X shape: (300,) | X_extra shape: (300, 2) | y shape: (300,)


In [None]:
X[:2]

array(["['Justin Trudeau bows and shakes hands with Iranian foreign minister in photo op True North URL'\n 'Can Moscow finish Nord Stream gas pipeline de ite US sanctions Never say never says Gazprom RT Business News URL'\n 'KNIGHT Pipeline protests have nothing to do with supporting the Wet suwet en people True North URL'\n 'From clothes to condoms Coronavirus is threatening global consumption in ways you never knew were possible RT Wo URL'\n 'Amazon has job listings maybe its most ever URL'\n 'NASA confirms SpaceX will become the first private company to send astronauts to the ace station URL'\n 'More than former DOJ officials call on Attorney General Barr to resign'\n 'FCA plant closure China supply issue causes Fiat Chrysler Serbia plant closure Auto News ET Auto URL'\n 'The cost of dying How ike in cremation rates is changing the funeral industry'\n 'Fossil fuel industry hosts EU presidency URL'\n 'URL NASA Space Exploration and Astronomy News URL'\n 'Half million Romanians want t

## Sentiment_Polarity


In [None]:
sid = SentimentIntensityAnalyzer()
def compute_sentiment_features(tweets):
  """
  Arguments:
    tweets - array of tweets
  """
  polarity = []
  for tweet in tweets:
    scores = sid.polarity_scores(tweet)
    polarity.append([scores['neg']])
  return np.mean(polarity, axis=0), np.std(polarity, axis=0)

In [None]:
X_polarity = np.array([compute_sentiment_features(x_tweet) for x_tweet in X_tweets]).reshape(300, -1)

In [None]:
X_polarity.shape

(300, 2)

## Pos_Tagging

In [None]:
def get_pos_features(pos_tags):
  """
  Arguments:
    pos_tags - array of pos tagged words
  Returns:
    PRP counts
  """
  count_prp = 0
  for pos_tag in pos_tags:
    tag = pos_tag[1]
    if tag == 'PRP':
      count_prp += 1

  return np.array([count_prp])

In [None]:
X_pos = np.array([np.array(get_tweets_pos_tags(x)) for x in X])

In [None]:
X_pos_counts = np.array([get_pos_features(x_pos) for x_pos in X_pos])

In [None]:
X_pos_counts[:10]

array([[ 7],
       [ 7],
       [25],
       [63],
       [20],
       [30],
       [ 2],
       [36],
       [ 3],
       [45]])

# Models

In [None]:
def get_tfidf_vectorizer(language='en'):
  if language == 'en':
    language_stopwords = 'english'
    max_df = .9
    n_gram_range = (1,1)
  else:
    language_stopwords = 'spanish'
    max_df = .7
    n_gram_range = (1,2)

  return TfidfVectorizer(max_features=500, min_df=5, max_df=max_df, ngram_range=n_gram_range, stop_words=stopwords.words(language_stopwords))

In [None]:
def get_random_forest_classifier_model(n_estimators=500):
  rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
  return rf_model

In [None]:
def get_gradient_boosting_classifier_model(n_estimators=500):
  gb_model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
  return gb_model

In [None]:
def get_dense_model(input_shape):
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(input_shape, )))
  model.add(tf.keras.layers.Dropout(.2))
  model.add(tf.keras.layers.Dense(32, activation='relu'))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
@tf.function
def map_output(x):
  if x > .5:
    1.0
  else:
    0.0

def get_doc2vec_model():
  embed_model = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
  hub_layer = hub.KerasLayer(embed_model, output_shape=(20), dtype=tf.string, input_shape=(), trainable=True)
  model = tf.keras.Sequential()
  model.add(hub_layer)
  model.add(tf.keras.layers.Dense(64, activation='relu'))
  model.add(tf.keras.layers.Dropout(.3))
  model.add(tf.keras.layers.Dense(32, activation='relu'))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

# Manual_Encoding

In [None]:
X_manual = np.hstack((X_pos_counts, X_extra, X_polarity)).astype(np.float32)

In [None]:
X_manual[:2]

array([[ 7.        , 71.93      , 23.160852  ,  0.07948   ,  0.1348088 ],
       [ 7.        , 65.05      , 10.464583  ,  0.08328   ,  0.12664218]],
      dtype=float32)

## Manual_Encoding_Gradient_Boosting

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
for train_split, test_split in skf.split(X_manual, y):
  X_train, X_test, y_train, y_test = X_manual[train_split], X_manual[test_split], y[train_split], y[test_split]

  gb_classifier_model = get_gradient_boosting_classifier_model()
  gb_classifier_model.fit(X_train, y_train)

  y_pred = gb_classifier_model.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))



Accuracy: 0.6666666666666666%
Accuracy: 0.65%
Accuracy: 0.5833333333333334%
Accuracy: 0.6%
Accuracy: 0.6833333333333333%
Mean Accuracy: 0.6366666666666667%


## Manual_Encoding_Random_Forest

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
for train_split, test_split in skf.split(X_manual, y):
  X_train, X_test, y_train, y_test = X_manual[train_split], X_manual[test_split], y[train_split], y[test_split]

  rf_classifier_model = get_random_forest_classifier_model()
  rf_classifier_model.fit(X_train, y_train)

  y_pred = rf_classifier_model.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))



Accuracy: 0.6333333333333333%
Accuracy: 0.6333333333333333%
Accuracy: 0.6%
Accuracy: 0.5833333333333334%
Accuracy: 0.7%
Mean Accuracy: 0.6300000000000001%


# TFIDF_Encoding

In [None]:
def tfidf_features(text, training=True):
  """
  Arguments:
    text - string
    training - flag
  Returns:
    tfidf feature matrix
  """
  if training:
      x = tfidf.fit_transform(text)
  else:
      x = tfidf.transform(text)
  x = x.astype('float32')
  return x

## CHI2_Feature_Selection
This section is not being used, as it doesn't contribute to the accuracy of the model.

In [None]:
def chi2_features_select(X_train, X_test, y_train, k=10):
  """
  Arguments:
    X - data
    y - labels
    n - number of features to select
  Returns:
    dataset containing n best features
  """
  ch2 = SelectKBest(score_func=chi2, k=k)
  X_train = ch2.fit_transform(X_train, y_train)
  X_test = ch2.transform(X_test)
  return X_train, X_test

In [None]:
def tfidf_features_best_plot(X, y, k=10):
  """
  Arguments:
    X_train - datasets
    y_train - labels
    k - number of features to plot
  Returns:
    plots data
  """

  chi2score = chi2(X, y)[0]
  plt.figure(figsize=(15,10))
  wscores = list(zip(tfidf.get_feature_names(), chi2score))
  wchi2 = sorted(wscores, key=lambda x:x[1])
  topchi2 = list(zip(*wchi2[-k:]))
  x = range(len(topchi2[1]))
  labels = topchi2[0]
  plt.barh(x,topchi2[1], align='center', alpha=0.2)
  plt.plot(topchi2[1], x, '-o', markersize=5, alpha=0.8)
  plt.yticks(x, labels)
  plt.xlabel('$\chi^2$')

## TFIDF_Gradient_Boosting
- gives the best results for TFIDF proccessing

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()

  gb_classifier_model = get_gradient_boosting_classifier_model()
  gb_classifier_model.fit(X_train_feat, y_train)
  y_pred = gb_classifier_model.predict(X_test_feat)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))



Accuracy: 0.6833333333333333%
Accuracy: 0.7166666666666667%
Accuracy: 0.7166666666666667%
Accuracy: 0.7666666666666667%
Accuracy: 0.8166666666666667%
Mean Accuracy: 0.74%


## TFIDF_Random_Forest

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()

  rf_classifier_model = get_random_forest_classifier_model()
  rf_classifier_model.fit(X_train_feat, y_train)
  y_pred = rf_classifier_model.predict(X_test_feat)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))



Accuracy: 0.6833333333333333%
Accuracy: 0.6833333333333333%
Accuracy: 0.6666666666666666%
Accuracy: 0.7%
Accuracy: 0.7166666666666667%
Mean Accuracy: 0.6900000000000001%


## TFIDF_Dense

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()

  dense = get_dense_model(X_train_feat.shape[1])
  dense.fit(X_train_feat, y_train, batch_size=10, epochs=50, verbose=None)
  y_pred = dense.predict_classes(X_test_feat)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))



Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Accuracy: 0.6166666666666667%
Accuracy: 0.65%
Accuracy: 0.7833333333333333%
Accuracy: 0.75%
Accuracy: 0.6666666666666666%
Mean Accuracy: 0.6933333333333332%


## TFIDF_Polarity_Feature_Fusion_Random_Forest
- combining polarity with TFIDF, slightly improves the accuracy.
- we might want to discard this section also

In [None]:
skf = StratifiedKFold(5, random_state=42)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]
  # X_pos_counts_train, X_pos_counts_test = X_pos_counts[train_split], X_pos_counts[test_split]
  X_polarity_norm = X_polarity / np.amax(X_polarity)
  X_polarity_train, X_polarity_test = X_polarity_norm[train_split], X_polarity_norm[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()

  # Add POS Features
  # X_train_feat = np.hstack((X_train_feat, X_pos_counts_train))
  # X_test_feat = np.hstack((X_test_feat, X_pos_counts_test))

  # Add Extra Features
  # X_train_feat = np.hstack((X_train_feat, X_extra_train))
  # X_test_feat = np.hstack((X_test_feat, X_extra_test))

  # Add Polarity Features
  X_train_feat = np.hstack((X_train_feat, X_polarity_train))
  X_test_feat = np.hstack((X_test_feat, X_polarity_test))

  rf_classifier_model = get_random_forest_classifier_model()
  rf_classifier_model.fit(X_train_feat, y_train)
  y_pred = rf_classifier_model.predict(X_test_feat)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}".format(accuracy))
print("Mean Accuracy: {}".format(np.mean(history)))



Accuracy: 0.7666666666666667
Accuracy: 0.6666666666666666
Accuracy: 0.6833333333333333
Accuracy: 0.7166666666666667
Accuracy: 0.7333333333333333
Mean Accuracy: 0.7133333333333334


## TFIDF_Polarity_Feature_Fusion_Gradient_Boosting

In [None]:
skf = StratifiedKFold(5, shuffle=False)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]
  # X_pos_counts_train, X_pos_counts_test = X_pos_counts[train_split], X_pos_counts[test_split]
  X_polarity_norm = X_polarity / np.amax(X_polarity)
  X_polarity_train, X_polarity_test = X_polarity_norm[train_split], X_polarity_norm[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()

  # Add Polarity Features
  X_train_feat = np.hstack((X_train_feat, X_polarity_train))
  X_test_feat = np.hstack((X_test_feat, X_polarity_test))

  rf_classifier_model = get_gradient_boosting_classifier_model()
  rf_classifier_model.fit(X_train_feat, y_train)
  y_pred = rf_classifier_model.predict(X_test_feat)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}".format(accuracy))
print("Mean Accuracy: {}".format(np.mean(history)))

Accuracy: 0.7
Accuracy: 0.7
Accuracy: 0.7
Accuracy: 0.8166666666666667
Accuracy: 0.8166666666666667
Mean Accuracy: 0.7466666666666665


# Doc2Vec_Embedding

In [None]:
skf = StratifiedKFold(5, shuffle=False)
history = []
evaluate = []
for train_split, test_split in skf.split(X, y):
  X_train, X_test, y_train, y_test = X[train_split], X[test_split], y[train_split], y[test_split]
  model = get_doc2vec_model()
  h = model.fit(X_train, y_train, batch_size=10, epochs=15, verbose=None)
  y_pred = model.predict_classes(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print('Accuracy: {}'.format(accuracy))
print('Mean Accuracy: {}'.format(np.mean(history)))

Accuracy: 0.5666666666666667
Accuracy: 0.7166666666666667
Accuracy: 0.65
Accuracy: 0.7
Accuracy: 0.6166666666666667
Mean Accuracy: 0.6499999999999999


In [None]:
for e in evaluate:
  print("Test loss: {} | Test accuracy: {}".format(e[0], e[1]))
  print(e)
print(np.mean(evaluate, axis=0))

nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


# Late Feature Fusion

In [None]:
def compute_proba_to_pred(y_1, y_2):
  """
  Arguments:
    y_1 - 1st model probabilities for classes
    y_2 - 2nd model probabilities for classes
  Returns:
    0, 1 classes according to probabilities
  """
  y_merged = (y_1 + y_2) / 2
  return np.argmax(y_merged, axis=1)

## 2 x Random Forest

In [None]:
skf = StratifiedKFold(5, shuffle=False)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  y_train, y_test = y[train_split], y[test_split]
  X_train, X_test = X[train_split], X[test_split]
  X_train_manual, X_test_manual = X_manual[train_split], X_manual[test_split]

  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()


  rf_classifier_model_tfidf = get_random_forest_classifier_model()
  rf_classifier_model_manual = get_random_forest_classifier_model()

  rf_classifier_model_tfidf.fit(X_train_feat, y_train)
  rf_classifier_model_manual.fit(X_train_manual, y_train)

  y_pred_tfidf = rf_classifier_model_tfidf.predict_proba(X_test_feat)
  y_pred_manual = rf_classifier_model_manual.predict_proba(X_test_manual)

  y_pred = compute_proba_to_pred(y_pred_tfidf, y_pred_manual)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))

Accuracy: 0.7166666666666667%
Accuracy: 0.6%
Accuracy: 0.6666666666666666%
Accuracy: 0.7%
Accuracy: 0.65%
Mean Accuracy: 0.6666666666666667%


## 2 x Gradient Boosting

In [None]:
skf = StratifiedKFold(5, shuffle=False)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  y_train, y_test = y[train_split], y[test_split]
  X_train, X_test = X[train_split], X[test_split]
  X_train_manual, X_test_manual = X_manual[train_split], X_manual[test_split]

  # tfidf = TfidfVectorizer(max_features=500, min_df=5, max_df=.7, ngram_range=(1,1), stop_words=stopwords.words('english'))
  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()


  gb_classifier_model_tfidf = get_gradient_boosting_classifier_model()
  gb_classifier_model_manual = get_gradient_boosting_classifier_model()

  gb_classifier_model_tfidf.fit(X_train_feat, y_train)
  gb_classifier_model_manual.fit(X_train_manual, y_train)

  y_pred_tfidf = gb_classifier_model_tfidf.predict_proba(X_test_feat)
  y_pred_manual = gb_classifier_model_manual.predict_proba(X_test_manual)

  y_pred = compute_proba_to_pred(y_pred_tfidf, y_pred_manual)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))

Accuracy: 0.75%
Accuracy: 0.7%
Accuracy: 0.6%
Accuracy: 0.7166666666666667%
Accuracy: 0.7333333333333333%
Mean Accuracy: 0.7%


## Gradient Boosting & Random Forest

In [None]:
skf = StratifiedKFold(5, shuffle=False)
history = []
n_best_words= 50
for train_split, test_split in skf.split(X, y):
  y_train, y_test = y[train_split], y[test_split]
  X_train, X_test = X[train_split], X[test_split]
  X_train_manual, X_test_manual = X_manual[train_split], X_manual[test_split]

  # tfidf = TfidfVectorizer(max_features=500, min_df=5, max_df=.7, ngram_range=(1,1), stop_words=stopwords.words('english'))
  tfidf = get_tfidf_vectorizer(lang)
  X_train_feat = tfidf_features(X_train).todense()
  X_test_feat = tfidf_features(X_test, training=False).todense()


  gb_classifier_model_tfidf = get_gradient_boosting_classifier_model()
  rf_classifier_model_manual = get_random_forest_classifier_model()

  gb_classifier_model_tfidf.fit(X_train_feat, y_train)
  rf_classifier_model_manual.fit(X_train_manual, y_train)

  y_pred_tfidf = gb_classifier_model_tfidf.predict_proba(X_test_feat)
  y_pred_manual = rf_classifier_model_manual.predict_proba(X_test_manual)

  y_pred = compute_proba_to_pred(y_pred_tfidf, y_pred_manual)

  accuracy = accuracy_score(y_test, y_pred)
  history.append(accuracy)

  print("Accuracy: {}%".format(accuracy))
print("Mean Accuracy: {}%".format(np.mean(history)))

Accuracy: 0.7166666666666667%
Accuracy: 0.75%
Accuracy: 0.6833333333333333%
Accuracy: 0.7833333333333333%
Accuracy: 0.8333333333333334%
Mean Accuracy: 0.7533333333333334%


# Dummy_Classifier

In [None]:
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X_extra[train_index], X_extra[test_index]
  y_train, y_test = y[train_index], y[test_index]

  dummy_clf = DummyClassifier(strategy="most_frequent")
  dummy_clf.fit(X_train, y_train)
  y_pred = dummy_clf.predict(X_test)
  print("Dummy_Classifier:", accuracy_score(y_test,y_pred))


Dummy_Classifier: 0.5
Dummy_Classifier: 0.5
Dummy_Classifier: 0.5
Dummy_Classifier: 0.5
Dummy_Classifier: 0.5
