In [1]:
import tensorflow
from tensorflow import keras
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils



In [2]:
from preprocessing import *

PATH_LEFT = '../data/data_left.csv'
PATH_CENTER = '../data/data_center.csv'
PATH_RIGHT = '../data/data_right.csv'

In [3]:
df_left = load_and_process(PATH_LEFT)
print(df_left.head())

df_center = load_and_process(PATH_CENTER)
print(df_center.head())

df_right = load_and_process(PATH_RIGHT)
print(df_right.head())

                                                text bias_rating
0  Trump's Policy Of Forcing Asylum-Seekers To Wa...        left
1  Fourth of July parties will cost more this yea...        left
2  Mother’s Day Can Be Painful. It Can Also Recon...        left
3  'The crisis we face is real': Blinken on why B...        left
4  John Kerry to give major speech on Iran nuclea...        left
                                                text bias_rating
0  U.S. lawmakers urge Google to fix abortion sea...      center
1  Harvey Weinstein accusers reach tentative $25 ...      center
2  De Blasio: ‘Political Debate’ Can Wait Until A...      center
3  'Finally, infrastructure week': White House ce...      center
4  ‘confusing’ small-business loan program needs ...      center
                                                text bias_rating
0  Tragedy strikes hearts, changes Ã¢â‚¬Â¨minds o...       right
1  Ex-Space Force commander: DOD videos claimed w...       right
2  US attorney handling H

In [4]:
df = pd.concat([df_left, df_center, df_right], axis = 0)
df['bias_rating'] = df['bias_rating'].replace(['left', 'center', 'right'], [int(0), int(1), int(2)])

df.shape

  df['bias_rating'] = df['bias_rating'].replace(['left', 'center', 'right'], [int(0), int(1), int(2)])


(12000, 2)

In [5]:
tagged_documents = []
for _, row in df.iterrows():
    words = preprocess_text(str(row['text']))
    tags = [row['bias_rating']]
    tagged_documents.append(TaggedDocument(words=words, tags=tags))

max_length = 0
for docs in tagged_documents:
    if (len(docs[0]) > max_length):
        max_length = len(docs[0])
max_length

91

In [6]:
import multiprocessing

cores = multiprocessing.cpu_count()
model = Doc2Vec(dm = 0, vector_size = max_length, negative = 5, hs = 0, sample = 0, min_count = 2, workers = cores)

In [7]:
df['text'] = tagged_documents

model.build_vocab(df['text'].values)
model.train(utils.shuffle(df['text'].values), total_examples = len(df['text'].values), epochs = 30)

In [8]:
tagged_documents = df['text'].dropna()
features, labels = vec_for_learning(model, tagged_documents)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.30, random_state = 42)

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.9069444444444444


In [11]:
from sklearn.model_selection import GridSearchCV

# Selecting an array of hyperparameters for hyperparameter tuning using GridSearchCV() function
parameters = {
  'C' : [0.01, 0.1, 1, 10],
  'kernel' : ['linear', 'rbf'],
  'gamma' : [0.01, 0.1, 1],
}

# Setting up the hyperparameter tunings
svm_cv_model = GridSearchCV( estimator = SVC(),
                         param_grid = parameters,
                         cv = 3,
                         verbose = 2
)

# Running GridSearchCV() on training set
svm_cv_model.fit(X_train, y_train)

# Getting prediction on testing set
y_pred = svm_cv_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM(with 3-CV) classifier is: " + str(round(100 * accuracy, 3)) + "%.")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.5s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.5s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.4s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   3.3s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   3.5s
[CV] END ......................C=0.01, gamma=0.1

In [12]:
from joblib import dump

dump(model, '../app/tokenizer.joblib')
dump(svm_cv_model, '../app/bias_predictor.joblib')

['../app/bias_predictor.joblib']