In [191]:
import tensorflow
from tensorflow import keras
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils

In [192]:
from preprocessing import *

PATH_LEFT = '../data/data_left.csv'
PATH_CENTER = '../data/data_center.csv'
PATH_RIGHT = '../data/data_right.csv'

In [193]:
df_left = load_and_process(PATH_LEFT)
print(df_left.head())

df_center = load_and_process(PATH_CENTER)
print(df_center.head())

df_right = load_and_process(PATH_RIGHT)
print(df_right.head())

                                                text bias_rating
0  South Carolina Primary Will Test Hillary Clint...        left
1  State election officials push back on data req...        left
2  Tillerson Says 'All Of The Options Are On The ...        left
3  Baltimore: After riots, protesters and police ...        left
4  Live Blog: Digging Into The #UkraineDocs: Late...        left
                                                text bias_rating
0  Herschel Walker's son accuses father of lying ...      center
1  Coronavirus Sparks Talk of Global Recession an...      center
2  Trump signs budget deal after raising governme...      center
3  Trump to Call for Bipartisanship as He Threate...      center
4  UK set for new PM as Theresa May quits: Theres...      center
                                                text bias_rating
0  FBI gun checks in March break all-time record:...       right
1  NYT updates Kavanaugh 'bombshell' to note accu...       right
2  Another Justice Like G

In [194]:
df = pd.concat([df_left, df_center, df_right], axis = 0)
df['bias_rating'] = df['bias_rating'].replace(['left', 'center', 'right'], [int(0), int(1), int(2)])

df.shape

  df['bias_rating'] = df['bias_rating'].replace(['left', 'center', 'right'], [int(0), int(1), int(2)])


(12000, 2)

In [195]:
tagged_documents = []
for _, row in df.iterrows():
    words = preprocess_text(str(row['text']))
    tags = [row['bias_rating']]
    tagged_documents.append(TaggedDocument(words=words, tags=tags))

max_length = 0
for docs in tagged_documents:
    if (len(docs[0]) > max_length):
        max_length = len(docs[0])
max_length

91

In [196]:
import multiprocessing

cores = multiprocessing.cpu_count()
model = Doc2Vec(dm = 0, vector_size = max_length, negative = 5, hs = 0, sample = 0, min_count = 2, workers = cores)

In [197]:
df['text'] = tagged_documents

model.build_vocab(df['text'].values)
model.train(utils.shuffle(df['text'].values), total_examples = len(df['text'].values), epochs = 30)

In [198]:
tagged_documents = df['text'].dropna()
features, labels = vec_for_learning(model, tagged_documents)

In [208]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.30, random_state = 42)

In [209]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.9113888888888889


In [210]:
from sklearn.model_selection import GridSearchCV

# Selecting an array of hyperparameters for hyperparameter tuning using GridSearchCV() function
parameters = {
  'C' : [0.01, 0.1, 1, 10],
  'kernel' : ['linear', 'rbf'],
  'gamma' : [0.01, 0.1, 1],
}

# Setting up the hyperparameter tunings
svm_cv_model = GridSearchCV( estimator = SVC(),
                         param_grid = parameters,
                         cv = 3,
                         verbose = 2
)

# Running GridSearchCV() on training set
svm_cv_model.fit(X_train, y_train)

# Getting prediction on testing set
y_pred = svm_cv_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM(with 3-CV) classifier is: " + str(round(100 * accuracy, 3)) + "%.")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=   1.0s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.4s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.3s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   3.4s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   0.9s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   3.3s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   3.3s
[CV] END ......................C=0.01, gamma=0.1