In [83]:
# Importing essential modules for the machine-learning process
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk

In [84]:
# Reading the source dataset obtained from source: https://github.com/irgroup/Qbias
dataset = pd.read_csv("bias_data.csv")
dataset.head()

Unnamed: 0,id,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [99]:
# Separating the essential data columns for the text-based media bias classification
bias_data  = dataset[['id', 'title', 'text', 'bias_rating']]
input_text = bias_data['text'].astype(str)
input_label = bias_data['bias_rating'].astype(str)
bias_data.head()

Unnamed: 0,id,title,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [86]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Splitting data in 70-30 training and testing sets
x_train, x_test, y_train, y_test = train_test_split(input_text, input_label, test_size = 0.30, random_state = 42)

# Generating bag-of-words for Naive-Bayes and SVM, after pre-processing by removing standard English stopwords and converting 
# text to lowercase
vectorizer = CountVectorizer(stop_words = "english", lowercase = True)
train_vector = vectorizer.fit_transform(x_train)
test_vector = vectorizer.transform(x_test)

In [87]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Training the Multinomial Naive Bayes Model on the vectorized bag-of-words training set
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(train_vector, y_train)

# Generating prediction on the vectorized testing set
y_pred = naive_bayes_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the naive bayes classifier is: " + str(round(100 * accuracy, 3)) + "%.")

The accuracy of the naive bayes classifier is: 44.599%.


In [89]:
from sklearn.svm import SVC

# Training an SVM with linear kernel on the training data
# Using a linear kernel to leverage its simplicity and faster training time compared to more fine-tuned SVMs
svm_model = SVC(kernel = 'linear')
svm_model.fit(train_vector, y_train)

# Getting linear SVM predictions on the testing set
y_pred = svm_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM classifier is: " + str(round(100 * accuracy, 3)) + "%.")

The accuracy of the SVM classifier is: 41.351%.


In [17]:
from sklearn.model_selection import GridSearchCV

# Selecting an array of hyperparameters for hyperparameter tuning using GridSearchCV() function
parameters = {
  'C' : [0.01, 0.1, 1, 10],
  'kernel' : ['linear', 'rbf'],
  'gamma' : [0.01, 0.1, 1],
}

# Setting up the hyperparameter tunings
svm_cv_model = GridSearchCV( estimator = SVC(),
                         param_grid = parameters,
                         cv = 3,
                         verbose = 2
)

# Running GridSearchCV() on training set
svm_cv_model.fit(train_vector, y_train)

# Getting prediction on testing set
y_pred = svm_cv_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM(with 3-CV) classifier is: " + str(round(100 * accuracy, 3)) + "%.")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  47.7s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  47.6s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  49.1s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  55.5s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  54.0s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  54.7s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  48.3s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  49.5s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  47.9s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.01, gamma=0.1

In [100]:
from nltk.corpus import stopwords

# Downloading stopwords form nltk.corpus for pre-processing input_text for CNN
nltk.download('stopwords')
filter = set(stopwords.words('english'))

# Pre-processing function that removes the stopwords, and converts all words to lowercase for feeding into the CNN
def preprocess(input, filter):
  input = input.lower()
  tags = input.split()
  tags = [i for i in tags if i.isalpha() and i not in filter]
  return " ".join(tags)

# Applying the custom pre-processing function on the input dataset
input_text = input_text.apply(lambda x: preprocess(x, filter))
input_text.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xenexjoshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    yasmin miller drove home laundromat englewood ...
1    many chicagoans celebrating fourth july barbec...
2    july weekend marred wrong kind spate shootings...
3    treasury secretary janet yellen tuesday warned...
4    treasury secretary janet yellen tuesday told c...
Name: text, dtype: object

In [101]:
from sklearn.calibration import LabelEncoder

# Initializing the LabelEncoder() to encode the labels on the dataset
encoder = LabelEncoder()

# Encoding input labels into integers(0, 1, 2)
input_label = encoder.fit_transform(input_label)
print(input_label)

[1 0 2 ... 2 1 0]


In [102]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max word-count for the CNN tokenizer
word_count = 20000

# Initializing the Tokenizer, and fitting the input text using the Tokeinizer()
tokenizer = Tokenizer(num_words = word_count)
tokenizer.fit_on_texts(input_text)

# Generating tokenized sequence from tokenizer, and padding sequence to ensure constant length inputs
sequences = tokenizer.texts_to_sequences(input_text)
input_text = pad_sequences(sequences, maxlen = 200)
print(input_text)

[[    0     0     0 ... 14419    12  4215]
 [    0     0     0 ...   433   230   524]
 [    0     0     0 ...  4012  1515   120]
 ...
 [    0     0     0 ...   904  1212    66]
 [    0     0     0 ...   558   919   670]
 [    0     0     0 ...   102    15   782]]


In [103]:
# Splitting bias_data.csv into 70-30 training set and testing set
x_train, x_test, y_train, y_test = train_test_split(input_text, input_label, test_size = 0.30, random_state = 42)

In [104]:
# Setting CNN parameters
vocab_size = 20000
embedding_dim = 128
max_length = 200
batch_size = 32

In [105]:
# Implementing a single convolution layer CNN with Dropout layer to avoid overfitting
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = word_count, output_dim = embedding_dim, input_length = max_length),
                            tf.keras.layers.Conv1D(filters = 32, kernel_size = 5, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(0.01)),
                            tf.keras.layers.GlobalMaxPooling1D(),
                            tf.keras.layers.Dense(32, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(0.01)),
                            tf.keras.layers.Dropout(0.5),
                            tf.keras.layers.Dense(3, activation = 'softmax')])

# Building CNN model based on the set CNN parameters 
model.build((None, max_length))

# Compiling CNN model
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# Generating CNN model report
model.summary()



In [106]:
from tensorflow.keras.callbacks import EarlyStopping

# Setting epoch count for the CNN training
num_epochs = 5

# Implementing an early-stopping to avoid overfitting onto the training set
early_stopping = EarlyStopping(monitor='val_loss', 
                              patience = 1.5,
                              restore_best_weights=True)

# Initiating the CNN training
model.fit(x_train, y_train, epochs = num_epochs, batch_size = batch_size, validation_split = 0.1, callbacks=[early_stopping])

Epoch 1/5
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.4575 - loss: 1.3454 - val_accuracy: 0.4846 - val_loss: 1.0439
Epoch 2/5
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.4777 - loss: 1.0435 - val_accuracy: 0.4846 - val_loss: 1.0396
Epoch 3/5
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 30ms/step - accuracy: 0.4739 - loss: 1.0427 - val_accuracy: 0.4846 - val_loss: 1.0361
Epoch 4/5
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.4682 - loss: 1.0410 - val_accuracy: 0.4846 - val_loss: 1.0390
Epoch 5/5
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.4704 - loss: 1.0202 - val_accuracy: 0.4800 - val_loss: 1.0526


<keras.src.callbacks.history.History at 0x14ac76af0>

In [107]:
# Generating the prediction report of the CNN on the testing set
_, accuracy= model.evaluate(x_test, y_test)
print("The accuracy of the CNN classifier is: " + str(round(100 * accuracy, 3)) + "%.")

[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4752 - loss: 1.0391
The accuracy of the CNN classifier is: 46.913%.


In [108]:
# Separating the dataset into columns for news source based prediction
source_bias = dataset[['id', 'source', 'bias_rating']]
input_source = source_bias['source'].astype(str)
input_tag = source_bias['bias_rating'].astype(str)
source_bias.head()

Unnamed: 0,id,source,bias_rating
0,0,New York Times (News),left
1,1,Chicago Tribune,center
2,2,New York Post (News),right
3,3,The Epoch Times,right
4,4,Washington Post,left


In [109]:
# Initialing a LabelEncoder() to encode the labels
encoder_tags = LabelEncoder()

# Fitting the encoder on the bias labels
encoder_tags.fit_transform(input_tag)

# Splitting bias_data.csv into 70-30 training set and testing set
x_tr, x_te, y_tr, y_te = train_test_split(input_source, input_tag, test_size = 0.30, random_state = 42)

# Applying the encoder on the training and testing set labels
y_tr = encoder_tags.fit(y_tr)
y_te = encoder_tags.fit(y_te)

In [110]:
# Initializing bag-of-words with english stopwords and lowercased data
vectorizer = CountVectorizer(stop_words = "english", lowercase = True)

# Generating a vectorized bag-of-words on the training and testing set
train_vector = vectorizer.fit_transform(x_tr)
test_vector = vectorizer.transform(x_te)

# Initializing a Multinomial Naive-Bayes classifier
naive_bayes_source = MultinomialNB()

# Training the naive-bayes model on the training set
naive_bayes_source.fit(train_vector, y_train)

# Predicting the trained model on the testing set
y_pred = naive_bayes_source.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the naive bayes classifier is: " + str(round(100 * accuracy, 3)) + "%.")

The accuracy of the naive bayes classifier is: 96.415%.


In [115]:
# Implementing a callable classifer function that classifies the news source to its affiliated political bias
def predict_bias(source):
  src = vectorizer.transform([source])
  result = naive_bayes_source.predict(src)
  print(result)
  bias = encoder_tags.inverse_transform(result)[0]
  output_str = "The predicted bias of " + source + " is " + str(bias) + "."
  return output_str

print(predict_bias("BBC News"))
print(predict_bias("New York Times"))
print(predict_bias("Fox News"))

[0]
The predicted bias of BBC News is center.
[1]
The predicted bias of New York Times is left.
[2]
The predicted bias of Fox News is right.
