In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk



In [2]:
dataset = pd.read_csv("bias_data.csv")
dataset.head()

Unnamed: 0,id,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [3]:
bias_data  = dataset[['id', 'title', 'text', 'bias_rating']]
input_text = bias_data['text'].astype(str)
input_label = bias_data['bias_rating'].astype(str)
bias_data.head()

Unnamed: 0,id,title,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

x_train, x_test, y_train, y_test = train_test_split(input_text, input_label, test_size = 0.30, random_state = 42)

vectorizer = CountVectorizer(stop_words = "english", lowercase = True)
train_vector = vectorizer.fit_transform(x_train)

test_vector = vectorizer.transform(x_test)

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(train_vector, y_train)

y_pred = naive_bayes_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the naive bayes classifier is: " + str(round(100 * accuracy, 3)) + "%.")

The accuracy of the naive bayes classifier is: 44.599%.


In [6]:
from sklearn.svm import SVC

svm_model = SVC(kernel = 'linear')
svm_model.fit(train_vector, y_train)

y_pred = svm_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM classifier is: " + str(round(100 * accuracy, 3)) + "%.")

The accuracy of the SVM classifier is: 41.351%.


In [17]:
from sklearn.model_selection import GridSearchCV

parameters = {
  'C' : [0.01, 0.1, 1, 10],
  'kernel' : ['linear', 'rbf'],
  'gamma' : [0.01, 0.1, 1],
}

svm_cv_model = GridSearchCV( estimator = SVC(),
                         param_grid = parameters,
                         cv = 3,
                         verbose = 2
)

svm_cv_model.fit(train_vector, y_train)

y_pred = svm_cv_model.predict(test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the SVM(with 3-CV) classifier is: " + str(round(100 * accuracy, 3)) + "%.")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  47.7s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  47.6s
[CV] END ..................C=0.01, gamma=0.01, kernel=linear; total time=  49.1s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  55.5s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  54.0s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=  54.7s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  48.3s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  49.5s
[CV] END ...................C=0.01, gamma=0.1, kernel=linear; total time=  47.9s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.01, gamma=0.1

In [8]:
from nltk.corpus import stopwords

nltk.download('stopwords')

filter = set(stopwords.words('english'))

def preprocess(input, filter):
  input = input.lower()
  tags = input.split()
  tags = [i for i in tags if i.isalpha() and i not in filter]
  return " ".join(tags)

input_text = input_text.apply(lambda x: preprocess(x, filter))
input_text.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xenexjoshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    yasmin miller drove home laundromat englewood ...
1    many chicagoans celebrating fourth july barbec...
2    july weekend marred wrong kind spate shootings...
3    treasury secretary janet yellen tuesday warned...
4    treasury secretary janet yellen tuesday told c...
Name: text, dtype: object

In [9]:
from sklearn.calibration import LabelEncoder

encoder = LabelEncoder()
input_label = encoder.fit_transform(input_label)
print(input_label)

[1 0 2 ... 2 1 0]


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_count = 20000

tokenizer = Tokenizer(num_words = word_count)
tokenizer.fit_on_texts(input_text)
sequences = tokenizer.texts_to_sequences(input_text)
input_text = pad_sequences(sequences, maxlen = 200)

In [11]:
import tensorflow_hub as hub
x_train, x_test, y_train, y_test = train_test_split(input_text, input_label, test_size = 0.30, random_state = 42)

In [12]:
vocab_size = 1000
embedding_dim = 128
max_length = 200
oov_token = '<OOV>'
batch_size = 64

In [13]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = word_count, output_dim = embedding_dim, input_length = max_length),
                            tf.keras.layers.Conv1D(filters = 64, kernel_size = 3, activation = 'relu'),
                            tf.keras.layers.MaxPooling1D(pool_size = 2),
                            tf.keras.layers.Conv1D(filters = 128, kernel_size = 3, activation = 'relu'),
                            tf.keras.layers.MaxPooling1D(pool_size = 2),
                            tf.keras.layers.Flatten(),
                            tf.keras.layers.Dense(128, activation = 'relu'),
                            tf.keras.layers.Dropout(0.5),
                            tf.keras.layers.Dense(3, activation = 'softmax')])

model.build((None, max_length))

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()



In [14]:
from tensorflow.keras.callbacks import EarlyStopping

num_epochs = 5

early_stopping = EarlyStopping(monitor='val_loss', patience = 1.2, restore_best_weights=True)

model.fit(x_train, y_train, epochs = num_epochs, batch_size = batch_size, validation_split = 0.1, callbacks=[early_stopping])

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 61ms/step - accuracy: 0.4575 - loss: 1.0524 - val_accuracy: 0.4846 - val_loss: 1.0279
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 61ms/step - accuracy: 0.4777 - loss: 0.9955 - val_accuracy: 0.4800 - val_loss: 1.0248
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - accuracy: 0.7224 - loss: 0.6324 - val_accuracy: 0.3867 - val_loss: 1.3314
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 61ms/step - accuracy: 0.9392 - loss: 0.1944 - val_accuracy: 0.4038 - val_loss: 2.0310


<keras.src.callbacks.history.History at 0x177a68c70>

In [16]:
_, accuracy= model.evaluate(x_test, y_test)
print("The accuracy of the CNN classifier is: " + str(round(100 * accuracy, 3)) + "%.")

[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4829 - loss: 1.0325
The accuracy of the CNN classifier is: 47.219%.
