In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score


In [2]:
# Load the elections training split and preview its first rows.
TRAIN_CSV = 'data/elections/train/elections_train.csv'
train_data = pd.read_csv(TRAIN_CSV)
train_data.head()

Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,elections,The Daily Wire,2,https://www.dailywire.com/news/52178/tulsi-gab...,Tulsi Gabbard Qualifies For Upcoming President...,,,Rep. Tulsi Gabbard ( D-HI ) has officially met...,Rep. Tulsi Gabbard (D-HI) has officially met t...,www.dailywire.com,right,diXbkrezTxsgcoJp
1,elections,Washington Times,2,https://www.washingtontimes.com/news/2019/jun/...,Candidate Hillary Clinton endorsed idea of pol...,2019-06-13,Rowan Scarborough,Hillary Clinton has endorsed the idea of obtai...,Hillary Clinton has endorsed the idea of obtai...,www.washingtontimes.com,right,0ERzAbVMeKjFGiox
2,elections,The Hill,1,https://thehill.com/homenews/campaign/431181-t...,Trump unleashing digital juggernaut ahead of 2020,2019-02-25,,President Trump Donald John TrumpSessions says...,President Trump Donald John TrumpSessions says...,www.thehill.com,center,641HsrEBG9vsQjQT
3,elections,Reuters,1,https://www.reuters.com/article/us-usa-electio...,Russia looms large as U.S. election officials ...,2018-02-19,Dustin Volz,WASHINGTON ( ███ ) - Ten months before the Uni...,WASHINGTON (Reuters) - Ten months before the U...,www.reuters.com,center,yJimQY11fGmT0Dyw
4,elections,Salon,0,http://www.salon.com/2016/03/11/the_republican...,"After the latest debacle, we agree with Donald...",2016-03-11,Elias Isquith,If you ’ re not a masochist — or even if you a...,If you’re not a masochist — or even if you are...,www.salon.com,left,bQm3oIyjS1aGUXP4


In [3]:
# Lower-case every article body with the vectorised string accessor.
# Unlike a per-row lambda calling x.lower(), this does not raise when a
# row has a missing (NaN) content value; such rows are passed through as-is.
train_data['content'] = train_data['content'].str.lower()

In [4]:
# Keep only the articles labelled 0 or 2 (the preview above shows
# bias_text 'left' for 0 and 'right' for 2); label 1 rows are dropped.
has_bias_0 = train_data['bias'] == 0
has_bias_2 = train_data['bias'] == 2
train_data = train_data[has_bias_0 | has_bias_2]

In [5]:
# Features are the article text, targets are the bias labels.
x_train, y_train = train_data['content'].values, train_data['bias'].values

In [6]:
# TfidfVectorizer = TfidfVectorizer(stop_words='english')
# x_train = TfidfVectorizer.fit_transform(x_train)

In [7]:
x_train = x_train.toarray()
pca = PCA(n_components=400)
x_train = pca.fit_transform(x_train)

In [8]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [9]:
test_data = pd.read_csv('data/elections/test/elections_test.csv')
test_data = test_data[(test_data['bias'] == 0) | (test_data['bias'] == 2)]
test_data['content'] = test_data['content'].apply(lambda x: x.lower())
x_test = test_data['content'].values
y_test = test_data['bias'].values

In [10]:
x_test = TfidfVectorizer.transform(x_test)
x_test = x_test.toarray()
x_test = pca.transform(x_test)

In [11]:
# Predict a bias label for every test article from its PCA-reduced TF-IDF features.
y_pred = lr.predict(x_test)

In [12]:
# Accuracy of the predicted labels against the true test labels (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.6558194077982996

## Running experiment on top 5 categories

In [13]:
# Data directories of the five topics the experiment is repeated on.
urls = [
    f'data/{topic_name}'
    for topic_name in ('elections', 'politics', 'white_house', 'immigration', 'healthcare')
]
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score


In [19]:
def _load_bias_split(url, split):
    """Read one split's CSV, keep the rows with bias 0 or 2 and lower-case ``content``."""
    topic = url.split("/")[1]
    frame = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
    # Copy the filtered rows so the assignment below targets an independent
    # frame rather than a slice of the full one (avoids SettingWithCopyWarning).
    frame = frame[(frame['bias'] == 0) | (frame['bias'] == 2)].copy()
    # Vectorised lower-casing; missing values pass through instead of
    # raising inside a per-row lambda.
    frame['content'] = frame['content'].str.lower()
    return frame


def preprocess_data(url):
    """Load one topic's train/test CSVs and vectorise the article text with TF-IDF.

    Only rows whose ``bias`` label is 0 or 2 are kept and the ``content`` text
    is lower-cased. A ``TfidfVectorizer(stop_words='english')`` is fitted on
    the training text only and then applied to the test text.

    Parameters
    ----------
    url : str
        Topic directory such as ``'data/elections'``. The second
        ``/``-separated component is used as the CSV file-name prefix, so the
        files read are ``{url}/train/{topic}_train.csv`` and
        ``{url}/test/{topic}_test.csv``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: the vectorizer outputs for
        the two splits and the matching ``bias`` columns as arrays.
    """
    train_data = _load_bias_split(url, 'train')
    test_data = _load_bias_split(url, 'test')

    tfid = TfidfVectorizer(stop_words='english')

    # Fit on the training text only so no test vocabulary leaks into the features.
    x_train = tfid.fit_transform(train_data['content'].values)
    y_train = train_data['bias'].values

    x_test = tfid.transform(test_data['content'].values)
    y_test = test_data['bias'].values

    return x_train, y_train, x_test, y_test
    

In [20]:
def train_model(x_train, y_train):
    """Fit a default-configured LogisticRegression on the given features and labels.

    Returns the fitted estimator (``fit`` hands back the estimator itself).
    """
    return LogisticRegression().fit(x_train, y_train)

In [21]:
def predict_on_test_data(model, x_test, y_test):
    """Predict labels for ``x_test`` and score them against ``y_test``.

    Returns a ``(predictions, accuracy)`` pair, where ``accuracy`` is the
    value of ``accuracy_score(y_test, predictions)``.
    """
    predictions = model.predict(x_test)
    score = accuracy_score(y_test, predictions)
    return predictions, score

In [22]:
def runExperiment(url):
    """Run the full pipeline for one topic directory and return its test accuracy.

    Preprocesses the data under ``url``, trains a model on the training split
    and evaluates it on the test split.
    """
    train_features, train_labels, test_features, test_labels = preprocess_data(url)
    fitted = train_model(train_features, train_labels)
    # Only the score is reported; the predictions themselves are discarded.
    _, test_accuracy = predict_on_test_data(fitted, test_features, test_labels)
    return test_accuracy

In [23]:
# Run the experiment once per topic, printing each accuracy, then report the mean.
accuracy_total = 0
for url in urls:
    topic = url.split("/")[1]
    accuracy = runExperiment(url)
    print(f'{topic} Accuracy: {accuracy:.4f}')
    accuracy_total += accuracy

avg_accuracy = accuracy_total / len(urls)
print(f'Average Accuracy: {avg_accuracy:.4f}')

elections Accuracy: 0.6579
politics Accuracy: 0.6619
white_house Accuracy: 0.6451
immigration Accuracy: 0.6623
healthcare Accuracy: 0.6235
Average Accuracy: 0.6501
