## Running experiment on top 5 categories

In [1]:
urls = ['data/elections', 'data/politics', 'data/white_house', 'data/immigration', 'data/healthcare']
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


In [2]:
def preprocess_data(url):
    """Load one topic's train/test CSVs and convert the article text to TF-IDF features.

    Only rows whose ``bias`` label is 0 or 2 are kept, which makes the task
    binary. The vectorizer is fitted on the training text only and then
    applied to the test text, so no test vocabulary leaks into training.

    Parameters
    ----------
    url : str
        Topic directory of the form ``'<root>/<topic>'`` (for example
        ``'data/elections'``). The files ``<url>/train/<topic>_train.csv`` and
        ``<url>/test/<topic>_test.csv`` are read; both need ``content`` and
        ``bias`` columns.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` values are the
        TF-IDF matrices produced by ``TfidfVectorizer`` and the ``y_*`` values
        are arrays of the retained ``bias`` labels.
    """
    topic = url.split("/")[1]
    kept_labels = [0, 2]

    def load_split(split):
        # Read one split, keep only the two classes of interest, lower-case the text.
        frame = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
        # .copy() gives an independent frame, so the column assignment below
        # does not write into a slice of the original (SettingWithCopyWarning).
        frame = frame[frame['bias'].isin(kept_labels)].copy()
        frame['content'] = frame['content'].str.lower()
        return frame

    train_data = load_split('train')
    test_data = load_split('test')

    tfid = TfidfVectorizer(stop_words='english')

    # Fit the vocabulary/IDF weights on the training text only.
    x_train = tfid.fit_transform(train_data['content'].values)
    y_train = train_data['bias'].values

    # Reuse the fitted vectorizer for the held-out text.
    x_test = tfid.transform(test_data['content'].values)
    y_test = test_data['bias'].values

    return x_train, y_train, x_test, y_test
    

In [3]:
def train_model(x_train, y_train):
    """Fit a logistic-regression classifier with default settings and return it."""
    classifier = LogisticRegression()
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(x_train, y_train)

In [4]:
def predict_on_test_data(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x_test
        Feature matrix passed to ``model.predict``.
    y_test
        True labels; the surrounding notebook only keeps the values 0 and 2.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)``. Precision,
        recall and F1 treat label 2 as the positive class. The confusion
        matrix is always 2x2 with rows (actual) and columns (predicted)
        ordered ``[0, 2]``.
    """
    class_labels = [0, 2]

    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=2)
    recall = recall_score(y_test, y_pred, pos_label=2)
    f1 = f1_score(y_test, y_pred, pos_label=2)
    # Pin the label order: without it a split in which only one class shows up
    # yields a 1x1 matrix, which breaks the element-wise sum over topics and
    # the 2x2 DataFrame built from it later.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    return acc, precision, recall, f1, cm

In [5]:
def runExperiment(url):
    """Run the baseline pipeline for one topic: preprocess, train, evaluate.

    Returns whatever ``predict_on_test_data`` returns for the topic at ``url``.
    """
    features_train, labels_train, features_test, labels_test = preprocess_data(url)
    classifier = train_model(features_train, labels_train)
    metrics = predict_on_test_data(classifier, features_test, labels_test)
    return metrics

In [6]:
# Baseline run: score every topic, echo its metrics, and keep running totals
# so the unweighted mean over topics can be reported afterwards.
avg_accuracy = avg_precision = avg_recall = avg_f1 = 0
cms = []  # one confusion matrix per topic

for url in urls:
    topic = url.split("/")[1]

    accuracy, precision, recall, f1, cm = runExperiment(url)
    cms.append(cm)

    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1

    print(
        f'{topic} Accuracy: {accuracy:.4f}',
        f'{topic} Precision: {precision:.4f}',
        f'{topic} Recall: {recall:.4f}',
        f'{topic} F1: {f1:.4f}\n\n',
        sep='\n',
    )

# Turn the totals into per-topic means.
n_topics = len(urls)
avg_accuracy = avg_accuracy / n_topics
avg_recall = avg_recall / n_topics
avg_precision = avg_precision / n_topics
avg_f1 = avg_f1 / n_topics

print(
    f'Average Accuracy: {avg_accuracy:.4f}',
    f'Average Precision: {avg_precision:.4f}',
    f'Average Recall: {avg_recall:.4f}',
    f'Average F1: {avg_f1:.4f}',
    sep='\n',
)

elections Accuracy: 0.6579
elections Precision: 0.6696
elections Recall: 0.4133
elections F1: 0.5111


politics Accuracy: 0.6619
politics Precision: 0.6313
politics Recall: 0.8642
politics F1: 0.7296


white_house Accuracy: 0.6451
white_house Precision: 0.6430
white_house Recall: 0.7311
white_house F1: 0.6842


immigration Accuracy: 0.6623
immigration Precision: 0.6475
immigration Recall: 0.9008
immigration F1: 0.7534


healthcare Accuracy: 0.6235
healthcare Precision: 0.5912
healthcare Recall: 0.9501
healthcare F1: 0.7289


Average Accuracy: 0.6501
Average Precision: 0.6365
Average Recall: 0.7719
Average F1: 0.6814


In [7]:
# Pool the per-topic confusion matrices into one table (element-wise sum).
cm = sum(cms)

data_cm = pd.DataFrame(
    cm,
    index=['Actual Left', 'Actual Right'],
    columns=['Predicted Left', 'Predicted Right'],
)
data_cm

Unnamed: 0,Predicted Left,Predicted Right
Actual Left,2489,1536
Actual Right,1211,2693


In [8]:
# One-row summary of the baseline's topic-averaged metrics.
cols = ['Accuracy', 'Precision', 'Recall', 'F1']
data = [[avg_accuracy, avg_precision, avg_recall, avg_f1]]
data_metrics = pd.DataFrame(
    data,
    index=['Logistic Regression'],
    columns=cols,
)
data_metrics

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.650135,0.636507,0.771884,0.681436


## Running experiment on top 5 categories using PCA

In [9]:
def preprocess_data_with_PCA(url):
    """Build TF-IDF features for one topic and reduce them with PCA.

    Loading, label filtering and TF-IDF vectorization are delegated to
    ``preprocess_data`` so both experiments share exactly the same inputs;
    this function only adds the dimensionality-reduction step.

    Parameters
    ----------
    url : str
        Topic directory of the form ``'<root>/<topic>'``, passed straight to
        ``preprocess_data``.

    Returns
    -------
    tuple
        ``(transformed_x_train, y_train, transformed_x_test, y_test)`` where
        the feature matrices are dense PCA projections. The PCA is fitted on
        the training features only and then applied to the test features.
    """
    x_train, y_train, x_test, y_test = preprocess_data(url)

    # A fractional n_components asks PCA to keep as many components as needed
    # to explain that share (90%) of the training variance.
    pca = PCA(n_components=0.9)
    # The TF-IDF matrices are densified before being handed to PCA.
    transformed_x_train = pca.fit_transform(x_train.toarray())
    transformed_x_test = pca.transform(x_test.toarray())

    return transformed_x_train, y_train, transformed_x_test, y_test
    

In [10]:
def runExperiment_with_pca(url):
    """Run the PCA variant of the pipeline for one topic.

    Returns a pair ``(metrics, x_test)``: the tuple produced by
    ``predict_on_test_data`` and the PCA-reduced test features, so the caller
    can inspect how many components were kept.
    """
    features_train, labels_train, features_test, labels_test = preprocess_data_with_PCA(url)
    classifier = train_model(features_train, labels_train)
    metrics = predict_on_test_data(classifier, features_test, labels_test)
    return metrics, features_test

In [11]:
# PCA run: same reporting as the baseline, plus the number of components
# that PCA kept for each topic.
avg_accuracy = avg_precision = avg_recall = avg_f1 = 0
cms = []  # one confusion matrix per topic

for url in urls:
    topic = url.split("/")[1]

    experiment_results, x_test = runExperiment_with_pca(url)
    print(f"Components of X_test: {x_test.shape[1]}")
    accuracy, precision, recall, f1, cm = experiment_results
    cms.append(cm)

    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1

    print(
        f'{topic} Accuracy: {accuracy:.4f}',
        f'{topic} Precision: {precision:.4f}',
        f'{topic} Recall: {recall:.4f}',
        f'{topic} F1: {f1:.4f}\n\n',
        sep='\n',
    )

# Turn the totals into per-topic means.
n_topics = len(urls)
avg_accuracy = avg_accuracy / n_topics
avg_recall = avg_recall / n_topics
avg_precision = avg_precision / n_topics
avg_f1 = avg_f1 / n_topics

print(
    f'Average Accuracy: {avg_accuracy:.4f}',
    f'Average Precision: {avg_precision:.4f}',
    f'Average Recall: {avg_recall:.4f}',
    f'Average F1: {avg_f1:.4f}',
    sep='\n',
)

Components of X_test: 611
elections Accuracy: 0.6558
elections Precision: 0.6645
elections Recall: 0.4133
elections F1: 0.5096


Components of X_test: 314
politics Accuracy: 0.6539
politics Precision: 0.6265
politics Recall: 0.8525
politics F1: 0.7222


Components of X_test: 214
white_house Accuracy: 0.6405
white_house Precision: 0.6415
white_house Recall: 0.7170
white_house F1: 0.6772


Components of X_test: 179
immigration Accuracy: 0.6612
immigration Precision: 0.6474
immigration Recall: 0.8969
immigration F1: 0.7520


Components of X_test: 173
healthcare Accuracy: 0.6190
healthcare Precision: 0.5886
healthcare Recall: 0.9459
healthcare F1: 0.7257


Average Accuracy: 0.6461
Average Precision: 0.6337
Average Recall: 0.7651
Average F1: 0.6773


In [12]:
# Pool the per-topic confusion matrices of the PCA run (element-wise sum).
cm = sum(cms)

data_cm = pd.DataFrame(
    cm,
    index=['Actual Left', 'Actual Right'],
    columns=['Predicted Left', 'Predicted Right'],
)
data_cm

Unnamed: 0,Predicted Left,Predicted Right
Actual Left,2481,1544
Actual Right,1233,2671


In [13]:
# Put the PCA run's averaged metrics next to the baseline row.
cols = ['Accuracy', 'Precision', 'Recall', 'F1']
data = [[avg_accuracy, avg_precision, avg_recall, avg_f1]]
new_data_metrics = pd.DataFrame(
    data,
    index=['Logistic Regression with PCA'],
    columns=cols,
)

result = pd.concat([data_metrics, new_data_metrics])
result

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.650135,0.636507,0.771884,0.681436
Logistic Regression with PCA,0.646089,0.6337,0.765136,0.677337


## Running experiment on top 5 categories using TruncatedSVD

In [14]:
from sklearn.decomposition import TruncatedSVD

In [15]:
def preprocess_data_with_SPCA(url, n_components=None, random_state=42):
    """Build TF-IDF features for one topic and reduce them with TruncatedSVD.

    Despite the ``SPCA`` in the name, the reduction is done with
    ``TruncatedSVD``, which operates directly on the sparse TF-IDF matrix.
    Loading, label filtering and vectorization are delegated to
    ``preprocess_data`` so every experiment shares the same inputs.

    Parameters
    ----------
    url : str
        Topic directory of the form ``'<root>/<topic>'``, passed straight to
        ``preprocess_data``.
    n_components : int, optional
        Number of SVD components to keep. When omitted, a per-topic default
        from the built-in table is used.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD`` so that repeated runs produce the
        same projection. Pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(transformed_x_train, y_train, transformed_x_test, y_test)``; the
        SVD is fitted on the training features only.

    Raises
    ------
    KeyError
        If ``n_components`` is omitted and the topic has no default entry.
    """
    topic = url.split("/")[1]
    x_train, y_train, x_test, y_test = preprocess_data(url)

    if n_components is None:
        # Per-topic component counts; presumably chosen to retain about 90% of
        # the variance, in line with the PCA experiment -- TODO confirm.
        default_components = {
            'elections': 600,
            'politics': 311,
            'white_house': 211,
            'immigration': 176,
            'healthcare': 170,
        }
        if topic not in default_components:
            raise KeyError(
                f"No default component count for topic {topic!r}; "
                "pass n_components explicitly."
            )
        n_components = default_components[topic]

    # Seed the solver: it is randomized by default, so without a seed the
    # reported metrics can drift from run to run.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    transformed_x_train = svd.fit_transform(x_train)
    transformed_x_test = svd.transform(x_test)

    return transformed_x_train, y_train, transformed_x_test, y_test
    

In [16]:
def runExperiment_with_spca(url):
    """Run the TruncatedSVD variant of the pipeline for one topic.

    Returns a pair ``(metrics, x_test)``: the tuple produced by
    ``predict_on_test_data`` and the reduced test features.
    """
    features_train, labels_train, features_test, labels_test = preprocess_data_with_SPCA(url)
    classifier = train_model(features_train, labels_train)
    metrics = predict_on_test_data(classifier, features_test, labels_test)
    return metrics, features_test

In [17]:
# TruncatedSVD run: same reporting as the other experiments, plus the number
# of components used for each topic.
avg_accuracy = avg_precision = avg_recall = avg_f1 = 0
cms = []  # one confusion matrix per topic

for url in urls:
    topic = url.split("/")[1]

    experiment_results, x_test = runExperiment_with_spca(url)
    print(f"Components of X_test: {x_test.shape[1]}")
    accuracy, precision, recall, f1, cm = experiment_results
    cms.append(cm)

    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1

    print(
        f'{topic} Accuracy: {accuracy:.4f}',
        f'{topic} Precision: {precision:.4f}',
        f'{topic} Recall: {recall:.4f}',
        f'{topic} F1: {f1:.4f}\n\n',
        sep='\n',
    )

# Turn the totals into per-topic means.
n_topics = len(urls)
avg_accuracy = avg_accuracy / n_topics
avg_recall = avg_recall / n_topics
avg_precision = avg_precision / n_topics
avg_f1 = avg_f1 / n_topics

print(
    f'Average Accuracy: {avg_accuracy:.4f}',
    f'Average Precision: {avg_precision:.4f}',
    f'Average Recall: {avg_recall:.4f}',
    f'Average F1: {avg_f1:.4f}',
    sep='\n',
)

Components of X_test: 600
elections Accuracy: 0.6567
elections Precision: 0.6656
elections Recall: 0.4153
elections F1: 0.5115


Components of X_test: 311
politics Accuracy: 0.6564
politics Precision: 0.6287
politics Recall: 0.8525
politics F1: 0.7237


Components of X_test: 211
white_house Accuracy: 0.6442
white_house Precision: 0.6433
white_house Recall: 0.7258
white_house F1: 0.6821


Components of X_test: 176
immigration Accuracy: 0.6634
immigration Precision: 0.6475
immigration Recall: 0.9046
immigration F1: 0.7548


Components of X_test: 170
healthcare Accuracy: 0.6190
healthcare Precision: 0.5886
healthcare Recall: 0.9459
healthcare F1: 0.7257


Average Accuracy: 0.6479
Average Precision: 0.6347
Average Recall: 0.7688
Average F1: 0.6795


In [18]:
# Pool the per-topic confusion matrices of the TruncatedSVD run (element-wise sum).
cm = sum(cms)

data_cm = pd.DataFrame(
    cm,
    index=['Actual Left', 'Actual Right'],
    columns=['Predicted Left', 'Predicted Right'],
)
data_cm

Unnamed: 0,Predicted Left,Predicted Right
Actual Left,2482,1543
Actual Right,1221,2683


In [19]:
# Add the TruncatedSVD run's averaged metrics to the comparison table.
cols = ['Accuracy', 'Precision', 'Recall', 'F1']
data = [[avg_accuracy, avg_precision, avg_recall, avg_f1]]
new_data_metrics = pd.DataFrame(data, columns=cols, index=['Logistic Regression with TruncatedSVD'])

# `result` is both input and output of this cell. Dropping any row that already
# carries this label keeps the cell idempotent: re-running it replaces the row
# instead of stacking a duplicate. On the first run nothing is dropped.
result = pd.concat([result.drop(index=new_data_metrics.index, errors='ignore'), new_data_metrics])
result

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.650135,0.636507,0.771884,0.681436
Logistic Regression with PCA,0.646089,0.6337,0.765136,0.677337
Logistic Regression with TruncatedSVD,0.647936,0.634742,0.768826,0.679533
