## Running the experiment on the top 5 categories

In [24]:
# Per-topic dataset directories. Each one is read by the preprocessing functions as
# `<url>/train/<topic>_train.csv` and `<url>/test/<topic>_test.csv`, where `<topic>` is the
# second path component of the url.
urls = ['data/elections', 'data/politics', 'data/white_house', 'data/immigration', 'data/healthcare']
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from datetime import datetime
import os

In [31]:
# Timestamp identifying this run; it becomes part of the output directory name.
# Only digits, '-' and '_' are used: ':' is not allowed in Windows file names, the date and
# time parts are now separated, and year-first ordering makes run directories sort
# chronologically.
runid = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [32]:
def init_saveplace(runid):
    """Create (if needed) and return the output directory for a run.

    The directory is ``./runs/lr-run-<runid>``. An already existing
    directory is reused without raising.
    """
    run_dir = os.path.join('.', 'runs', 'lr-run-{}'.format(runid))
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

# Output directory for this run; the CSV files written later in the notebook go here.
saveplace = init_saveplace(runid)

In [2]:
def _load_binary_split(url, split):
    """Load one split (``'train'`` or ``'test'``) of a topic, keeping only bias labels 0 and 2.

    Reads ``<url>/<split>/<topic>_<split>.csv`` where ``<topic>`` is the second
    path component of ``url``. Returns a new DataFrame whose ``content``
    column has been lower-cased.
    """
    topic = url.split("/")[1]
    frame = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
    # `.copy()` detaches the filtered rows from the frame they were selected from, so the
    # column assignment below does not write into a slice (SettingWithCopyWarning).
    frame = frame[frame['bias'].isin([0, 2])].copy()
    frame['content'] = frame['content'].str.lower()
    return frame


def preprocess_data(url):
    """Build TF-IDF features and labels for one topic.

    Parameters
    ----------
    url : str
        Dataset directory of the form ``'<root>/<topic>'``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: the TF-IDF matrices produced by
        ``TfidfVectorizer`` for the train and test splits, and the matching
        ``bias`` label arrays (only labels 0 and 2 are kept).
    """
    train_data = _load_binary_split(url, 'train')
    test_data = _load_binary_split(url, 'test')

    # The vectorizer is fitted on the training split only; the test split is transformed
    # with that same fitted vocabulary.
    tfid = TfidfVectorizer(stop_words='english')
    x_train = tfid.fit_transform(train_data['content'].values)
    x_test = tfid.transform(test_data['content'].values)

    return x_train, train_data['bias'].values, x_test, test_data['bias'].values
    

In [3]:
def train_model(x_train, y_train):
    """Fit a logistic-regression classifier with default settings and return the fitted model."""
    classifier = LogisticRegression()
    return classifier.fit(x_train, y_train)

In [4]:
def predict_on_test_data(model, x_test, y_test):
    """Score a fitted model on the test split.

    Returns ``(accuracy, precision, recall, f1, confusion_matrix)``.
    Precision, recall and F1 are computed with label 2 as the positive class.
    """
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    # Same call shape for the three positive-class metrics, evaluated in this order.
    precision, recall, f1 = (
        metric(y_test, predictions, pos_label=2)
        for metric in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1, confusion_matrix(y_test, predictions)

In [5]:
def runExperiment(url):
    """Run the pipeline for one topic: preprocess, train, then evaluate on the test split.

    Returns whatever ``predict_on_test_data`` returns for the trained model.
    """
    train_features, train_labels, test_features, test_labels = preprocess_data(url)
    fitted = train_model(train_features, train_labels)
    return predict_on_test_data(fitted, test_features, test_labels)

In [38]:
# Per-topic result tables, indexed by dataset url.
# NOTE(review): the 'F1 Macro' column receives the F1 value returned by runExperiment and
# the 'F1 Micro' column receives the accuracy. The column names are kept as they are
# because later cells and the saved CSV files refer to them.
scores = pd.DataFrame(None, urls, ['F1 Macro', 'F1 Micro', 'Precision', 'Recall'])
cms = pd.DataFrame(None, urls, ['True Left', 'False Left', 'True Right', "False Right"])

for url in urls:
    topic = url.split("/")[1]

    accuracy, precision, recall, f1, cm = runExperiment(url)

    # Assign with a single `.loc[row, column]` call per value. The chained form
    # `scores[column][url] = value` writes through an intermediate Series, which pandas
    # does not guarantee to propagate back to the DataFrame.
    scores.loc[url, 'F1 Macro'] = f1
    scores.loc[url, 'F1 Micro'] = accuracy
    scores.loc[url, 'Precision'] = precision
    scores.loc[url, 'Recall'] = recall

    # cm is indexed [true label, predicted label], labels in sorted order.
    cms.loc[url, 'True Left'] = cm[0,0]
    cms.loc[url, 'False Left'] = cm[1,0]
    cms.loc[url, 'True Right'] = cm[1,1]
    cms.loc[url, 'False Right'] = cm[0,1]

    print(f'{topic} Accuracy: {accuracy:.5f}')
    print(f'{topic} Precision: {precision:.5f}')
    print(f'{topic} Recall: {recall:.5f}')
    print(f'{topic} F1: {f1:.5f}\n\n')

# Column-wise mean over the topics.
averages = scores.mean(axis=0)

print(f'Average Accuracy: {averages["F1 Micro"]:.5f}')
print(f'Average Precision: {averages["Precision"]:.5f}')
print(f'Average Recall: {averages["Recall"]:.5f}')
print(f'Average F1: {averages["F1 Macro"]:.5f}')

elections Accuracy: 0.65846
elections Precision: 0.66995
elections Recall: 0.41531
elections F1: 0.51276


politics Accuracy: 0.66069
politics Precision: 0.63068
politics Recall: 0.86183
politics F1: 0.72835


white_house Accuracy: 0.64510
white_house Precision: 0.64297
white_house Recall: 0.73111
white_house F1: 0.68421


immigration Accuracy: 0.66230
immigration Precision: 0.64746
immigration Recall: 0.90076
immigration F1: 0.75339


healthcare Accuracy: 0.62237
healthcare Precision: 0.59067
healthcare Recall: 0.94802
healthcare F1: 0.72785


Average Accuracy: 0.64978
Average Precision: 0.63635
Average Recall: 0.77141
Average F1: 0.68131


In [56]:
# Metrics and confusion-matrix counts side by side (one row per topic), saved in the run directory.
combined_info = pd.concat((scores, cms), axis='columns')
combined_info.to_csv(os.path.join(saveplace, 'lr_nodr_stats.csv'))

In [48]:
# One-row summary table built from the `averages` currently in scope, labelled as the
# plain logistic-regression run.
cols = ['F1 Macro', 'F1 Micro', 'Precision', 'Recall']
data = [averages[cols]]  # a single Series -> a single row
data_metrics = pd.DataFrame(data, columns=cols, index=['Logistic Regression'])
data_metrics

Unnamed: 0,F1 Macro,F1 Micro,Precision,Recall
Logistic Regression,0.681313,0.649783,0.636345,0.771407


## Running the experiment on the top 5 categories using PCA

In [57]:
def preprocess_data_with_PCA(url):
    """Build TF-IDF features for one topic and reduce them with PCA.

    Parameters
    ----------
    url : str
        Dataset directory of the form ``'<root>/<topic>'``.

    Returns
    -------
    tuple
        ``(transformed_x_train, y_train, transformed_x_test, y_test)`` where the
        feature arrays are the PCA projections of the TF-IDF matrices.
    """
    # Reuse the plain TF-IDF pipeline instead of repeating it here: same CSV files,
    # same label filter (bias 0 or 2), same lower-casing and vectorizer settings.
    x_train, y_train, x_test, y_test = preprocess_data(url)

    # Design note: scikit-learn's stock PCA is used rather than a hand-written class so that
    # it has the same fit/transform shape as the other reducers tried in this project
    # (Sparse PCA, Truncated SVD) and can be swapped in for them; the authors also cite its
    # speed on this large, sparse text data.
    #
    # A fractional n_components asks PCA to keep as many components as are needed to
    # explain that share (90%) of the variance. PCA is fitted on the training split only.
    pca = PCA(n_components=0.9)
    # The TF-IDF matrices are densified with `.toarray()` before being handed to PCA.
    transformed_x_train = pca.fit_transform(x_train.toarray())
    transformed_x_test = pca.transform(x_test.toarray())

    return transformed_x_train, y_train, transformed_x_test, y_test
    

In [58]:
def runExperiment_with_pca(url):
    """Run the PCA variant of the pipeline for one topic.

    Returns a pair: the evaluation tuple from ``predict_on_test_data`` and the
    reduced test features (so the caller can read the component count).
    """
    train_features, train_labels, test_features, test_labels = preprocess_data_with_PCA(url)
    fitted = train_model(train_features, train_labels)
    evaluation = predict_on_test_data(fitted, test_features, test_labels)
    return evaluation, test_features

In [60]:
# Per-topic result tables for the PCA run, indexed by dataset url.
# NOTE(review): the 'F1 Macro' column receives the F1 value returned by the experiment and
# the 'F1 Micro' column receives the accuracy. The column names are kept as they are
# because later cells and the saved CSV files refer to them.
scores = pd.DataFrame(None, urls, ['F1 Macro', 'F1 Micro', 'Precision', 'Recall', 'Components'])
cms = pd.DataFrame(None, urls, ['True Left', 'False Left', 'True Right', "False Right"])

for url in urls:
    topic = url.split("/")[1]

    experiment_results, x_test = runExperiment_with_pca(url)
    print(f"Components of X_test: {x_test.shape[1]}")
    # Assign with a single `.loc[row, column]` call per value. The chained form
    # `scores[column][url] = value` writes through an intermediate Series, which pandas
    # does not guarantee to propagate back to the DataFrame.
    scores.loc[url, 'Components'] = x_test.shape[1]

    accuracy, precision, recall, f1, cm = experiment_results

    scores.loc[url, 'F1 Macro'] = f1
    scores.loc[url, 'F1 Micro'] = accuracy
    scores.loc[url, 'Precision'] = precision
    scores.loc[url, 'Recall'] = recall

    # cm is indexed [true label, predicted label], labels in sorted order.
    cms.loc[url, 'True Left'] = cm[0,0]
    cms.loc[url, 'False Left'] = cm[1,0]
    cms.loc[url, 'True Right'] = cm[1,1]
    cms.loc[url, 'False Right'] = cm[0,1]

    print(f'{topic} Accuracy: {accuracy:.4f}')
    print(f'{topic} Precision: {precision:.4f}')
    print(f'{topic} Recall: {recall:.4f}')
    print(f'{topic} F1: {f1:.4f}\n\n')

# Column-wise mean over the topics.
averages = scores.mean(axis=0)

print(f'Average Accuracy: {averages["F1 Micro"]:.5f}')
print(f'Average Precision: {averages["Precision"]:.5f}')
print(f'Average Recall: {averages["Recall"]:.5f}')
print(f'Average F1: {averages["F1 Macro"]:.5f}')

Components of X_test: 611
elections Accuracy: 0.6564
elections Precision: 0.6652
elections Recall: 0.4146
elections F1: 0.5109


Components of X_test: 314
politics Accuracy: 0.6539
politics Precision: 0.6265
politics Recall: 0.8525
politics F1: 0.7222


Components of X_test: 214
white_house Accuracy: 0.6414
white_house Precision: 0.6425
white_house Recall: 0.7170
white_house F1: 0.6777


Components of X_test: 179
immigration Accuracy: 0.6612
immigration Precision: 0.6474
immigration Recall: 0.8969
immigration F1: 0.7520


Components of X_test: 173
healthcare Accuracy: 0.6202
healthcare Precision: 0.5894
healthcare Recall: 0.9459
healthcare F1: 0.7263


Average Accuracy: 0.64661
Average Precision: 0.63420
Average Recall: 0.76541
Average F1: 0.67781


In [61]:
# Metrics (with component counts) and confusion-matrix counts side by side, saved in the run directory.
combined_info = pd.concat((scores, cms), axis='columns')
combined_info.to_csv(os.path.join(saveplace, 'lr_pca_stats.csv'))

In [62]:
# One-row summary built from the `averages` currently in scope, labelled as the PCA run.
cols = ['F1 Macro', 'F1 Micro', 'Precision', 'Recall']
data = [averages[cols]]  # a single Series -> a single row
new_data_metrics = pd.DataFrame(data, columns=cols, index=['Logistic Regression with PCA'])

# Stack the earlier summary row (`data_metrics`) and this one into a comparison table.
result = pd.concat([data_metrics, new_data_metrics])
result

Unnamed: 0,F1 Macro,F1 Micro,Precision,Recall
Logistic Regression,0.681313,0.649783,0.636345,0.771407
Logistic Regression with PCA,0.677814,0.646612,0.634201,0.765407


## Running the experiment on the top 5 categories using TruncatedSVD

In [63]:
from sklearn.decomposition import TruncatedSVD

In [64]:
def preprocess_data_with_SPCA(url, random_state=0):
    """Build TF-IDF features for one topic and reduce them with TruncatedSVD.

    Despite the name (kept so existing callers keep working), the reduction
    applied here is ``TruncatedSVD``, not Sparse PCA.

    Parameters
    ----------
    url : str
        Dataset directory of the form ``'<root>/<topic>'``. ``<topic>`` must be
        one of the keys of the per-topic component table below.
    random_state : int or None, default 0
        Seed passed to ``TruncatedSVD`` so that repeated runs give the same
        projection. Pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(transformed_x_train, y_train, transformed_x_test, y_test)``.

    Raises
    ------
    KeyError
        If the topic has no entry in the component table.
    """
    topic = url.split("/")[1]

    # Reuse the plain TF-IDF pipeline instead of repeating it here: same CSV files,
    # same label filter (bias 0 or 2), same lower-casing and vectorizer settings.
    x_train, y_train, x_test, y_test = preprocess_data(url)

    # Fixed number of SVD components per topic.
    # NOTE(review): earlier revisions carried a commented-out snippet that computed, from
    # `svd.singular_values_`, how many components reach 90% cumulative variance; these
    # numbers presumably come from that computation - confirm before changing them.
    num_components = {'elections': 600,  'politics': 311, 'white_house': 211, 'immigration': 176, 'healthcare': 170}

    # TruncatedSVD is applied to the TF-IDF matrices as returned (no `.toarray()`), and is
    # fitted on the training split only.
    svd = TruncatedSVD(n_components=num_components[topic], random_state=random_state)
    transformed_x_train = svd.fit_transform(x_train)
    transformed_x_test = svd.transform(x_test)

    return transformed_x_train, y_train, transformed_x_test, y_test
    

In [65]:
def runExperiment_with_spca(url):
    """Run the ``preprocess_data_with_SPCA`` variant of the pipeline for one topic.

    Returns a pair: the evaluation tuple from ``predict_on_test_data`` and the
    reduced test features (so the caller can read the component count).
    """
    train_features, train_labels, test_features, test_labels = preprocess_data_with_SPCA(url)
    fitted = train_model(train_features, train_labels)
    evaluation = predict_on_test_data(fitted, test_features, test_labels)
    return evaluation, test_features

In [66]:
# Per-topic result tables for the TruncatedSVD run, indexed by dataset url.
# NOTE(review): the 'F1 Macro' column receives the F1 value returned by the experiment and
# the 'F1 Micro' column receives the accuracy. The column names are kept as they are
# because later cells and the saved CSV files refer to them.
scores = pd.DataFrame(None, urls, ['F1 Macro', 'F1 Micro', 'Precision', 'Recall', 'Components'])
cms = pd.DataFrame(None, urls, ['True Left', 'False Left', 'True Right', "False Right"])

for url in urls:
    topic = url.split("/")[1]

    experiment_results, x_test = runExperiment_with_spca(url)
    print(f"Components of X_test: {x_test.shape[1]}")
    # Assign with a single `.loc[row, column]` call per value. The chained form
    # `scores[column][url] = value` writes through an intermediate Series, which pandas
    # does not guarantee to propagate back to the DataFrame.
    scores.loc[url, 'Components'] = x_test.shape[1]

    accuracy, precision, recall, f1, cm = experiment_results

    scores.loc[url, 'F1 Macro'] = f1
    scores.loc[url, 'F1 Micro'] = accuracy
    scores.loc[url, 'Precision'] = precision
    scores.loc[url, 'Recall'] = recall

    # cm is indexed [true label, predicted label], labels in sorted order.
    cms.loc[url, 'True Left'] = cm[0,0]
    cms.loc[url, 'False Left'] = cm[1,0]
    cms.loc[url, 'True Right'] = cm[1,1]
    cms.loc[url, 'False Right'] = cm[0,1]

    print(f'{topic} Accuracy: {accuracy:.4f}')
    print(f'{topic} Precision: {precision:.4f}')
    print(f'{topic} Recall: {recall:.4f}')
    print(f'{topic} F1: {f1:.4f}\n\n')

# Column-wise mean over the topics.
averages = scores.mean(axis=0)

print(f'Average Accuracy: {averages["F1 Micro"]:.5f}')
print(f'Average Precision: {averages["Precision"]:.5f}')
print(f'Average Recall: {averages["Recall"]:.5f}')
print(f'Average F1: {averages["F1 Macro"]:.5f}')

Components of X_test: 600
elections Accuracy: 0.6579
elections Precision: 0.6692
elections Recall: 0.4140
elections F1: 0.5115


Components of X_test: 311
politics Accuracy: 0.6545
politics Precision: 0.6264
politics Recall: 0.8560
politics F1: 0.7234


Components of X_test: 211
white_house Accuracy: 0.6386
white_house Precision: 0.6378
white_house Recall: 0.7241
white_house F1: 0.6782


Components of X_test: 176
immigration Accuracy: 0.6612
immigration Precision: 0.6462
immigration Recall: 0.9027
immigration F1: 0.7532


Components of X_test: 170
healthcare Accuracy: 0.6235
healthcare Precision: 0.5914
healthcare Recall: 0.9480
healthcare F1: 0.7284


Average Accuracy: 0.64714
Average Precision: 0.63420
Average Recall: 0.76894
Average F1: 0.67894


In [67]:
# Metrics (with component counts) and confusion-matrix counts side by side, saved in the run directory.
combined_info = pd.concat((scores, cms), axis='columns')
combined_info.to_csv(os.path.join(saveplace, 'lr_tsvd_stats.csv'))

In [68]:
# One-row summary built from the `averages` currently in scope, labelled as the TruncatedSVD run.
cols = ['F1 Macro', 'F1 Micro', 'Precision', 'Recall']
data = [averages[cols]]  # a single Series -> a single row
new_data_metrics = pd.DataFrame(data, columns=cols, index=['Logistic Regression with TruncatedSVD'])

# `result` is extended in place, so a second execution of this cell would otherwise append
# the same row again. Drop any row already carrying this label before appending;
# `errors='ignore'` makes the first execution (label not present yet) a no-op.
result = pd.concat([result.drop(index=new_data_metrics.index, errors='ignore'), new_data_metrics])
result

Unnamed: 0,F1 Macro,F1 Micro,Precision,Recall
Logistic Regression,0.681313,0.649783,0.636345,0.771407
Logistic Regression with PCA,0.677814,0.646612,0.634201,0.765407
Logistic Regression with TruncatedSVD,0.678945,0.647139,0.6342,0.768941


In [69]:
# Save the cross-method comparison table in the run directory.
result.to_csv(os.path.join(saveplace, 'summarized_results.csv'))