In [2]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import stop_words as sw
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. NOTE(review): this hides the warning only; it does not make chained
# indexing assignments safe.
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package stopwords to /Users/tavin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tavin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Dataset directories, one per news topic, all located under data/.
urls = [
    f'data/{topic}'
    for topic in ('elections', 'politics', 'white_house', 'immigration', 'healthcare')
]

In [14]:
def _load_biased_split(url, split):
    """Load one split ('train' or 'test') of a topic and keep only bias 0/2 rows.

    Reads ``{url}/{split}/<topic>_{split}.csv`` where ``<topic>`` is the last
    path component of ``url``, adds a ``stop_content`` column built by joining
    the output of ``sw.stop_words`` with spaces, and returns an independent
    copy of the rows whose ``bias`` is 0 or 2.
    """
    topic = url.split("/")[-1]
    frame = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
    # presumably sw.stop_words returns the tokens left after stop-word removal
    # -- verify against the stop_words module.
    frame['stop_content'] = frame['content'].apply(lambda x: ' '.join(sw.stop_words(x)))
    # .copy() so that columns added later land on a real frame, not on a slice
    # of `frame`.
    return frame[frame['bias'].isin([0, 2])].copy()


def lr_predict_test(url):
    """Train a bag-of-words logistic regression on one topic and score it.

    The model is fitted on the topic's training CSV and evaluated on its test
    CSV, using only rows whose ``bias`` is 0 or 2. In the resulting matrix,
    bias 0 is labelled "Left" and bias 2 is labelled "Right".

    Args:
        url: Directory of the topic dataset, e.g. ``'data/elections'``. It must
            contain ``train/<topic>_train.csv`` and ``test/<topic>_test.csv``
            with at least ``content`` and ``bias`` columns.

    Returns:
        A 2x2 ``pandas.DataFrame`` of counts with rows ``Predicted Left`` /
        ``Predicted Right`` and columns ``Actual Left`` / ``Actual Right``.
        The same matrix is printed before being returned.
    """
    side_names = {0: 'Left', 2: 'Right'}

    biased_data = _load_biased_split(url, 'train')
    biased_content = biased_data['stop_content']
    biased_bow_vectorizer = CountVectorizer().fit(biased_content)
    biased_bow = biased_bow_vectorizer.transform(biased_content).toarray()

    lr = LogisticRegression(random_state=0).fit(biased_bow, biased_data['bias'])

    test_biased_data = _load_biased_split(url, 'test')
    test_biased_data['prediction'] = lr.predict(
        biased_bow_vectorizer.transform(test_biased_data['stop_content'])
    )

    confusion_matrix = pd.DataFrame(
        0,
        index=['Predicted Left', 'Predicted Right'],
        columns=['Actual Left', 'Actual Right'],
    )
    for prediction, actual in [[0, 0], [0, 2], [2, 0], [2, 2]]:
        matches = (test_biased_data['prediction'] == prediction) & (test_biased_data['bias'] == actual)
        # Single .loc[row, col] assignment: chained indexing
        # (frame[col][row] = ...) may write to a temporary and leave the
        # matrix untouched.
        confusion_matrix.loc[
            f'Predicted {side_names[prediction]}', f'Actual {side_names[actual]}'
        ] = int(matches.sum())
    print(f'For {url}:\n{confusion_matrix}\n')
    return confusion_matrix

In [15]:
# Train and evaluate once per topic directory; results are keyed by that path.
confusion_matrices = dict(
    (topic_dir, lr_predict_test(topic_dir)) for topic_dir in urls
)

For data/elections:
                 Actual Left  Actual Right
Predicted Left          1402           515
Predicted Right          533           961
For data/politics:
                 Actual Left  Actual Right
Predicted Left           414           168
Predicted Right          350           686
For data/white_house:
                 Actual Left  Actual Right
Predicted Left           279           161
Predicted Right          234           408
For data/immigration:
                 Actual Left  Actual Right
Predicted Left           263           117
Predicted Right          128           407
For data/healthcare:
                 Actual Left  Actual Right
Predicted Left           226            78
Predicted Right          196           403
