In [6]:
# Install scikit-learn into the environment of the running kernel.
# NOTE(review): the version is not pinned — consider pinning it so the notebook is reproducible.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [7]:
import stop_words as sw
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# Disable pandas' chained-assignment (SettingWithCopy) check for the whole session.
pd.set_option('mode.chained_assignment', None)

In [8]:
# One data directory per news topic; each is passed to kmeans_predict_test as `url`.
urls = [
    f'data/{topic}'
    for topic in ('elections', 'politics', 'white_house', 'immigration', 'healthcare')
]

In [9]:
def _load_biased_articles(url, split):
    """Read one split of a topic and return only its left/right-labelled rows.

    Args:
        url: Topic directory, e.g. 'data/elections'. The last path component
            is used as the file-name prefix.
        split: Sub-directory and file-name suffix, 'train' or 'test'.

    Returns:
        An independent DataFrame (not a view of the CSV frame) holding the rows
        whose 'bias' is 0 or 2, with an added 'stop_content' column.
    """
    topic = url.split("/")[-1]
    data = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
    # Filter before preprocessing so the per-row text pass only touches rows that
    # are actually used; .copy() makes the later column assignments write to a
    # real frame instead of a slice of `data`.
    biased = data[data['bias'].isin([0, 2])].copy()
    # presumably sw.stop_words returns the tokens of the text with stop words
    # removed — TODO confirm against the stop_words module.
    biased['stop_content'] = biased['content'].apply(lambda x: ' '.join(sw.stop_words(x)))
    return biased


def kmeans_predict_test(url, random_state=None):
    """Cluster a topic's biased training articles into two groups and score the test set.

    A bag-of-words vocabulary and a 2-cluster KMeans model are fitted on the
    training rows whose 'bias' is 0 (reported as "Left") or 2 (reported as
    "Right"). The test rows with the same labels are then assigned to a cluster
    and cross-tabulated against their true label.

    Cluster ids are arbitrary: "Predicted 0"/"Predicted 1" are just the KMeans
    cluster numbers and carry no left/right meaning by themselves.

    Args:
        url: Topic directory containing 'train/<topic>_train.csv' and
            'test/<topic>_test.csv', where <topic> is the last path component.
        random_state: Optional seed forwarded to KMeans. The default (None)
            keeps the unseeded behaviour; pass an int for repeatable clusters.

    Returns:
        A 2x2 DataFrame of counts with rows 'Predicted 1', 'Predicted 0' and
        columns 'Actual Left', 'Actual Right'. The matrix is also printed.
    """
    train = _load_biased_articles(url, 'train')
    vectorizer = CountVectorizer()
    # Densified as before; the vocabulary learned here is reused for the test set.
    train_bow = vectorizer.fit_transform(train['stop_content']).toarray()

    kmeans = KMeans(n_clusters=2, init='random', random_state=random_state).fit(train_bow)

    test = _load_biased_articles(url, 'test')
    test['prediction'] = kmeans.predict(vectorizer.transform(test['stop_content']))

    confusion_matrix = pd.DataFrame(0, ['Predicted 1', 'Predicted 0'], ['Actual Left', 'Actual Right'])
    for prediction in (0, 1):
        for actual, side in ((0, 'Left'), (2, 'Right')):
            hits = (test['prediction'] == prediction) & (test['bias'] == actual)
            # Single-step .loc write: chained `df[col][row] = ...` is not guaranteed
            # to modify the frame (and never does under pandas Copy-on-Write).
            confusion_matrix.loc[f'Predicted {prediction}', f'Actual {side}'] = int(hits.sum())
    print(f'For {url}:\n{confusion_matrix}\n')
    return confusion_matrix

In [11]:
# Evaluate every topic in order; results are keyed by the topic's data directory.
confusion_matrices = {
    topic_dir: kmeans_predict_test(topic_dir)
    for topic_dir in urls
}

For data/elections:
             Actual Left  Actual Right
Predicted 1         1594          1352
Predicted 0          341           124

For data/politics:
             Actual Left  Actual Right
Predicted 1          250           117
Predicted 0          514           737

For data/white_house:
             Actual Left  Actual Right
Predicted 1            1             0
Predicted 0          512           569

For data/immigration:
             Actual Left  Actual Right
Predicted 1          371           521
Predicted 0           20             3

For data/healthcare:
             Actual Left  Actual Right
Predicted 1          115            45
Predicted 0          307           436

