In [None]:
import sys
# Put the directory two levels above the working directory at the front of the
# import search path. Presumably this is what lets the project-level imports
# used later (`notebooks.utils`, `train`) resolve; assumes the notebook is run
# from its own directory -- TODO confirm.
sys.path.insert(0, '../../')

In [None]:
from notebooks.utils import load_corpus, load_splits, load_node_features
import pandas as pd

# Load corpus, features and splits

In [None]:
# Single source of truth for the dataset year: the corpus and the fold splits
# are both requested for the same year, so the value is defined once instead of
# being repeated as a literal in each loader call.
DATA_YEAR = '2018'

df_corpus = pd.DataFrame(load_corpus(data_year=DATA_YEAR))
df_features = load_node_features()
splits = load_splits(data_year=DATA_YEAR)

# Preview the corpus frame.
df_corpus.head()

In [None]:
# Preview the first rows of the node-feature table returned by load_node_features().
df_features.head()

In [None]:
# Show the keys of the splits mapping. train_model() below indexes it with
# str(fold_number), so these are presumably "0", "1", ... -- verify against the output.
splits.keys()

In [None]:
# Attach the node features to every corpus entry. The join is a right join on
# the corpus side, so each row of df_corpus is kept even when df_features has
# no matching 'site'; such rows end up with NaN feature values.
total_df = df_features.merge(
    df_corpus[['source_url_normalized', 'fact', 'bias']],
    left_on='site',
    right_on='source_url_normalized',
    how='right',
)
total_df.head()

# Fill missing feature values with the column means

In [None]:
# Per-column means, restricted to numeric columns. total_df also carries
# non-numeric columns (the URL key columns and, at this point, the not-yet-mapped
# 'fact'/'bias' labels -- they are only converted via label2int further down).
# Without numeric_only=True, pandas >= 2.0 raises a TypeError on such columns
# instead of silently skipping them.
column_means = total_df.mean(numeric_only=True)
print(column_means)

In [None]:
# Replace missing values with the per-column means computed above; fillna with a
# Series fills each column using the entry whose index label matches that column.
total_df = total_df.fillna(column_means)

# Keep only the key column, the three features used for modelling, and the two
# label columns. The explicit column selection already discards 'site',
# 'alexa_rank' and 'daily_pageviews_per_visitor', so no separate drop() is
# needed -- and without it the cell can be re-run without a KeyError for
# columns that were already removed.
total_df = total_df[['source_url_normalized', 'daily_time_on_site', 'total_sites_linking_in', 'bounce_rate', 'bias', 'fact']]

total_df.head()

In [None]:
from typing import Dict, List
import numpy as np
from sklearn.semi_supervised import LabelPropagation
from train import calculate_metrics

def train_model(splits: Dict[str, Dict[str, List[str]]], features: "pd.DataFrame", task):
    """Cross-validate a LabelPropagation classifier and print pooled metrics.

    For every fold in ``splits`` a fresh ``LabelPropagation`` model is fitted on
    the rows of ``features`` whose ``source_url_normalized`` is listed in the
    fold's ``"train"`` entry and evaluated on the rows listed under ``"test"``.
    The per-fold accuracy is printed; the true and predicted labels of all
    folds are pooled and passed to ``calculate_metrics``, whose results are
    printed as well.

    Parameters
    ----------
    splits:
        Mapping from the fold number as a string (``"0"``, ``"1"``, ...) to a
        dict with ``"train"`` and ``"test"`` lists of source URLs.
    features:
        DataFrame containing a ``source_url_normalized`` column, the label
        columns ``fact`` and ``bias``, and the feature columns. Every column
        other than those three is used as a model input.
    task:
        Label column to predict, either ``"fact"`` or ``"bias"``.

    Raises
    ------
    ValueError
        If ``task`` is not ``"fact"`` or ``"bias"``.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    label_columns = ("fact", "bias")
    if task not in label_columns:
        # Fail early with a clear message rather than hitting an unbound
        # variable further down when the non-feature columns are dropped.
        raise ValueError(f"task must be one of {label_columns}, got {task!r}")

    # Neither the URL key nor either label column may be used as a feature.
    non_feature_columns = ["source_url_normalized", *label_columns]

    actual = []
    predicted = []

    for fold in range(len(splits)):
        # training and testing media for the current fold
        fold_urls = splits[str(fold)]

        # Build each row mask once and reuse it for both features and labels,
        # so X and y are guaranteed to be aligned row by row.
        in_train = features["source_url_normalized"].isin(fold_urls["train"])
        in_test = features["source_url_normalized"].isin(fold_urls["test"])

        X_train = features[in_train].drop(columns=non_feature_columns)
        y_train = np.asarray(features[in_train][task])

        X_test = features[in_test].drop(columns=non_feature_columns)
        y_test = np.asarray(features[in_test][task])

        clf = LabelPropagation()

        # train the classifier using the training data
        clf.fit(X_train, y_train)
        print(clf.score(X_test, y_test))

        # accumulate the actual and predicted labels over the different folds
        actual.extend(y_test)
        predicted.extend(clf.predict(X_test))

    # calculate the performance metrics on the whole set of predictions (all folds together)
    f1, accuracy, flip_err, mae = calculate_metrics(actual, predicted)
    print('f1:', f1, 'accuracy:', accuracy, 'flip_err:', flip_err, 'mae:', mae)

In [None]:
# TODO normalize labels
from train import label2int

# Convert the 'fact' and 'bias' label columns to integers using the per-task
# lookup tables in label2int. Series.map yields NaN for any value that has no
# entry in the lookup, so this cell should be run exactly once per kernel
# session (a second run would map the already-converted integers).
total_df = total_df.assign(
    fact=total_df['fact'].map(label2int['fact']),
    bias=total_df['bias'].map(label2int['bias']),
)

total_df.head()

In [None]:
# Cross-validated LabelPropagation on the 'fact' labels; per-fold accuracy and pooled metrics are printed.
train_model(splits, total_df, "fact")

In [None]:
# Cross-validated LabelPropagation on the 'bias' labels; per-fold accuracy and pooled metrics are printed.
train_model(splits, total_df, "bias")

In [None]:
# Both label columns, kept together; the hold-out experiments below pick one of them per run.
labels = total_df[['fact', 'bias']]

In [None]:
# Model inputs: everything in total_df except the two label columns and the URL key.
features = total_df.drop(columns=['fact', 'bias', 'source_url_normalized'])

In [None]:
# Preview the feature matrix used by the hold-out experiments below.
features.head()

# Label propagation on fact

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

# Hold out 20% of the rows for testing. random_state is fixed so that the split,
# and therefore the accuracy printed below, is the same on every
# Restart & Run All instead of changing from run to run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels['fact'], test_size=0.20, random_state=42
)

# initialize
clf = LabelPropagation()

# train the classifier using the training data
clf.fit(features_train, labels_train)

# compute accuracy using test data
acc_test = clf.score(features_test, labels_test)

print("Test Accuracy:", acc_test)
# Value noted by the original author from an unseeded split (may differ now):
# Test Accuracy: 0.98

# Label propagation on bias

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

# Hold out 20% of the rows for testing. random_state is fixed so that the split,
# and therefore the accuracy printed below, is the same on every
# Restart & Run All instead of changing from run to run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels['bias'], test_size=0.20, random_state=42
)

# initialize
clf = LabelPropagation()

# train the classifier using the training data
clf.fit(features_train, labels_train)

# compute accuracy using test data
acc_test = clf.score(features_test, labels_test)

print("Test Accuracy:", acc_test)
# Value noted by the original author from an unseeded split (may differ now):
# Test Accuracy: 0.98