# Source Prediction

In [1]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    classes = list(set(classes)) # classes[unique_labels(y_true, y_pred)]
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    # Only use the labels that appear in the data
    #classes = list(set(classes)) # classes[unique_labels(y_true, y_pred)]
    #if normalize:
        #cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    #else:
        #print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax



In [1]:
import pandas as pd

def load_data():
    article_table = pd.read_csv("../data/cache/webhose_pruned/webhose_pruned.csv")
    return article_table

In [2]:
articles = load_data()
articles

Unnamed: 0.1,Unnamed: 0,site,author,title,text
0,0,whio.com,,"Under GOP grilling, Clinton seeks to close boo...","> South Carolina Under GOP grilling, Clinton s..."
1,1,cnn.com,Eric Bradner,"Ben Carson's violent past: Bricks, bats, hammers",Story highlights Ben Carson reveals he wasn't ...
2,10,whio.com,,Justices hear arguments over life sentences fo...,"Updated: 2:02 p.m. Tuesday, Oct. 13, 2015 | Po..."
3,17,heraldscotland.com,,EU referendum fight hots up as Vote Leave laun...,EU referendum fight hots up as Vote Leave laun...
4,23,scoop.co.nz,,Spelling correction sought for Otago rivers,23 October 2015 \nSpelling correction sought f...
5,47,cbs8.com,cbs8,Cox Enterprises Signs American Business Act on...,Cox Enterprises\nCompany pledges to send zero...
6,49,washingtonexaminer.com,Daniel Chaitin,California becomes first state to ban 'Redskin...,The state of California has banned the use of ...
7,59,cbs8.com,cbs8,Lenox Advisors Holds Second Annual Corporate S...,"Lenox Advisors, Inc.\nNEW YORK , Oct. 23, 201..."
8,62,cbs8.com,cbs8,AP-GfK Poll: GOP says Trump tops list of elect...,By STEVE PEOPLES and EMILY SWANSON\nAssociated...
9,63,whio.com,,Obama defends Black Lives Matter movement,> Political Obama defends Black Lives Matter m...


In [4]:
articles.shape

(9113, 5)

## TFIDF


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = list(articles.text)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
tfidf_matrix = vectorizer.transform(corpus).todense().tolist()

## Back to our regular program

In [5]:
import json
with open("../data/cache/webhose_pruned_as_vectors/doc_as_vectors.json", 'r') as in_file:
    as_vectors = json.load(in_file)

In [6]:
ast = pd.DataFrame(as_vectors)
ast

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3799,3800,3801,3802,3803,3804,3805,3806,3807,3808
0,0.0,0.000,0.00,0.250,0.0,-0.125,-0.125,0.0,0.625,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,-0.250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000,0.00,0.000,0.0,0.375,0.000,0.0,0.000,0.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,-0.250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.000,0.00,0.000,0.0,0.000,0.125,0.0,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,0.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.000,0.00,0.000,0.0,0.000,0.000,0.0,0.000,-0.250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
articles.site.value_counts()

cbs8.com                  2260
usnews.com                1025
journal-news.com           886
whio.com                   716
washingtonexaminer.com     644
beforeitsnews.com          508
cnn.com                    482
heraldscotland.com         438
politico.com               391
dnaindia.com               357
deccanchronicle.com        319
voxy.co.nz                 307
scoop.co.nz                277
washingtonpost.com         272
nymag.com                  231
Name: site, dtype: int64

In [8]:
source_list = list(set(articles.site))
sources = articles.site
#sources_discrete = 

In [None]:
vc = articles.site.value_counts()
vc_greater = vc[vc > 100]

articles[articles.site.isin(list(vc_greater.index))].site.value_counts()

In [9]:
from sklearn.model_selection import train_test_split

X = ast
y = articles.site

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [9]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [10]:
clf.score(X_test, y_test)

0.4465167306637411

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [11]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42, multi_class='ovr') # one vs all
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
clf.score(X_test, y_test)

0.46242457487657707

In [13]:
%matplotlib qt
from sklearn.metrics import confusion_matrix
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
numpy.set_printoptions(threshold=5)
#cm = confusion_matrix(clf.predict(X_test), y_test)
predictions = clf.predict(X_test)
plot_confusion_matrix(y_test, predictions, list(articles.site))

[[ 6  1  0 ...  0 11  2]
 [ 1 49  0 ...  0  5  3]
 [ 0  2 12 ...  0  9  3]
 ...
 [ 0  2  0 ... 14  2  4]
 [ 3  3  0 ...  0 64  9]
 [ 3  2  0 ...  0 17 17]]


<matplotlib.axes._subplots.AxesSubplot at 0x7fbab0578390>

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [None]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)



In [15]:
clf.score(X_test, y_test)

0.2633022490400439

In [None]:
#predictions = clf.predict(X_test)

#y_test_list = list(y_test)

#for i in range(0, len(predictions)):
#    print(predictions[i], y_test_list[i])

## Balance the dataset?

https://beckernick.github.io/oversampling-modeling/

In [20]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

X = ast
y = articles.site

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

sm = SMOTE(random_state=13)
#sm = RandomOverSampler(random_state=13)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [16]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train_res,y_train_res)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
clf.score(X_test, y_test)

0.4476138233680746

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42, multi_class='ovr') # one vs all
clf.fit(X_train_res, y_train_res)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
clf.score(X_test, y_test)

0.4212835984640702

In [21]:
predictions = clf.predict(X_test)
plot_confusion_matrix(y_test, predictions, list(articles.site))

[[49 12  3 ... 13  7  6]
 [ 5 13  4 ... 10  7 15]
 [ 2  2 26 ...  1  0  2]
 ...
 [ 7  6  3 ... 39  4  9]
 [ 3  5  7 ...  1 51  4]
 [ 2  4  1 ...  5  2 14]]


<matplotlib.axes._subplots.AxesSubplot at 0x7f7ce5365320>