In [None]:
import sys
from goose3 import Goose
import unicodedata
import tldextract

sys.setrecursionlimit(2500)

def fix(text):
    try:
        text = text.decode("ascii", "ignore")
    except:
        text = text.strip()
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        text = text.replace("\r", "")
        text = text.replace("\ufffd'", "")
    return text

def scrape(url):
    g = Goose()
    try:
        article = g.extract(url=url)
    except:
         return "Unexpected error when scraping", sys.exc_info()[0]
    text = fix(article.cleaned_text)
    title = fix(article.title)
    domain = tldextract.extract(url)[1]

    return title, text, domain

In [None]:
scrape("https://www.cnn.com/2019/11/22/politics/nunes-vienna-trip-ukrainian-prosecutor-biden/index.html")

In [None]:
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk import tokenize
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import collections
from nltk.stem.porter import *
from nltk import word_tokenize
from nltk.util import ngrams
import string
import sys
from resources.readability import Readability
import os
import pickle
import json
from importlib import reload

DEFAULT_PATH = os.getcwd()

def setup_path(*args):
    path = os.path.join(*args)
    if not os.path.isdir(path):
        os.makedirs(path)
    return path

def load_acl13_lexicons():
    with open(os.path.join(DEFAULT_PATH, "resources", "bias-lexicon.txt")) as lex:
        bias = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "assertives.txt")) as lex:
        assertives = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "factives.txt")) as lex:
        factives = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "hedges.txt")) as lex:
        hedges = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "implicatives.txt")) as lex:
        implicatives = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "report_verbs.txt")) as lex:
        report_verbs = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "negative-words.txt")) as lex:
        negative = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "positive-words.txt")) as lex:
        positive = set([l.strip() for l in lex])
    with open(os.path.join(DEFAULT_PATH, "resources", "subjclueslen.txt")) as lex:
        wneg = set([]);wpos = set([]);wneu = set([]);sneg = set([]);spos = set([]);sneu = set([])
        for line in lex:
            line = line.split()
            if line[0] == "type=weaksubj":
                if line[-1] == "priorpolarity=negative":
                    wneg.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=positive":
                    wpos.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=neutral":
                    wneu.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=both":
                    wneg.add(line[2].split("=")[1])
                    wpos.add(line[2].split("=")[1])
            elif line[0] == "type=strongsubj":
                if line[-1] == "priorpolarity=negative":
                    sneg.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=positive":
                    spos.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=neutral":
                    sneu.add(line[2].split("=")[1])
                elif line[-1] == "priorpolarity=both":
                    spos.add(line[2].split("=")[1])
                    sneg.add(line[2].split("=")[1])
    return bias, assertives, factives, hedges, implicatives, report_verbs, positive, negative, wneg, wpos, wneu, sneg, spos, sneu

def bias_lexicon_feats(text):
    bias, assertives, factives, hedges, implicatives, report_verbs, positive_op, negative_op, wneg, wpos, wneu, sneg, spos, sneu = load_acl13_lexicons()
    tokens = word_tokenize(text)
    bigrams = [" ".join(bg) for bg in ngrams(tokens, 2)]
    trigrams = [" ".join(tg) for tg in ngrams(tokens, 3)]
    bias_count = float(sum([tokens.count(b) for b in bias]))/len(tokens)
    assertives_count = float(sum([tokens.count(a) for a in assertives]))/len(tokens)
    factives_count = float(sum([tokens.count(f) for f in factives]))/len(tokens)
    hedges_count = sum([tokens.count(h) for h in hedges]) +  sum([bigrams.count(h) for h in hedges]) + sum([trigrams.count(h) for h in hedges])
    hedges_count = float(hedges_count)/len(tokens)
    implicatives_count = float(sum([tokens.count(i) for i in implicatives]))/len(tokens)
    report_verbs_count = float(sum([tokens.count(r) for r in report_verbs]))/len(tokens)
    positive_op_count = float(sum([tokens.count(p) for p in positive_op]))/len(tokens)
    negative_op_count = float(sum([tokens.count(n) for n in negative_op]))/len(tokens)
    wneg_count = float(sum([tokens.count(n) for n in wneg]))/len(tokens)
    wpos_count = float(sum([tokens.count(n) for n in wpos]))/len(tokens)
    wneu_count = float(sum([tokens.count(n) for n in wneu]))/len(tokens)
    sneg_count = float(sum([tokens.count(n) for n in sneg]))/len(tokens)
    spos_count = float(sum([tokens.count(n) for n in spos]))/len(tokens)
    sneu_count = float(sum([tokens.count(n) for n in sneu]))/len(tokens)
    return bias_count, assertives_count, factives_count, hedges_count, implicatives_count, report_verbs_count, positive_op_count, negative_op_count, wneg_count, wpos_count, wneu_count, sneg_count, spos_count, sneu_count

def ttr(text):
    words = text.split()
    dif_words = len(set(words))
    tot_words = len(words)
    if tot_words == 0:
        return 0
    return (float(dif_words)/tot_words)

def POS_features(fn, text, outpath):
    fname = os.path.join(outpath, fn.split(".")[0]+"_tagged.txt")

    pos_tags = ["CC","CD","DT","EX","FW","IN","JJ","JJR","JJS","LS","MD","NN","NNS","NNP","NNPS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SYM","TO","UH","WP$","WRB","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP"]
    sents = tokenize.sent_tokenize(text)
    counts_norm = []
    allwords = []
    sents = tokenize.sent_tokenize(text)

    with open(fname, "w") as out:
        for sent in sents:
            words = sent.strip(".").split()
            tags = nltk.pos_tag(words)
            strtags = ["/".join((wt[0],wt[1])) for wt in tags]
            out.write("".join(strtags)+" ")

    with open(fname, "r") as fl:
        line = fl.readline() #each file is one line
        
    print(line)

    wordandtag = line.strip().split()
    tags = [wt.split("/")[1] for wt in wordandtag]
    counts = collections.Counter(tags)

    for pt in pos_tags:
        try:
            counts_norm.append(float(counts[pt])/len(tags))
        except:
            counts_norm.append(0)

    return counts_norm

def vadersent(text):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(text)
    return vs['neg'], vs['neu'], vs['pos']

def readability(text):
    rd = Readability(text)
    fkg_score = rd.FleschKincaidGradeLevel()
    SMOG = rd.SMOGIndex()
    return fkg_score, SMOG

def wordlen_and_stop(text):
    with open(os.path.join(DEFAULT_PATH, "resources", "stopwords.txt")) as data:
        stopwords = [w.strip() for w in data]
    set(stopwords)
    words = word_tokenize(text)
    WC = len(words)
    stopwords_in_text = [s for s in words if s in stopwords]
    percent_sws = float(len(stopwords_in_text))/len(words)
    lengths = [len(w) for w in words if w not in stopwords]
    if len(lengths) == 0:
        word_len_avg = 3
    else:
        word_len_avg = float(sum(lengths))/len(lengths)
    return percent_sws, word_len_avg, WC

def stuff_LIWC_leftout(text):
    puncs = set(string.punctuation)
    tokens = word_tokenize(text)
    quotes = tokens.count("\"")+tokens.count('``')+tokens.count("''")
    Exclaim = tokens.count("!")
    AllPunc = 0
    for p in puncs:
        AllPunc+=tokens.count(p)
    words_upper = 0
    for w in tokens:
        if w.isupper():
            words_upper+=1
    allcaps = float(words_upper)/len(tokens)
    return (float(quotes)/len(tokens))*100, (float(Exclaim)/len(tokens))*100, (float(AllPunc)/len(tokens))*100, allcaps

def subjectivity(text):
    loaded_model = pickle.load(open(os.path.join(DEFAULT_PATH, 'resources', 'NB_Subj_Model.sav'), 'rb'), encoding = 'latin1')
    count_vect = pickle.load(open(os.path.join(DEFAULT_PATH, 'resources', 'count_vect.sav'), 'rb'), encoding = 'latin1')
    tfidf_transformer = pickle.load(open(os.path.join(DEFAULT_PATH, 'resources', 'tfidf_transformer.sav'), 'rb'), encoding = 'latin1')
    X_new_counts = count_vect.transform([text])
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    result = loaded_model.predict_proba(X_new_tfidf)
    prob_obj = result[0][0]
    prob_subj = result[0][1]
    return prob_obj, prob_subj

def load_LIWC_dictionaries(filepath=os.path.join(DEFAULT_PATH, "resources")):
    cat_dict = {}
    stem_dict = {}
    counts_dict = {}
    with open(os.path.join(filepath, "LIWC2007_English100131.dic")) as raw:
        raw.readline()
        for line in raw:
            if line.strip() == "%":
                break
            line = line.strip().split()
            cat_dict[line[0]] = line[1]
            counts_dict[line[0]] = 0
        for line in raw:
            line = line.strip().split()
            stem_dict[line[0]] = [l.replace("*", "") for l in line[1:]]
    return cat_dict, stem_dict, counts_dict


def LIWC(text, cat_dict, stem_dict, counts_dict):
    for key in counts_dict:
        counts_dict[key] = 0
    tokens = word_tokenize(text)
    WC = len(tokens)
    stemmer = PorterStemmer()
    stemed_tokens = [stemmer.stem(t) for t in tokens]

    # count and percentage
    for stem in stem_dict:
        count = stemed_tokens.count(stem.replace("*", ""))
        if count > 0:
            for cat in stem_dict[stem]:
                counts_dict[cat] += count
    counts_norm = [float(counts_dict[cat]) / WC * 100 for cat in counts_dict]
    cats = [cat_dict[cat] for cat in cat_dict]
    return counts_norm, cats

def source_encoding(source):
    #load source dictionary
    with open(os.path.join(DEFAULT_PATH, "resources", "source_codes.json")) as j:
        already_encoded_domains = json.loads(j.readline()) #first line is dictionary
        last_code = json.loads(j.readline())[0] #second line is array with single value of last code
    #get source encoding feature
    if source in already_encoded_domains.keys():
        return already_encoded_domains[source]
    else:
        return last_code+1

#def fix(text): # this is just my frustating attempt to fix unicode junk
 #   text = text.replace("\r", "").replace('\n', "").replace("\ufffd'", "")
  #  text = text.decode("ascii", "ignore")
   # return text

def make_str(seq):
    strseq = [str(s) for s in seq]
    return strseq

#main

def start(title_text, text, source, outpath):
    # Setup path function will create the output directory if it does not exist
    pos_features_path = setup_path(outpath, "pos_tagged_files")

    cat_dict, stem_dict, counts_dict = load_LIWC_dictionaries()
    liwc_cats = [cat_dict[cat] for cat in cat_dict]
    pos_tags = ["CC","CD","DT","EX","FW","IN","JJ","JJR","JJS","LS","MD","NN","NNS","NNP","NNPS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SYM","TO","UH","WP$","WRB","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP"]
    pos_tags_titles = [t+"_title" for t in pos_tags]
    liwc_cats_title = [t+"_title" for t in liwc_cats]
   
    if len(text) == 0:
        raise ValueError("No Text")

    pid = 1

    #source
    source_code = source_encoding(source)
    
    #body
    quotes, Exclaim, AllPunc, allcaps = stuff_LIWC_leftout(text)
    lex_div = ttr(text)
    counts_norm = POS_features("input", text, pos_features_path)
    counts_norm = [str(c) for c in counts_norm]
    counts_norm_liwc, liwc_cats = LIWC(text, cat_dict, stem_dict, counts_dict)
    counts_norm_liwc = [str(c) for c in counts_norm_liwc]
    vadneg, vadneu, vadpos = vadersent(text)
    fke, SMOG = readability(text)
    stop, wordlen, WC = wordlen_and_stop(text)
    NB_pobj, NB_psubj = subjectivity(text)
    bias_count, assertives_count, factives_count, hedges_count, implicatives_count, report_verbs_count, positive_op_count, negative_op_count, wneg_count, wpos_count, wneu_count, sneg_count, spos_count, sneu_count = bias_lexicon_feats(text)

    #title
    quotes_titles, Exclaim_titles, AllPunc_titles, allcaps_titles = stuff_LIWC_leftout(title_text)
    lex_div_title = ttr(title_text)
    counts_norm_title = POS_features("input_title", title_text, pos_features_path)
    counts_norm_title = [str(c) for c in counts_norm]
    counts_norm_liwc_title, liwc_cats_title2 = LIWC(title_text, cat_dict, stem_dict, counts_dict)
    counts_norm_liwc_title = [str(c) for c in counts_norm_liwc_title]
    vadneg_title, vadneu_title, vadpos_title = vadersent(title_text)
    fke_title, SMOG_title = readability(title_text)
    stop_title, wordlen_title, WC_title = wordlen_and_stop(title_text)
    NB_pobj_title, NB_psubj_title = subjectivity(title_text)
    bias_count_title, assertives_count_title, factives_count_title, hedges_count_title, implicatives_count_title, report_verbs_count_title, positive_op_count_title, negative_op_count_title, wneg_count_title, wpos_count_title, wneu_count_title, sneg_count_title, spos_count_title, sneu_count_title = bias_lexicon_feats(title_text)

    with open(os.path.join(outpath, "all_features.csv"), "w") as out:
        seq = ("pid,source_code,bias_count,assertives_count,factives_count,hedges_count,implicatives_count,report_verbs_count,positive_op_count,negative_op_count,wneg_count,wpos_count,wneu_count,sneg_count,spos_count,sneu_count,TTR,vad_neg,vad_neu,vad_pos,FKE,SMOG,stop,wordlen,WC,NB_pobj,NB_psubj,quotes,Exclaim,AllPunc,allcaps",",".join(pos_tags),",".join(liwc_cats), "TTR_title,vad_neg_title,vad_neu_title,vad_pos_title,FKE_title,SMOG_title,stop_title,wordlen_title,WC_title,NB_pobj_title,NB_psubj_title,quotes_title,Exclaim_title,AllPunc_title,allcaps_title",",".join(pos_tags_titles),",".join(liwc_cats_title),"bias_count_title,assertives_count_title,factives_count_title,hedges_count_title,implicatives_count_title,report_verbs_count_title,positive_op_count_title,negative_op_count_title,wneg_count_title,wpos_count_title,wneu_count_title,sneg_count_title,spos_count_title,sneu_count_title")
        out.write(",".join(seq)+"\n")
        seq = (pid, source_code, bias_count, assertives_count, factives_count, hedges_count, implicatives_count, report_verbs_count, positive_op_count, negative_op_count, wneg_count, wpos_count, wneu_count, sneg_count, spos_count, sneu_count, lex_div,vadneg,vadneu,vadpos,fke,SMOG,stop,wordlen,WC,NB_pobj,NB_psubj,quotes,Exclaim,AllPunc,allcaps, ",".join(counts_norm), ",".join(counts_norm_liwc),lex_div_title,vadneg_title,vadneu_title,vadpos_title,fke_title,SMOG_title,stop_title,wordlen_title,WC_title,NB_pobj_title,NB_psubj_title,quotes_titles,Exclaim_titles,AllPunc_titles,allcaps_titles, ",".join(counts_norm_title), ",".join(counts_norm_liwc_title),bias_count_title, assertives_count_title, factives_count_title, hedges_count_title, implicatives_count_title, report_verbs_count_title, positive_op_count_title, negative_op_count_title, wneg_count_title, wpos_count_title, wneu_count_title, sneg_count_title, spos_count_title, sneu_count_title)
        seq = make_str(seq)
        feat_str = ",".join(seq)
        out.write(feat_str + "\n")

In [None]:
import time
starttime = time.time()
def timeee():
    x = scrape("https://www.cnn.com/2019/11/22/politics/nunes-vienna-trip-ukrainian-prosecutor-biden/index.html")
    start(x[0], x[1], x[2], os.getcwd() + r'\out')
timeee()
end = time.time()
starttime - end

In [None]:
import pandas as pd
df = pd.read_csv('NELA.csv')

In [None]:
df2 = pd.read_csv('all_features.csv')

In [None]:
cols = df2.columns
cols = [x.strip() for x in cols]
notin = []
i = 0
for col in df.columns:
    if col.strip() not in cols:
        i += 1
        print(col)
        notin.append(col)
print(i)

In [None]:
df.drop([' source', ' date'], axis = 1, inplace = True)

In [None]:
notin.remove(' source')

In [None]:
notin.remove(' date')

In [None]:
for col in notin:
    print(df.columns.get_loc(col))

In [None]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\User\Documents\HackHarvard\file.json"

In [40]:
from googleapiclient.discovery import build
import pprint
my_api_key = "AIzaSyB5j2lTK4tVJsPTWmwXJ5ypn1Hojjh-RdU"
my_cse_id = "011673657838978336298:dqh43ak8nfj"

def google_search(search_term, api_key, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res

results = google_search(
    'poo', my_api_key, my_cse_id, num=10)

In [57]:
results

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&relatedSite={relatedSite?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - poo',
    'totalResults': '53100000',
    'searchTerms': 'poo',
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'utf8',
    'safe

In [46]:
x = []
x.append(results['items'])

In [56]:
for res in x:
    for r in res:
        print(r['link'])

https://www.poopourri.com/
https://en.wiktionary.org/wiki/poo
https://www.urbandictionary.com/define.php?term=poo
https://www.merriam-webster.com/dictionary/poo
https://www.theguardian.com/science/2019/nov/18/scientists-develop-slippery-toilet-coating-stop-poo-sticking
https://www.youtube.com/watch?v=KlUQ4MJvSEQ
https://www.macfound.org/fellows/924/
https://www.youtube.com/watch?v=8lEMUl0-MZI
https://www.reuters.com/article/us-philippines-poop-bricks/philippine-students-turn-littered-dog-poo-into-bricks-idUSKBN1XU152
https://www.wta.org/go-hiking/hikes/poo-poo-point


In [50]:
for res in x:
    print(res['link'])

TypeError: list indices must be integers or slices, not str

In [None]:
res['queries']['nextPage']

In [None]:
res['items'][0]['link']

In [None]:
def get_urls(query, numResults):
    my_api_key = "AIzaSyB5j2lTK4tVJsPTWmwXJ5ypn1Hojjh-RdU"
    my_cse_id = "011673657838978336298:dqh43ak8nfj"
    
    service = build("customsearch", "v1", developerKey = my_api_key)
    urls = []
    for i in range(0, numResults, 10):
        numres = min(numResults - i, 10)
        results = service.cse().list(q = query, cx = my_cse_id, start = i + 1, num = numres).execute()
        for res in results['items']:
            urls.append(res['link'])
    return urls

In [None]:
def search(query, numResults):
    urls = get_urls(query, numResults)
    for url in urls:
        scrape(url)

In [None]:
import app

In [None]:
pos_tags = ["CC","CD","DT","EX","FW","IN","JJ","JJR","JJS","LS","MD","NN","NNS","NNP","NNPS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SYM","TO","UH","WP$","WRB","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP"]
liwc_cats = ['funct', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'verb', 'auxverb', 'past', 'present', 'future', 'adverb', 'preps', 'conj', 'negate', 'quant', 'number', 'swear', 'social', 'family', 'friend', 'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve', 'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl', 'filler']
pos_tags_titles = [t+"_title" for t in pos_tags]
liwc_cats_title = [t+"_title" for t in liwc_cats]
seq = ("pid,source_code,bias_count,assertives_count,factives_count,hedges_count,implicatives_count,report_verbs_count,positive_op_count,negative_op_count,wneg_count, wpos_count,wneu_count,sneg_count,spos_count,sneu_count,TTR,vad_neg,vad_neu,vad_pos,FKE,SMOG,stop,wordlen,WC,NB_pobj,NB_psubj,quotes,Exclaim,AllPunc,allcaps",",".join(pos_tags),",".join(liwc_cats),"TTR_title,vad_neg_title,vad_neu_title,vad_pos_title,FKE_title,SMOG_title,stop_title,wordlen_title,WC_title,NB_pobj_title,NB_psubj_title,quotes_title,Exclaim_title,AllPunc_title,allcaps_title",",".join(pos_tags_titles),",".join(liwc_cats_title),"bias_count_title,assertives_count_title,factives_count_title,hedges_count_title,implicatives_count_title,report_verbs_count_title,positive_op_count_title,negative_op_count_title,wneg_count_title,wpos_count_title,wneu_count_title,sneg_count_title,spos_count_title,sneu_count_title")
x = ','.join(seq).split(',')

In [None]:
len(x)

In [None]:
import pandas as pd
df = pd.DataFrame(columns = x)
df.head()

In [None]:
import numpy as np
arr = np.ones([1, 260])

In [None]:
df.loc[0] = arr

In [None]:
arr = np.append(arr, arr, axis = 0)
np.shape(arr)

In [None]:
help(np.append)

In [None]:
import numpy as np
x = []
x.append([1, 2, 3])

In [None]:
x.append([4, 5, 6])
np.shape(x)

In [None]:
np.array(x)

In [None]:
np.shape(np.ndarray([1, 2]))

In [None]:
np.shape([[1, 2]])

In [1]:
import pandas as pd
df = pd.read_csv('nelalabelled.csv')
df.head()

Unnamed: 0,Happiness,HarmVirtue,HarmVice,FairnessVirtue,FairnessVice,IngroupVirtue,IngroupVice,AuthorityVirtue,AuthorityVice,PurityVirtue,...,FairnessVirtue_title,FairnessVice_title,IngroupVirtue_title,IngroupVice_title,AuthorityVirtue_title,AuthorityVice_title,PurityVirtue_title,PurityVice_title,MoralityGeneral_title,bias
0,5.266154,0.012987,0.0,0.0,0.0,0.004329,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,4.71,0.004396,0.0,0.0,0.0,0.006593,0.0,0.004396,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,5.347143,0.0,0.0,0.0,0.0,0.003215,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,5.39,0.0,0.00996,0.0,0.0,0.0,0.003984,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,5.425455,0.0,0.0,0.0,0.0,0.003984,0.003984,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [2]:
df.drop([" Happiness"," HarmVirtue"," HarmVice"," FairnessVirtue"," FairnessVice"," IngroupVirtue"," IngroupVice"," AuthorityVirtue"," AuthorityVice"," PurityVirtue"," PurityVice"," MoralityGeneral"," HarmVirtue_title"," HarmVice_title"," FairnessVirtue_title"," FairnessVice_title"," IngroupVirtue_title"," IngroupVice_title"," AuthorityVirtue_title"," AuthorityVice_title"," PurityVirtue_title"," PurityVice_title"," MoralityGeneral_title"], axis = 1, inplace = True)

In [3]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 42)

In [4]:
labels = df['bias']

In [5]:
features = df.drop(['bias'], axis = 1)

In [58]:
features

Unnamed: 0,bias_count,assertives_count,factives_count,hedges_count,implicatives_count,report_verbs_count,positive_op_count,negative_op_count,wneg_count,wpos_count,...,implicatives_count_title,report_verbs_count_title,positive_op_count_title,negative_op_count_title,wneg_count_title,wpos_count_title,wneu_count_title,sneg_count_title,spos_count_title,sneu_count_title
0,0.099567,0.004329,0.008658,0.008658,0.017316,0.008658,0.004329,0.017316,0.004329,0.034632,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.032967,0.006593,0.004396,0.008791,0.002198,0.019780,0.008791,0.021978,0.017582,0.013187,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.061093,0.012862,0.003215,0.016077,0.019293,0.016077,0.019293,0.035370,0.022508,0.022508,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.071713,0.001992,0.001992,0.009960,0.005976,0.013944,0.011952,0.039841,0.023904,0.019920,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.067729,0.007968,0.000000,0.019920,0.003984,0.003984,0.015936,0.019920,0.000000,0.027888,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
5,0.071685,0.014337,0.000000,0.021505,0.000000,0.017921,0.007168,0.064516,0.028674,0.010753,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
6,0.133929,0.002976,0.000000,0.035714,0.000000,0.008929,0.035714,0.056548,0.032738,0.056548,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
7,0.082816,0.012422,0.004141,0.012422,0.006211,0.022774,0.012422,0.031056,0.024845,0.010352,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
8,0.057816,0.000000,0.000000,0.008565,0.000000,0.010707,0.051392,0.004283,0.004283,0.062099,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
9,0.099757,0.004866,0.000000,0.021898,0.002433,0.012165,0.009732,0.031630,0.014599,0.017032,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [6]:
clf.fit(features, labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [7]:
import pickle
filename = 'defaultRF.sav'
pickle.dump(clf, open(filename, 'wb'))

In [8]:
test = features.loc[[4, 100, 200]]

In [9]:
urls = ["https://www.cnn.com/2019/11/22/politics/nunes-vienna-trip-ukrainian-prosecutor-biden/index.html", "https://www.theguardian.com/us-news/2019/nov/23/trump-impeachment-released-documents-reveal-giuliani-pompeo-links", "https://www.bbc.com/news/world-us-canada-39945744"]

In [10]:
preds = clf.predict_proba(test)[:, 1]

In [16]:
import numpy as np
ix = np.argsort(preds)
print(preds, ix)

[0.  0.8 0.1] [0 2 1]


In [17]:
preds_sorted = preds[ix]
preds_sorted

array([0. , 0.1, 0.8])

In [None]:
preds

In [13]:
np.argsort([1, 2, 3])

array([0, 1, 2], dtype=int64)

In [15]:
preds * 10

array([0., 8., 1.])

In [18]:
urls[ix]

TypeError: only integer scalar arrays can be converted to a scalar index

In [24]:
np.array(urls)[::-1]

array(['https://www.bbc.com/news/world-us-canada-39945744',
       'https://www.theguardian.com/us-news/2019/nov/23/trump-impeachment-released-documents-reveal-giuliani-pompeo-links',
       'https://www.cnn.com/2019/11/22/politics/nunes-vienna-trip-ukrainian-prosecutor-biden/index.html'],
      dtype='<U113')

In [102]:
x = [1, 2]
y = [4, 5]

In [103]:
x = np.array(x)
y = np.array(y)

In [104]:
x = np.c_[x, y]
x

array([[1, 4],
       [2, 5]])

In [88]:
help(np.c_)

Help on CClass in module numpy.lib.index_tricks object:

class CClass(AxisConcatenator)
 |  Translates slice objects to concatenation along the second axis.
 |  
 |  This is short-hand for ``np.r_['-1,2,0', index expression]``, which is
 |  useful because of its common occurrence. In particular, arrays will be
 |  stacked along their last axis after being upgraded to at least 2-D with
 |  1's post-pended to the shape (column vectors made out of 1-D arrays).
 |  
 |  See Also
 |  --------
 |  column_stack : Stack 1-D arrays as columns into a 2-D array.
 |  r_ : For more detailed documentation.
 |  
 |  Examples
 |  --------
 |  >>> np.c_[np.array([1,2,3]), np.array([4,5,6])]
 |  array([[1, 4],
 |         [2, 5],
 |         [3, 6]])
 |  >>> np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]
 |  array([[1, 2, 3, 0, 0, 4, 5, 6]])
 |  
 |  Method resolution order:
 |      CClass
 |      AxisConcatenator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |    

In [66]:
for a in x:
    print(a)

[1, 2]
[4, 5]


In [67]:
x = [1, 2]
x = [x]
x

[[1, 2]]