In [1]:
import csv

In [5]:
from sklearn.model_selection import train_test_split

In [2]:
import torch

# verify if GPU is available
if torch.cuda.is_available(): 
    
    device = torch.device("cuda")    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [3]:
# initialize variables
import pandas as pd
import mysql.connector
import os
from dotenv import load_dotenv

load_dotenv(override=True)

# mysql credentials
PASSWORD = os.getenv("PASSWORD")
USER = os.getenv("USER")


load_dotenv(override=True)


True

In [4]:
# connect to db
import mysql.connector

mydb = mysql.connector.connect(
  host="127.0.0.1",
  user=USER,
  password=PASSWORD,
  database="mpp21"
)

mycursor = mydb.cursor()

In [9]:
article_ids = {}

with open('./privacy_filter/labels.csv', 'r') as f:
        reader = csv.reader(f)
        for r in reader:
            article_ids[r[0]] = r[1]

In [10]:
# TEST IF ONLY ABSTRACT
# db methods
def select_article(curs, db, aid):
    
    curs.execute("""SELECT title, content 
                FROM articles 
                WHERE article_id = '""" + aid + """' 
                """)
                #LIMIT 200""") 
    
    arts = {}
    result = curs.fetchall()
    for r in result:
        arts[aid] = {
            "title": r[0],
            "content": r[1],
            "verdict": article_ids[aid]
        }
    
    return arts[aid]

In [38]:
contents = {}

for a in article_ids.keys():
    temp = select_article(mycursor, mydb, a)
    
    contents[a] = {
        "id": a,
        "label": int(temp["verdict"]),
        "text": temp["title"] + " " + temp["content"]
    }


In [39]:
df = pd.DataFrame.from_dict(contents).T

In [40]:
df.sample(10)

Unnamed: 0,id,label,text
TS_1909,TS_1909,0,Erotic novels growing ebook sales If you find ...
DT_1508,DT_1508,0,"For sale at £300m, Britain's priciest house WI..."
GM_1513,GM_1513,0,City home at the hub of the community;Aaron Le...
TS_1940,TS_1940,0,Glee's Cory Monteith found dead;Foul play unli...
USA_1583,USA_1583,1,Apple: 'Founders would be appalled';Tech giant...
SMH_1931,SMH_1931,0,Punter wants return of 'proceeds of crime' WIT...
NYT_1441,NYT_1441,1,WikiLeaks tells of aid to leaker in U.S. inqui...
AFR_1702,AFR_1702,0,'I didn't want to be a battery hen';Co-working...
DT_1893,DT_1893,1,Gmail users 'can't expect privacy' GOOGLE is o...
SMH_1881,SMH_1881,0,One-bedder is $750 a night but star rated Who ...


In [41]:
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

Number of training sentences: 571



In [5]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [6]:
from keras.preprocessing.sequence import pad_sequences


In [44]:
def prepare_data(data): # a pandas frame
    
    sentences = data.text.values
    labels = data.label.values
    
    input_ids = []

    for sent in sentences:
        encoded_sent = tokenizer.encode(
                            sent,                      
                            add_special_tokens = True, 
                            max_length = 512,
                       )


        input_ids.append(encoded_sent)

    #print("Sample encoding:")
    #print('Original: ', sentences[0])
    #print('Token IDs:', input_ids[0])

    #print()
    #print('Max sentence length: ', max([len(sen) for sen in input_ids]))
    
    
    MAX_LEN = 512

    #print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
    #print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                              value=0, truncating="post", padding="post")
    #print('Done.')
    
    attention_masks = []

    for sent in input_ids:

        att_mask = [int(token_id > 0) for token_id in sent]

        attention_masks.append(att_mask)
    
    return input_ids, labels, attention_masks

In [45]:
# split into train and test sets (0.8 training, 0.2 test)
def split_data(inputs, labels, masks, split):

    train_data, test_data, train_labels, test_labels = train_test_split(inputs, labels, 
                                                                random_state=2018, test_size=split)
    # split masks
    train_masks, test_masks, _, _ = train_test_split(masks, labels,
                                                 random_state=2018, test_size=split)
    
    return train_data, test_data, train_labels, test_labels, train_masks, test_masks

In [46]:
input_ids, labels, attention_masks = prepare_data(df)

In [47]:
# split data into train and test
train_inputs, test_inputs, train_labels, test_labels, train_masks, test_masks = split_data(input_ids, labels, attention_masks, 0.2)

# split train into train and validation
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = split_data(train_inputs, train_labels, train_masks, 0.1)


In [48]:
# cast labels
import numpy as np

x, y, z = [], [], []

[x.append(i) for i in train_labels]
[y.append(i) for i in validation_labels]
[z.append(i) for i in test_labels]

train_labels = np.array(x)
validation_labels = np.array(y)
test_labels = np.array(z)

In [49]:
# convert inputs and labels into torch tensors
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_masks)

In [50]:
print("train", len(train_inputs))
print("validation", len(validation_inputs))
print("test", len(test_inputs))

train 410
validation 46
test 115


In [93]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16 # 32

# DataLoader for training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# DataLoader for validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# DataLoader for the test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [94]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# load BertForSequenceClassification (pre-trained BERT model with a single linear classif. layer)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # 12-layer BERT model, uncased vocab (TODO: uncase articles)
    # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # number of output labels
                        # 2: binary classification
                        # >2: multi-class   
    output_attentions = False, # return attention weights
    output_hidden_states = False, # return hidden states
)

# run on GPU (uncomment if GPU available on server)
# model.cuda()

In [2]:
# get model's parameters
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

NameError: name 'model' is not defined

In [3]:
# AdamW 
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5,
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup

# training epochs: recommended 2-4
epochs = 10 #test  10 # 

# training steps = batches * epochs
total_steps = len(train_dataloader) * epochs

# create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            # default value in run_glue.py
                                            num_training_steps = total_steps)

NameError: name 'AdamW' is not defined

In [97]:
# compute accuracy of predictions
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [98]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
   
    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [99]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# store average loss after each epoch
loss_values = []


models = {}
stats = {}


for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # full pass over the training set    
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')    
    t0 = time.time() # measure epoch length
    total_loss = 0 # reset total loss for current epoch
    
    # place model in training mode
    model.train()    

    # for each batch
    for step, batch in enumerate(train_dataloader):        
        #print("Step: ", step)
        # updated on progress every 40 batches
        if step % 10 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0) # compute elapsed time
            
            # report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))        
            
        # unpack training batch from DataLoader
        # copy each tensor to GPU ('to' method)
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)        
        
        # clear previously calculated gradients before a backward pass
        model.zero_grad()                
        
        # forward pass (i.e., evaluate the model on this training batch)
        # this returns the loss (and not the model output), because we provided the `labels`

        # documentation for model function:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # get loss value
        loss = outputs[0]        
        
        # accumulate training loss over all batches (to compute average loss at the end)
        total_loss += loss.item()     
        
        # backward pass to calculate gradients
        loss.backward()        
        
        # clip the norm of the gradients to 1.0 to prevent the 'exploding gradients' problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)        
        
        # update parameters (the optimizer dictates the update rule based on gradients, learning rate, etc.)
        # take a step using the computed gradient
        optimizer.step() 
        
        # update the learning rate.
        scheduler.step()    
    
    # calculate average loss over training data
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # store the loss value to plot the learning curve
    loss_values.append(avg_train_loss)    
    
    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    

    # after each training epoch, measure performance on validation set
    print()
    print("Running Validation...")    
    t0 = time.time()   
    
    ### STORE MODEL
    models[epoch_i] = model
    fn = "models/model_e" + str(epoch_i) + ".sav"
    pickle.dump(model, open(fn, 'wb'))
    
    # place model in evaluation mode (dropout layers behave differently than during training)
    model.eval()    
    
    # tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0    
    
    # evaluate data for one epoch
    for batch in validation_dataloader:
        
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # unpack inputs from DataLoader
        b_input_ids, b_input_mask, b_labels = batch
        
        
        # do not compute or store gradients to save memory and speedup validation
        with torch.no_grad():                    
            # forward pass, calculate logit predictions
                # will return logits (not the loss, because we have not provided labels)
            # token_type_ids = segment ids (differentiates between sentence 1 and 2 in 2-sentence tasks)
            
            
            # documentation for model
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # get logits (output) from model
        # values prior to applying an activation function (e.g., softmax)
        logits = outputs[0]    
        
        # move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # compute accuracy for current batch 
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # accumulate accuracy
        eval_accuracy += tmp_eval_accuracy        
        # track batches
        nb_eval_steps += 1    
        # report final accuracy for current validation run
    print("  Accuracy: {0:.5f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    
    stats[epoch_i] = {
        "acc": round(eval_accuracy/nb_eval_steps, 5),
        "avg-loss": round(avg_train_loss, 5)
    }
    print("")
print("Training complete!")


Training...
  Batch    10  of     26.    Elapsed: 0:03:01.
  Batch    20  of     26.    Elapsed: 0:06:07.

  Average training loss: 0.58
  Training epoch took: 0:07:51

Running Validation...
  Accuracy: 0.74405
  Validation took: 0:00:14


Training...
  Batch    10  of     26.    Elapsed: 0:03:06.
  Batch    20  of     26.    Elapsed: 0:06:11.

  Average training loss: 0.38
  Training epoch took: 0:07:56

Running Validation...
  Accuracy: 0.78571
  Validation took: 0:00:14


Training...
  Batch    10  of     26.    Elapsed: 0:03:03.
  Batch    20  of     26.    Elapsed: 0:06:09.

  Average training loss: 0.24
  Training epoch took: 0:07:54

Running Validation...
  Accuracy: 0.80655
  Validation took: 0:00:14


Training...
  Batch    10  of     26.    Elapsed: 0:03:04.
  Batch    20  of     26.    Elapsed: 0:06:09.

  Average training loss: 0.15
  Training epoch took: 0:07:53

Running Validation...
  Accuracy: 0.83036
  Validation took: 0:00:15


Training...
  Batch    10  of     26.  

In [100]:
stats

{0: {'acc': 0.74405, 'avg-loss': 0.57587},
 1: {'acc': 0.78571, 'avg-loss': 0.37848},
 2: {'acc': 0.80655, 'avg-loss': 0.24405},
 3: {'acc': 0.83036, 'avg-loss': 0.14597},
 4: {'acc': 0.80952, 'avg-loss': 0.09608},
 5: {'acc': 0.80952, 'avg-loss': 0.05703},
 6: {'acc': 0.80952, 'avg-loss': 0.04228},
 7: {'acc': 0.80952, 'avg-loss': 0.03855},
 8: {'acc': 0.78571, 'avg-loss': 0.03685},
 9: {'acc': 0.80952, 'avg-loss': 0.03845}}

In [92]:
# import pickle

# # save models to disk

# for m in models.keys():
#     fn = "models/model_e" + str(m) + ".sav"
#     pickle.dump(models[m], open(fn, 'wb'))
 

In [103]:
#stats

def pick_best_model(scores):
    
    max_acc = 0
    m = -1
    
    # for all scores
    for s in scores.keys():
        if scores[s]["acc"] > max_acc:
            max_acc = scores[s]["acc"]
            m = s
    return m

In [7]:
import pickle
def load_model(m):
    return pickle.load(open("models/model_e" + str(m) + ".sav", 'rb'))

In [8]:
model = load_model(9)#pick_best_model(stats))

In [122]:
# prediction on test set
print('Predicting labels for {:,} test documents...'.format(len(test_inputs)))

# place model in evaluation mode
model.eval()

# tracking variables 
predictions, true_labels = [], []

# predict 
for batch in test_dataloader:
    # add batch to GPU (if available)
    batch = tuple(t.to(device) for t in batch)
  
    # unpack inputs from DataLoader
    b_input_ids, b_input_mask, b_labels = batch
  
    with torch.no_grad():
        # forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask) 
        
    logits = outputs[0]  
    
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    
print('\nDONE.')

Predicting labels for 115 test documents...

DONE.


In [123]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []

# evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

# for each batch
for i in range(len(true_labels)):
    # predictions are a 2-column ndarray (one for 0, one for 1)
    # pick label with highest value and turn it into 1 or 0

    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
    # calculate and store coef for current batch
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

Calculating Matthews Corr. Coef. for each batch...


In [124]:
# combine the predictions for each batch into a single list of 0s and 1s

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# combine the correct labels for each batch into a single list
flat_true_labels = [item for sublist in true_labels for item in sublist]

# calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

MCC: 0.836


In [125]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [126]:
def get_conf_matrix(x, y):
    return confusion_matrix(np.array(x), y).ravel()

def get_recall(x, y):
    return recall_score(x, y, average='macro')
    
def get_precision(x, y):
    return precision_score(x, y, average='macro')

def get_f1_score(x, y):
    return f1_score(x, y, average='macro')

In [130]:
x_data = flat_true_labels
y_data = flat_predictions

tn, fp, fn, tp = get_conf_matrix(x_data, y_data)#confusion_matrix(np.array(flat_true_labels), flat_predictions).ravel()
print(
    "\nTN:", tn,
    "\nTP:", tp,
    "\nFN:", fn,
    "\nFP:", fp    
)

recall = round(get_recall(x_data, y_data), 3)
precision = round(get_precision(x_data, y_data), 3)
f1 = round(get_f1_score(x_data, y_data), 3)

print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)


TN: 62 
TP: 43 
FN: 0 
FP: 10
Recall: 0.931
Precision: 0.906
F1-score: 0.911


In [128]:
# save final mode; to disk

# for m in models.keys():
#     fn = "model_e" + str(m) + ".sav"
#     pickle.dump(models[m], open(fn, 'wb'))

# PREDICT

In [9]:
def select_articles_for_prediction(curs, db, n):
    
    curs.execute("""SELECT article_id, title, content 
                FROM articles 
                WHERE newspaper = '""" + n + """' 
                AND year(DATE) != '2010'
                AND is_privacy != 'duplicate'
                """)
                #LIMIT 200""") 
    
    arts = {}
    result = curs.fetchall()
    for r in result:
        arts[r[0]] = {
            "title": r[1],
            "content": r[2]
        }
    
    return arts

In [10]:
# function to get privacy label
def is_privacy(d):
    
    # tokenize
    encoded_sent = tokenizer.encode(d, add_special_tokens = True, max_length = 512)
    
    # pad
    input_id = pad_sequences([encoded_sent], maxlen=512, 
                          dtype="long", truncating="post", padding="post")
    #print(input_id)
    
    # create attention mask
    seq_mask = [float(i>0) for i in input_id[0]]

    # convert to tensors
    prediction_input = torch.tensor(input_id)
    prediction_mask = torch.tensor([seq_mask])
    output = MODEL(prediction_input, token_type_ids=None, 
                      attention_mask=prediction_mask) 
        
    logits = output[0]  
    
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    
    # store predictions and true labels
    verdict = np.argmax(logits[0])

    return int(verdict)


In [11]:
# update record in table as duplicate

def update_is_privacy(curs, db, dup, aID):

    # compile query
    insertQuery = "UPDATE articles SET is_privacy = (%s) WHERE article_id = (%s)"
    insertValues = (dup, aID)
    
    curs.execute(insertQuery, insertValues)
    
    db.commit() # commit query


In [12]:
MODEL = model

In [13]:
articles_to_predict = select_articles_for_prediction(mycursor, mydb, "TG")

In [14]:
contents_to_predict = {}

for a in articles_to_predict.keys():
    
    contents_to_predict[a] = {
        "id": a,
        "text": articles_to_predict[a]["title"] + " " + articles_to_predict[a]["content"]
    }


In [15]:
len(contents_to_predict)

12584

In [16]:
import torch
import numpy as np

In [17]:
priv = 0
non_priv = 0
verdicts = {}
ctr = 0


for i in contents_to_predict.keys():
    v = is_privacy(contents_to_predict[i]["text"])
    verdicts[i] = v
    
    ctr += 1
    if ctr % 300 == 0:
        print("Predicted:", ctr, "Left:", len(contents_to_predict) - ctr)
    if v == 1:
        priv += 1
    elif v == 0:
        non_priv += 1

Predicted: 300 Left: 12284
Predicted: 600 Left: 11984
Predicted: 900 Left: 11684
Predicted: 1200 Left: 11384
Predicted: 1500 Left: 11084
Predicted: 1800 Left: 10784
Predicted: 2100 Left: 10484
Predicted: 2400 Left: 10184
Predicted: 2700 Left: 9884
Predicted: 3000 Left: 9584
Predicted: 3300 Left: 9284
Predicted: 3600 Left: 8984
Predicted: 3900 Left: 8684
Predicted: 4200 Left: 8384
Predicted: 4500 Left: 8084
Predicted: 4800 Left: 7784
Predicted: 5100 Left: 7484
Predicted: 5400 Left: 7184
Predicted: 5700 Left: 6884
Predicted: 6000 Left: 6584
Predicted: 6300 Left: 6284
Predicted: 6600 Left: 5984
Predicted: 6900 Left: 5684
Predicted: 7200 Left: 5384
Predicted: 7500 Left: 5084
Predicted: 7800 Left: 4784
Predicted: 8100 Left: 4484
Predicted: 8400 Left: 4184
Predicted: 8700 Left: 3884
Predicted: 9000 Left: 3584
Predicted: 9300 Left: 3284
Predicted: 9600 Left: 2984
Predicted: 9900 Left: 2684
Predicted: 10200 Left: 2384
Predicted: 10500 Left: 2084
Predicted: 10800 Left: 1784
Predicted: 11100 Lef

In [18]:
print("Privacy:", priv)
print(round(priv * 100 / len(contents_to_predict.keys()), 2))

print("Non-privacy:", non_priv)
print(round(non_priv * 100 / len(contents_to_predict.keys()), 2))

Privacy: 7435
59.08
Non-privacy: 5149
40.92


In [19]:
for a in verdicts.keys():
    if verdicts[a] == 1:
        #print("update to privacy")
        update_is_privacy(mycursor, mydb, "privacy", a)
        
    elif verdicts[a] == 0:
        #print("update to non-privacy")
        update_is_privacy(mycursor, mydb, "non-priv", a)


In [64]:
# # db methods
# def select_if_dup(curs, db, a):
    
#     curs.execute("SELECT is_privacy FROM articles WHERE article_id = '" + a + "'") 
    
#     arts = {}
#     result = curs.fetchall()
#     for r in result:
#         arts[a] = {
#             "is_privacy": r[0]
#         }
    
#     return arts

In [65]:
# verdicts = {}
# for i in ids:
#     temp = select_if_dup(mycursor, mydb, i)
#     verdicts[i] = temp[i]["is_privacy"]

In [66]:
# for i in verdicts.keys():
#     if verdicts[i] == "duplicate":
#         print(i)

In [67]:
# # update record in table as duplicate

# def update_tone(curs, db, dup, aID):

#     # compile query
#     insertQuery = "UPDATE articles SET is_privacy = (%s) WHERE article_id = (%s)"
#     insertValues = (dup, aID)
    
#     curs.execute(insertQuery, insertValues)
    
#     db.commit() # commit query


In [68]:
# update_tone(mycursor, mydb, "duplicate", "SMH_2629")

In [17]:
!pgrep rsession

2362
3089
4501
10445
10477
10509
10542
10574
10606
10638
10670
10705
10737
10770
10806
10838
10873
10906
10938
10971
11003
11038


In [18]:
print("blah")

blah


In [21]:
!top

[?1h=[H[2J[mtop - 19:34:10 up 305 days, 20:18, 11 users,  load average: 14.25, 53.57, 87.21[m[m[m[m[K
Tasks:[m[m[1m 704 [m[mtotal,[m[m[1m   3 [m[mrunning,[m[m[1m 504 [m[msleeping,[m[m[1m   0 [m[mstopped,[m[m[1m   1 [m[mzombie[m[m[m[m[K
%Cpu(s):[m[m[1m  8.2 [m[mus,[m[m[1m  1.0 [m[msy,[m[m[1m  0.0 [m[mni,[m[m[1m 90.7 [m[mid,[m[m[1m  0.0 [m[mwa,[m[m[1m  0.0 [m[mhi,[m[m[1m  0.1 [m[msi,[m[m[1m  0.0 [m[mst[m[m[m[m[K
KiB Mem :[m[m[1m 19799081+[m[mtotal,[m[m[1m 79221968 [m[mfree,[m[m[1m 79987840 [m[mused,[m[m[1m 38780996 [m[mbuff/cache[m[m[m[m[K
KiB Swap:[m[m[1m 31250428 [m[mtotal,[m[m[1m  3598432 [m[mfree,[m[m[1m 27651996 [m[mused.[m[m[1m 11557044+[m[mavail Mem [m[m[m[m[K
[K
[7m  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND     [m[m[K
[m 3087 root      20   0 28.999g 0.021t  22908 S  1016 11.6  77:06.72 python3     [m[m[K
