In [15]:
import pandas as pd
import re
import numpy as np
import json

## Define content (import from scrapy script):

In [16]:
# Read data from file:
# Use a context manager so the file handle is closed deterministically, and
# decode explicitly as UTF-8 (the JSON default) so non-ASCII page text loads
# the same way regardless of the platform's locale encoding.
with open("pages_content.json", encoding="utf-8") as pages_file:
    data = json.load(pages_file)

## Define Functions:

In [2]:
#Prepare content 

#Gets a list of just the content from (urls against content)
def strip_url_from_content(content):
    """Return the page texts of a ``{url: text}`` mapping as a list.

    Parameters
    ----------
    content : dict
        Mapping of URL to page text.

    Returns
    -------
    list
        The mapping's values in insertion order; ``[]`` for an empty mapping.
    """
    # Reading the values directly avoids a DataFrame round trip, which raised
    # KeyError on an empty mapping because column 1 did not exist.
    return list(content.values())

def url_against_page_no(content):
    """Build a lookup table from 1-based page number to URL.

    Parameters
    ----------
    content : dict
        Mapping of URL to page text. Page numbers follow insertion order.

    Returns
    -------
    pandas.DataFrame
        Columns ``page_number`` (int64, starting at 1) and ``url``. An empty
        mapping yields an empty frame with the same two columns.
    """
    urls = list(content.keys())
    lookup = pd.DataFrame({
        'page_number': list(range(1, len(urls) + 1)),
        'url': urls,
    })
    # Pin the dtype so the column stays integer even when there are no rows.
    return lookup.astype({'page_number': 'int64'})
    

In [3]:
#Splitting out content:
#Split content into each page (all content per page):

def split_by_whole_page(content):
    """Return one row per page: the full page text and its 1-based page number.

    Parameters
    ----------
    content : dict
        Mapping of URL to page text. Page numbers follow insertion order.

    Returns
    -------
    pandas.DataFrame
        Columns ``phrase`` (the whole page text) and ``page_number``. An empty
        mapping yields an empty frame with the same two columns.
    """
    pages = list(content.values())
    # Build the frame column-wise. Stacking the two lists as rows and
    # transposing forces both columns to object dtype and fails on empty input
    # because the transposed frame has no columns to rename.
    return pd.DataFrame({
        'phrase': pages,
        'page_number': list(range(1, len(pages) + 1)),
    })
                         

#Split content into sentences per page:
def split_by_sentence(content):
    """Return one row per sentence, tagged with the 1-based page it came from.

    Each page text is split on the literal separator ``". "``, so the final
    sentence of a page keeps any trailing period.

    Parameters
    ----------
    content : dict
        Mapping of URL to page text. Page numbers follow insertion order.

    Returns
    -------
    pandas.DataFrame
        Columns ``phrase`` and ``page_number``.
    """
    rows = [
        (sentence, page_no)
        for page_no, (_url, page_text) in enumerate(content.items(), start=1)
        for sentence in page_text.split(". ")
    ]
    return pd.DataFrame(rows, columns=['phrase', 'page_number'])

#Split content into sentences per page (comprehension variant):
def split_by_sentence_1(content):
    """Return one row per sentence, tagged with the 1-based page it came from.

    Accepts either a ``{url: text}`` dict or a plain sequence of page texts.
    Each page text is split on the literal separator ``". "``, so the final
    sentence of a page keeps any trailing period.

    Parameters
    ----------
    content : dict or iterable of str
        Page texts, either as the values of a URL mapping or directly.

    Returns
    -------
    pandas.DataFrame
        Columns ``phrase`` and ``page_number``.
    """
    # Iterating a dict yields its keys (the URLs), so take the values explicitly.
    pages = content.values() if isinstance(content, dict) else content
    all_phrases = [
        (phrase, page_num)
        # enumerate yields (index, item): the page number comes first.
        for page_num, page_text in enumerate(pages, start=1)
        for phrase in page_text.split(". ")
    ]
    df_phrases = pd.DataFrame(all_phrases, columns=['phrase', 'page_number'])
    return df_phrases



In [4]:
#Define classification (in, not in, exclusion):

#Terms present:
def find_terms_in(terms_list, df_o, terms_list_name):
    """Add a column listing which terms occur in each row's phrase.

    Matching is a case-sensitive substring test (``term in phrase``).

    Parameters
    ----------
    terms_list : list of str
        Terms to look for.
    df_o : pandas.DataFrame
        Frame with a ``phrase`` column of strings. It is not modified.
    terms_list_name : str
        Name of the column to add.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_o`` with ``terms_list_name`` holding the matching terms
        joined by ``', '`` in ``terms_list`` order, or ``''`` when none match.
    """
    df = df_o.copy()
    # Join only the matching terms. This avoids a textual placeholder that
    # would have to be stripped back out and could collide with a real term.
    matches = [
        ', '.join(term for term in terms_list if term in phrase)
        for phrase in df['phrase'].tolist()
    ]
    # Assign a plain list so values are placed positionally rather than
    # aligned on index labels (df_o may carry a non-default index).
    df[terms_list_name] = matches
    return df

#Find terms not present:
def find_terms_notin(terms_list, df_o, terms_list_name):
    """Add a column listing which terms are absent from each row's phrase.

    Matching is a case-sensitive substring test (``term not in phrase``).

    Parameters
    ----------
    terms_list : list of str
        Terms to check for.
    df_o : pandas.DataFrame
        Frame with a ``phrase`` column of strings. It is not modified.
    terms_list_name : str
        Name of the column to add.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_o`` with ``terms_list_name`` holding the absent terms
        joined by ``', '`` in ``terms_list`` order, or ``''`` when every term
        is present.
    """
    df = df_o.copy()
    # Join only the absent terms. This avoids a textual placeholder that
    # would have to be stripped back out and could collide with a real term.
    missing = [
        ', '.join(term for term in terms_list if term not in phrase)
        for phrase in df['phrase'].tolist()
    ]
    # Assign a plain list so values are placed positionally rather than
    # aligned on index labels (df_o may carry a non-default index).
    df[terms_list_name] = missing
    return df

In [None]:
# TODO: for in-page matches, find the sentences surrounding the trigger word(s).



## Start defining content, terms lists, and rules here:

In [34]:
#Define mock content
# Each fixture maps a URL to that page's text. The split_* helpers number pages
# from 1 in insertion order, which is what the "#Page N" notes refer to.

content_1 = {
    'url1': "banana ice cream sale.", #Page 1
    'url2': "chocolate ganache", #Page 2
    'url3': "While strawberry is out of stock, try raspberry instead because you might just love it. This is a disclaimer.", #Page 3
    'url4': "Cake range #1. Come try our desserts. You're guaranteed to love them. This is a disclaimer.", #Page 4
    'url5': "Cake range #2. Come try our best value desserts. You're guaranteed to love them.", #Page 5
    }


# Single-page fixture containing none of the words in the term lists.
content_2 = {
    'url1': "banana ice cream", #Page 1
    }

# Chinese-language sample pages.
# NOTE: this is a plain list of page texts, not a {url: text} dict like
# content_1 / content_2, so helpers that call content.items() cannot take it as-is.
chinese_content = ['热门论文浏览量', #Page 1
'中国特色社会主义制度优势和国家治理任务——由抗击"新冠肺炎"重大疫情引起的思考与建言13304', #Page 2
'重大疫情治理中的中国制度优势11613', #Page 3
'从工程实际出发,结合鱼雷作战效能的动态性,综合考虑发射装置对鱼雷作战效能的影响,在鱼雷作战效能模型中引入发射装置的影响因素,同时对传统鱼雷寿命剖面进行了重新划分,首次把鱼雷在发射平台的装载阶段归入鱼雷的任务剖面,充分考虑鱼雷在整个寿命剖面内储存可靠性,装载可靠性和实航可靠性的时序性,则实航可靠性与储存可靠性,装载可靠性存在一定的关系,应用传统WSEIAC效能模型对鱼雷作战效能进行分析计算.我们已与文献出版商建立了直接购买合作。', #Page 4
'接购买合作' #Page 5
              ]



In [6]:
#Define Terms Lists
# find_terms_in / find_terms_notin test each term with a case-sensitive
# substring check (`term in phrase`), so 'try' would also match inside 'country'.

term_list_1 = ['offer', 'sale', 'try'] #Offer language
term_list_2 = ['guaranteed', 'best value', 'love'] #High-risk words
term_list_3 = ['might'] #Exclusion
term_list_4 = ['This is a disclaimer'] #Disclaimer
term_list_5 = ['同', '建立', '与', '接购买合作', '门'] #Chinese characters (not referenced by the rules defined in this section)

In [46]:
#Define rules


#Rule 1 finds high-risk phrases on a whole page with terms lists 1(in), 2(in), 3(exclusions)
def rule_1(content):
    """Annotate each whole page with offer terms, high-risk terms and absent exclusions.

    Adds the columns ``terms_list_1_in``, ``terms_list_2_in`` and
    ``terms_list_3_excluded`` (exclusion terms that do NOT appear on the page),
    plus a ``rule`` column holding this rule's display name.
    """
    pages = split_by_whole_page(content)
    with_offers = find_terms_in(term_list_1, pages, terms_list_name='terms_list_1_in')
    with_risk = find_terms_in(term_list_2, with_offers, terms_list_name='terms_list_2_in')
    annotated = find_terms_notin(term_list_3, with_risk, terms_list_name='terms_list_3_excluded')
    annotated['rule'] = rule_names[rule_1.__name__]
    return annotated

#Rule 2 finds high-risk phrases in a sentence on a page.
def rule_2(content):
    """Annotate each sentence with the high-risk terms it contains.

    Adds the column ``high_risk_words`` plus a ``rule`` column holding this
    rule's display name.
    """
    sentences = split_by_sentence(content)
    annotated = find_terms_in(term_list_2, sentences, terms_list_name='high_risk_words')
    annotated['rule'] = rule_names[rule_2.__name__]
    return annotated

#Rule 3 finds a missing disclaimer for an offer.
def rule_3(content):
    """Annotate each whole page with offer terms and any disclaimer text it lacks.

    Adds the columns ``terms_list_1_in`` and ``missing_disclaimer`` (disclaimer
    terms that do NOT appear on the page), plus a ``rule`` column holding this
    rule's display name.
    """
    pages = split_by_whole_page(content)
    with_offers = find_terms_in(term_list_1, pages, terms_list_name='terms_list_1_in')
    annotated = find_terms_notin(term_list_4, with_offers, terms_list_name='missing_disclaimer')
    annotated['rule'] = rule_names[rule_3.__name__]
    return annotated

# Display label for each rule, keyed by the rule function's __name__.
# The rule functions look themselves up here at call time, so this table only
# has to exist before a rule is first run, not before the rules are defined.
rule_names = dict([
    (rule_1.__name__, 'Rule 1: High-risk promotions on page'),
    (rule_2.__name__, 'Rule 2: High-risk words in sentences'),
    (rule_3.__name__, 'Rule 3: Missing disclaimers'),
])



In [49]:
#Return only the triggered rows for one rule and one content set:
def find_triggered_items(rule, content):
    """Run ``rule`` on ``content`` and keep only the rows where it triggered.

    A row counts as triggered when every column after the first two
    (``phrase`` and ``page_number``) is a non-empty string.

    Parameters
    ----------
    rule : callable
        One of the rule functions. It takes ``content`` and returns an
        annotated DataFrame whose first two columns are ``phrase`` and
        ``page_number``.
    content : dict
        Mapping of URL to page text.

    Returns
    -------
    pandas.DataFrame or None
        The triggered rows with ``page_number`` replaced by the page's
        ``url``. When nothing triggers, a message is printed and ``None`` is
        returned.
    """
    triggered = rule(content)
    # Every column past phrase/page_number must be non-empty for a row to stay.
    for col in triggered.columns[2:]:
        triggered = triggered[triggered[col] != ""]
    if triggered.empty:
        print('No phrases triggered for ' + rule_names[rule.__name__] + '.')
        return None
    # Swap the internal page number for the URL it stands for.
    page_urls = url_against_page_no(content)
    triggered = pd.merge(triggered, page_urls, on='page_number', how='left')
    return triggered.drop(['page_number'], axis=1)
   
#Run multiple rules against some content


In [35]:
#Test 1
# Combine page-level hits (rule 1) with sentence-level hits (rule 2) for content_1.
# The exclusion column only drives rule 1's filtering, so it is dropped before display.

risky_promotions_1 = (
    find_triggered_items(rule_1, content_1)
    .drop(columns='terms_list_3_excluded')
)
risky_promotions_2 = find_triggered_items(rule_2, content_1)
risky_promotions = pd.concat([risky_promotions_1, risky_promotions_2])
risky_promotions

Unnamed: 0,phrase,terms_list_1_in,terms_list_2_in,rule,url,high_risk_words
0,Cake range #1. Come try our desserts. You're g...,try,"guaranteed, love",Rule 1: High-risk promotions on page,url4,
1,Cake range #2. Come try our best value dessert...,try,"guaranteed, best value, love",Rule 1: High-risk promotions on page,url5,
0,"While strawberry is out of stock, try raspberr...",,,Rule 2: High-risk words in sentences,url3,love
1,You're guaranteed to love them,,,Rule 2: High-risk words in sentences,url4,"guaranteed, love"
2,Come try our best value desserts,,,Rule 2: High-risk words in sentences,url5,best value
3,You're guaranteed to love them.,,,Rule 2: High-risk words in sentences,url5,"guaranteed, love"


In [36]:
#Test 2
# Sentence-level scan of content_1 for the high-risk words in term_list_2 (rule 2).

all_high_risk_phrases = find_triggered_items(rule_2, content_1)
all_high_risk_phrases

Unnamed: 0,phrase,high_risk_words,rule,url
0,"While strawberry is out of stock, try raspberr...",love,Rule 2: High-risk words in sentences,url3
1,You're guaranteed to love them,"guaranteed, love",Rule 2: High-risk words in sentences,url4
2,Come try our best value desserts,best value,Rule 2: High-risk words in sentences,url5
3,You're guaranteed to love them.,"guaranteed, love",Rule 2: High-risk words in sentences,url5


In [52]:
#Test 3
# Pages of content_1 that contain offer language (term_list_1) but lack the
# disclaimer text in term_list_4 (rule 3).

missing_disclaimers = find_triggered_items(rule_3, content_1)
missing_disclaimers

Unnamed: 0,phrase,terms_list_1_in,missing_disclaimer,rule,url
0,banana ice cream sale.,sale,This is a disclaimer,Rule 3: Missing disclaimers,url1
1,Cake range #2. Come try our best value dessert...,try,This is a disclaimer,Rule 3: Missing disclaimers,url5


In [50]:
#Test 4
# content_2 contains no high-risk words, so this exercises the "nothing
# triggered" path: a message is printed and the call returns None.

find_triggered_items(rule_2, content_2)


No phrases triggered for Rule 2: High-risk words in sentences.


In [38]:
# TODO: add proximity matching (terms near each other within a page or within a sentence).
# TODO: run multiple rules against one content set in a single call.