In [3]:
import pandas as pd
import string
import csv
import re
import gensim
from langdetect import detect
from gensim.utils import tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

input_file = '../../data/urls_description.csv'
write_file = '../../data/urls_keywords.csv'
input_file = '/acornML/resources/files/web_crawler/urls_description.csv'
write_file = '/acornML/resources/files/web_crawler/urls_keywords.csv'

weird_char = '\xfe'


def process_text(text):
    words = tokenize(text, lower=True)
    return ' '.join(list(filter(lambda word: word not in string.punctuation and word.isalpha(), words)))


def check_language(text):
    try:
        return detect(text)
    except:
        return ''


def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)


def process_too_long_text(text):
    sent = list(gensim.summarization.textcleaner.get_sentences(text))
    idx = min(len(sent), 3)
    return ' '.join(sent[0:idx])


def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results


def extract_description_pattern_is(text, company_name):
    # Extract description following pattern: <Company name> is ...
    description = ''
    if company_name in text:
        idx = text.find(company_name)
        potential_text = text[idx:idx + len(company_name) + 30]
        if ' is ' in potential_text:
            end_idx = len(text)
            for sym in ['*', '#', '|', '>']:
                sym_idx = text.find(sym, idx)
                if sym_idx != -1:
                    end_idx = min(end_idx, sym_idx)

            extracted = text[idx:end_idx]
            sent = list(gensim.summarization.textcleaner.get_sentences(extracted))
            extracted_desc = sent[0]
            if extracted_desc.split(' ')[-1] in {'Inc.', 'U.S.'} and len(sent) > 1:
                extracted_desc += ' ' + sent[1]
            if len(extracted_desc.split()) > 5:
                description = extracted_desc.replace('\n', ' ')
    return description


def extract_description_pattern_who_we_are(text):
    # Extract description following pattern: Who we are, ...
    description = ''
    text = text.split('\n')
    for idx, line in enumerate(text):
        for marker in ['# who we are', '# overview', '# company overview', '# about', '# mission',
                       '# our mission']:
            if marker in line.lower():
                start_idx = idx + 1
                while start_idx < len(text):
                    if text[start_idx] == '':
                        start_idx += 1
                    else:
                        break

                end_idx = start_idx + 1
                while end_idx < len(text):
                    if text[end_idx] != '':
                        end_idx += 1
                    else:
                        break

                if start_idx < len(text):
                    extracted = ' '.join(text[start_idx:end_idx])
                    extracted_desc = process_too_long_text(extracted)

                    extracted_desc_splits = extracted_desc.split()
                    if len(extracted_desc_splits) > 5 and extracted_desc_splits[0][0] != '*':
                        if extracted_desc_splits[0][0] == '#':
                            extracted_desc = ' '.join(extracted_desc_splits[1:])
                        description = extracted_desc
                        break
    return description

def preprocess_df(df):
    df['text'] = df['text'].str.replace(weird_char, '\n')
    df['about_us_text'] = df['about_us_text'].str.replace(weird_char, '\n')
    df['description'] = df['description'].str.replace(weird_char, '\n')

    df['text_lang'] = df['text'].apply(lambda x: check_language(x))
    df['about_us_text_lang'] = df['text'].apply(lambda x: check_language(x))

    return df

In [4]:
df = pd.read_csv(input_file, sep='\t', quoting=csv.QUOTE_NONE).fillna('')
df = preprocess_df(df)

docs = []
for index, row in df.iterrows():
    for field, field_lang in {'text':'text_lang', 'about_us_text':'about_us_text_lang'}.items():
        if row[field] and row[field_lang] == 'en':
            docs.append(process_text(row[field]))

# remove words appearing in more than 95% / less than 1% of all documents
cv = CountVectorizer(max_df=0.95, min_df=0.01, stop_words=gensim.parsing.preprocessing.STOPWORDS)
word_count_vector = cv.fit_transform(docs)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

feature_names = cv.get_feature_names()

In [5]:
count_valid_description = 0
for index, row in df.iterrows():
    
    # check if the description from meta field is in English and has more than 3 tokens
    if check_language(row['description']) == 'en' and len(row['description'].split()) > 3:
        description = row['description']
        description = process_too_long_text(description)
        count_valid_description += 1
    else:
        description = ''

    keywords_text = []
    for field, field_lang in {'text':'text_lang', 'about_us_text':'about_us_text_lang'}.items():
        if row[field]:
            if row[field_lang] != 'en':
                continue

            keywords_text.append(row[field])

            if not description:
                description = extract_description_pattern_is(row[field], row['name'])
                if not description:
                    description = extract_description_pattern_who_we_are(row[field])

    df.loc[index, 'description'] = description
    
    keywords_text = ' '.join(keywords_text)
    
    # Check if the website text + about us page text has more than 50 tokens
    if len(keywords_text.split()) > 50:
        keywords_text += ' ' + description
        tf_idf_vector = tfidf_transformer.transform(cv.transform([keywords_text]))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
    else:
        keywords = {}

    df.loc[index, 'keywords'] = ', '.join(['%s: %s' % (k, v) for (k, v) in keywords.items()]) #' '.join(keywords)

In [6]:
print(count_valid_description)
print(df[df['description']!=''].shape)
print(df[df['keywords']!=''].shape)

2574
(3699, 10)
(5306, 10)


In [7]:
df[(df['description']!='')&(df['keywords']!='')][['name','url','description','keywords']]

Unnamed: 0,name,url,description,keywords
5,Walton & Post Inc,www.waltonpost.com,"Walton & Post, Inc., is a food and personal care products distributor, providing wholesalers and retailers with nationally and internationally known brands throughout the United States and the world.","brands: 0.43, post: 0.375, excellence: 0.191, food: 0.191, retailers: 0.176, nationally: 0.176, internationally: 0.176, states: 0.175, distributor: 0.169, united: 0.167"
8,Rice Software Consulting L L C,ricesoftware.com,"At Rice Software Consulting, LLC our mission is to help professionals get the most out of their time. We specialize in training and consulting services for: LexisNexis Practice Management software packages including: Front Office, Back Office, Time MattersÂ®, Billing Mattersâ¢, Billing Mattersâ¢ Plus Accounting as well as links from those software packages; to associated software packages such as Timeslips, QuickBooks, CrystalReports and Hot Docs.","software: 0.49, billing: 0.409, packages: 0.383, consulting: 0.269, consultant: 0.175, plus: 0.164, llc: 0.148, practice: 0.146, office: 0.144, time: 0.142"
12,Health Officers Association of California,calhealthofficers.org,Health Officers Association of California (HOAC) is a membership,"california: 0.377, health: 0.348, officers: 0.26, legislative: 0.25, membership: 0.181, care: 0.177, public: 0.177, day: 0.165, law: 0.157, communications: 0.145"
17,Management Consultants,managementconsultants.com,"Since 1976, a business consulting firm specializing in Profit Improvement, Corporate Development, Preemptive Turnaround and CEO Coaching.","performance: 0.31, corporate: 0.283, improvement: 0.251, traditional: 0.178, profits: 0.176, causes: 0.164, profit: 0.163, culture: 0.138, unique: 0.134, turnaround: 0.128"
33,BEST Robotics Inc.,bestinc.org,"BEST Robotics Inc. (BRI) is a non-profit, volunteer-based organization.","best: 0.585, competition: 0.204, students: 0.203, hub: 0.191, schools: 0.163, engineering: 0.14, provider: 0.136, regional: 0.132, official: 0.128, participate: 0.125"
43,Davarcci Associates Inc.,davarcci.com,Davarcci Associates Inc. is an independently and privately held boutique,"transformation: 0.303, data: 0.272, methodology: 0.266, associates: 0.255, contacts: 0.234, agile: 0.197, independently: 0.151, privately: 0.149, lifecycle: 0.146, boutique: 0.144"
48,"Central Carolina Skating Club, Inc.",centralcarolinasc.com,Central Carolina Skating Club Events,"carolina: 0.467, test: 0.317, club: 0.303, membership: 0.25, central: 0.238, visit: 0.229, calendar: 0.208, session: 0.194, orange: 0.17, awards: 0.13"
51,Winning Work,winningwork.com,"This website is for sale! winningwork.com is your first and best source for all of the information you’re looking for. From general topics to more of what you would expect to find here, winningwork.com has it all.","domain: 0.489, com: 0.293, owner: 0.286, sale: 0.259, asking: 0.198, advertisers: 0.194, maintains: 0.192, controlled: 0.19, parking: 0.19, generated: 0.187"
52,QuickFlows,quickflows.com,"QuickFlows is a cloud based Business Process Management (BPM) platform and services aimed at orchestrating people and systems working together. QuickFlows offers a complete and well integrated service for discovering, designing, implementing, deploying, executing and monitoring custom business process centric applications.","adapt: 0.258, monitoring: 0.206, contact: 0.202, know: 0.194, work: 0.193, process: 0.193, support: 0.193, feedback: 0.188, quickly: 0.185, llc: 0.175"
55,Capital City Media,capitalcitymedia.co.uk,"Capital City Media is the oldest finance media agency in the UK, with expertise in planning and buying media inventory, online advertising and benchmarking.","capital: 0.439, city: 0.391, media: 0.373, advertising: 0.294, mike: 0.197, season: 0.193, director: 0.165, uk: 0.129, clients: 0.12, insight: 0.106"
