In [2]:
import pandas as pd, gzip, re, tldextract
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from requests import get
from langdetect import detect

In [28]:
# --- Inputs -----------------------------------------------------------------
# Gzipped product dump that the extraction cell scans line by line.
input_path = "../Data/schema_Product.gz"

# Category keywords; a product category containing any of these is kept.
phone_lst = [
    "smartphone",
    "phone",
    "phone case",
    "phone cable",
    "phone charger",
    "phone mount",
    "cell phone",
]

# --- Outputs ----------------------------------------------------------------
# Raw source URLs of matching products, and the final English-only domain list.
output_path, new_out_path = "../Data/phone_sites.txt", "../Data/phones_new.txt"

In [None]:
# Cheap pre-filter: only lines mentioning the Product/category predicate matter.
taxo_pattern = re.compile("<http://schema.org/Product/category>", re.IGNORECASE)
# Full line shape: `_:subject <predicate> object <source> .`
# Raw string so that \s, \/ and \. reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
split_pattern = re.compile(
    r"^(_:.*)\s(<http:\/\/schema\.org\/Product\/category>)\s(.*)\s<(.*)>\s\.$",
    re.IGNORECASE,
)

# Counters, kept at module level so later cells can inspect them.
i = 0          # lines read
detected = 0   # category lines whose object contains a phone keyword
skipped = 0    # category lines that did not fit split_pattern
not_taxo = 0   # lines without the Product/category predicate
inserted = 0   # source URLs written to output_path

# Product -> look for product cat -> get source
# The output file is opened once for the whole scan instead of once per match.
# NOTE(review): mode "a" is kept from the original, so re-running this cell
# appends the same URLs again; clear output_path first for a clean run.
# NOTE(review): keyword matching is plain substring search, so "phone" also
# matches categories such as "headphone" — confirm this is intended.
with gzip.open(input_path, "rt") as f, open(output_path, "a") as file:
    for line in f:
        i += 1
        if not taxo_pattern.search(line):
            not_taxo += 1
            continue
        match = split_pattern.match(line)
        if match is None:
            skipped += 1
            continue
        # Group 3 is the category object, group 4 the source URL.
        obj = match.group(3).lower()
        source = match.group(4)
        if any(ext in obj for ext in phone_lst):
            detected += 1
            file.write(source + "\n")
            inserted += 1

print("detected " + str(detected) + ", inserted: " + str(inserted) +" lines out of " + str(i) + "; not in taxo: " + str(not_taxo))
print("skipped (category lines not matching the expected shape): " + str(skipped))
print("Done extracting phone & accessories websites from Product.gz file")

In [5]:
# Load the extracted source URLs: the file has no header row, one URL per line.
read_options = {"header": None, "sep": " "}
df = pd.read_csv(output_path, **read_options)
# The single unnamed column holds the URL.
df.columns = ["url"]
df.describe()

Unnamed: 0,url
count,26821
unique,15823
top,https://www.colamco.com/boss-hp12-headphone-hp...
freq,2296


In [6]:
# Collapse repeated URLs, keeping the first occurrence (the pandas default).
df = df.drop_duplicates(subset=['url'])
df.describe()

Unnamed: 0,url
count,15823
unique,15823
top,http://www.speckproducts.com/apple/iphone-case...
freq,1


In [8]:
def getNetloc(row):
    """Return the network location (host part) of ``row['url']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'url'`` (e.g. a DataFrame row).

    Returns
    -------
    str or None
        ``urlparse(row['url']).netloc``; ``None`` if parsing fails, in which
        case the offending URL is printed.
    """
    try:
        return urlparse(row['url']).netloc
    except Exception:
        # Best-effort: report the bad value and keep going. `Exception` rather
        # than a bare `except` so KeyboardInterrupt/SystemExit still propagate.
        print("expection: ", row['url'])
        return None

def getSuffix(row):
    """Return the suffix that ``tldextract`` reports for ``row['netloc']``.

    Parameters
    ----------
    row : mapping
        Must provide ``'netloc'`` (the value parsed) and ``'url'`` (used only
        in the failure message).

    Returns
    -------
    str or None
        ``tldextract.extract(row['netloc']).suffix``; ``None`` if extraction
        fails, in which case the row's URL is printed.
    """
    try:
        return tldextract.extract(row['netloc']).suffix
    except Exception:
        # Best-effort: report and continue. `Exception` rather than a bare
        # `except` so KeyboardInterrupt/SystemExit still propagate.
        print("expection: ", row['url'])
        return None

# Derive the host first, then its suffix (getSuffix reads the 'netloc' column,
# so the order matters).
for derived_col, extractor in (("netloc", getNetloc), ("suffix", getSuffix)):
    df[derived_col] = df.apply(extractor, axis="columns")
df.describe()

Unnamed: 0,url,netloc,suffix
count,15823,15823,15823
unique,15823,563,70
top,http://www.speckproducts.com/apple/iphone-case...,www.speckproducts.com,com
freq,1,3740,8785


In [24]:
# Keep only rows whose suffix is on the allow-list:
# .com, .co.uk, .net, .co.au, .ca, .us, .co.nz, .com.co, .org, .eu, .ie, .me, .shop
# NOTE(review): 13 suffixes are listed but the recorded describe() shows only
# 12 distinct values after filtering; 'co.au' may have been meant as 'com.au'
# — confirm.
allowed_suffixes = ['com', 'co.uk', 'net', 'co.au', 'ca', 'us', 'co.nz',
                    'com.co', 'org', 'eu', 'ie', 'me', 'shop']
has_allowed_suffix = df['suffix'].isin(allowed_suffixes)
df = df[has_allowed_suffix]
df.describe()

Unnamed: 0,url,netloc,suffix
count,9410,9410,9410
unique,9410,323,12
top,https://www.speckproducts.com/apple/iphone-cas...,www.speckproducts.com,com
freq,1,3740,8785


In [25]:
 
# One row per distinct host. `url_list` was not defined in any visible cell, so
# this cell raised NameError on a fresh kernel; it is now derived here.
# NOTE(review): presumably the original list was the unique netlocs (the
# recorded output has 323 rows, matching the 323 unique 'netloc' values) —
# confirm.
url_list = df['netloc'].unique().tolist()
# The column is named 'url' although it holds hosts; downstream cells read
# row['url'], so the name is kept.
df_domains = pd.DataFrame(url_list, columns=['url'])
print(df_domains)

                              url
0           www.speckproducts.com
1             www.crutchfield.com
2             www.myphonecase.com
3                  nottabelle.com
4                    bestmvno.com
..                            ...
318  www.discount-low-voltage.com
319         rocain.threadless.com
320                  itstyle.shop
321      hopscotch.threadless.com
322     happyronin.threadless.com

[323 rows x 1 columns]


In [26]:
def detectLang(row, timeout=10):
    """Fetch the page at ``row['url']`` and detect the language of its body text.

    Parameters
    ----------
    row : mapping
        Must provide ``'url'``; ``"http://"`` is prepended before the request.
    timeout : float, optional
        Seconds passed to the HTTP request as its timeout, so one unresponsive
        host cannot stall the whole ``apply``. Defaults to 10.

    Returns
    -------
    str or None
        Whatever ``detect`` returns for the text of the page's ``<body>``;
        ``None`` if the request, the parsing or the detection fails, in which
        case the URL is printed.
    """
    try:
        url = row['url']
        response = get("http://" + str(url), timeout=timeout)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        # A page without <body> makes `.body` None; the resulting
        # AttributeError is handled below like any other failure.
        return detect(html_soup.body.text)
    except Exception:
        # Best-effort: report and continue. `Exception` rather than a bare
        # `except` so KeyboardInterrupt can still stop a long crawl.
        print("exception:", row['url'])
        return None

# Row-wise language detection: one HTTP request per domain, so this is slow.
df_domains['lang'] = df_domains.apply(detectLang, axis="columns")

exception: icecat.co.uk
exception: www.atomiccellular.com
exception: www.transformyourimages.co.uk
exception: www.asianfoodgrocer.com
exception: www.begoos.com
exception: icecat.us
exception: www.rssd.com
exception: www.easiercap.com
exception: www.dyesublimationblanks.com
exception: shopify.undergroundhiphop.com
exception: onecall.com
exception: www.ncds.ca
exception: mango-office.com


In [27]:
# Keep the domains whose detected language is English; rows where detection
# failed (lang is None) are dropped by the comparison.
is_english = df_domains['lang'].eq("en")
df_phone = df_domains[is_english]
print(df_phone)

                              url lang
0           www.speckproducts.com   en
1             www.crutchfield.com   en
2             www.myphonecase.com   en
3                  nottabelle.com   en
4                    bestmvno.com   en
..                            ...  ...
317      thebearly.threadless.com   en
318  www.discount-low-voltage.com   en
319         rocain.threadless.com   en
321      hopscotch.threadless.com   en
322     happyronin.threadless.com   en

[271 rows x 2 columns]


In [34]:
# Write one domain per line.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the same domains again — confirm that accumulation is intended.
with open(new_out_path, 'a') as file:
    file.writelines(url + "\n" for url in df_phone['url'])

In [None]:
# Target product categories and their synonyms.
movie_lst = [
    'tv series',
    'movies',
    'film',
    'motion picture',
    'movies and tv',
    'movies & tv',
]

# One-column frames of category keywords, to be annotated with occurrence counts.
df_movies = pd.DataFrame({'CATEGORY': movie_lst})
# NOTE(review): this re-binds df_phone, which an earlier cell used for the
# English-only domain table.
df_phone = pd.DataFrame({'CATEGORY': phone_lst})

def count_cat(row):
    """Count the rows of the global ``df`` whose 'prodcat' contains a category.

    Parameters
    ----------
    row : mapping
        Must provide ``'CATEGORY'``, the text to look for.

    Returns
    -------
    int or None
        Number of ``df['prodcat']`` values containing ``row['CATEGORY']`` as a
        literal substring; ``None`` if the lookup fails, in which case the
        category is printed.

    Notes
    -----
    NOTE(review): the ``df`` built in the visible cells has no 'prodcat'
    column — confirm which frame this is meant to run against.
    """
    try:
        # regex=False: categories are plain text, so characters such as
        # '+', '(' or '.' must not be interpreted as regex syntax.
        return df['prodcat'].str.contains(row['CATEGORY'], regex=False).sum()
    except Exception:
        # Best-effort: report and continue. `Exception` rather than a bare
        # `except` so KeyboardInterrupt/SystemExit still propagate.
        print("exception: ", row['CATEGORY'])
        return None

# Annotate each keyword frame with its occurrence count, then show both.
for category_frame in (df_movies, df_phone):
    category_frame['OCCURENCE'] = category_frame.apply(count_cat, axis="columns")

for category_frame in (df_movies, df_phone):
    print(category_frame)