In [3]:
import re
import ast
import json
import pandas as pd
from pprint import pprint

from urllib.parse import urlparse
from url_parser import parse_url, get_url, get_base_url

import nltk
from nltk.corpus import stopwords

In [2]:
custom_stopwords = ['www', 'www.', 'com', 'les', 'org', 'tag', 'html', 'id', 'un', 'win', 
                    'en', 'me', 'php', 'asp', 'aspx', 'cc', 'net']


In [5]:
def get_size():
    df = pd.read_csv('1.csv')
    print(df.shape)
    df = pd.read_csv('1_label.csv')
    print(df.shape)
    df = pd.read_csv('1_nolabel.csv')
    print(df.shape)
    

def get_iab_categories(iab_categories):
    iab_categories_list = []
    iab_categories = ast.literal_eval(iab_categories)
    for iab_category in iab_categories:
        for value in list(iab_category.values())[:-1]:
            iab_categories_list.append([value, iab_category['score']])
    return iab_categories_list

def get_base_url_(url):
    try:
        return get_base_url(url)
    except:
        return url

def get_domain(url):
    try:
        domain = parse_url(url)['domain']
    except:
        domain = ' '.join(urlparse(url).netloc.split('.'))
    try:
        top_domain = parse_url(url)['top_domain'] or ''
    except:
        top_domain = ''
    try:
        sub_domain = parse_url(url)['sub_domain'] or ''
    except:
        sub_domain = ''
    return domain + ' ' + sub_domain + ' ' + top_domain 
    
def get_path(url):
    try:
        path = parse_url(url)['path']
    except:
        path = urlparse(url).path
    return path

def remove_stop_words(text):
    stop = []
    stop.extend(stopwords.words('english'))
    stop.extend(stopwords.words('french'))
    stop.extend(stopwords.words('spanish'))
    stop.extend(custom_stopwords)
    text = re.sub('[^A-Za-z]+', ' ', str(text)).strip()
    text = ' '.join([word for word in text.split() if word not in (stop)])
    
    return text
    
def clean(df, df_name):    
    df_nolabel = df[df['iabCategories'].isnull()]
    df_nolabel.to_csv(f'{df_name}_nolabel.csv')
    
    df = df[~df['iabCategories'].isnull()]
    df = df.fillna('')
    df['iabCategories'] = df['iabCategories'].apply(get_iab_categories)
    df = df.explode('iabCategories', ignore_index = True)
    split = pd.DataFrame(df['iabCategories'].to_list(), columns = ['iab_categories', 'confidence'])
    df = pd.concat([df, split], axis = 1)
    df = df.drop('iabCategories', axis = 1)
    df['base_url'] = df['url'].apply(get_base_url_)
    df['domain'] = df['url'].apply(get_domain) 
    df['path'] = df['url'].apply(get_path)
    df['combine'] = df['channel'] + ' ' + df['domain'] + ' ' + df['path']
    df['combine'] = df['combine'].apply(remove_stop_words)
    df.to_csv(f'{df_name}_label.csv')
    return df
    

In [None]:
df = pd.read_csv('../data/clean/1.csv')
df = clean(df, '1')
# get_size()

In [None]:
df.sample(10)

In [132]:
df = pd.read_csv('test_label.csv')
df['combine'].to_list()

['linkedin facs medical professionals news publications news articles acs brief january issue surgeon well coalition expands resources throughout specialty',
 'facebook rhinews actualites proposition reparation creative lesclavage afro americains recevraient chacun selon democrate',
 'facebook rhinews actualites proposition reparation creative lesclavage afro americains recevraient chacun selon democrate',
 'facebook rhinews actualites proposition reparation creative lesclavage afro americains recevraient chacun selon democrate',
 'facebook rhinews actualites proposition reparation creative lesclavage afro americains recevraient chacun selon democrate',
 'facebook rhinews actualites proposition reparation creative lesclavage afro americains recevraient chacun selon democrate',
 'pelispgo',
 'pelispgo',
 'pelispgo',
 'incestflix SloansMoans page',
 'incestflix SloansMoans page',
 'incestflix SloansMoans page',
 'incestflix SloansMoans page',
 'incestflix SloansMoans page',
 'incestflix 

In [157]:
from url_parser import parse_url, get_url, get_base_url
from urllib.parse import urlparse

In [200]:
urls = [
    'https://www.governmentjobs.com/careers/hawaii/jobs/4347832-0/corrections-education-specialist-iii-correctional-facilities-hawaii-island',
#     'http://161.97.110.226/foursome-2023/#close',
    'https://uk.com/',
#     'https://www.sandipsitaula2023.com.np/2023/12/football-live-10.html',
#     'https://s3.amazonaws.com/facebookaddon/index.html#access_token=EAAPXW9nJcDUBO8IQ8XLC6HDT1beZBsfB1QCSLMvfPi33BbM75h5sawPnPCG4siJJP3sGWOiL9GZCyvbLl2hLE9ZBSCshZAZABC94qknu4RIV1kYqF3caaaHH919Xyzr9Swix3qJDq4U0By20x5NEG6z6JeIuEpAGUKVlHizbOVXml8XKepVYgYPB7W16tBZCBZBWy0ZD&data_access_expiration_time=1713536551&expires_in=5183927',
#     'https://dasao38015.top:3101/',
#     'http://128.199.175.251/contact.html',
#     'http://130.51.20.245/',
#     'https://wm.618851.com:2096/',
#     'https://accessmedicine-mhmedical-com.my.wvsom.edu:2443/',
#     'https://accessmedicine-mhmedical-com.my.wvsom.edu:2443/signin.aspx?msgtype=1&accessRedirect=true&AutoLogin=false&displayIndividualSigninNow=true',
#     'https://accessmedicine-mhmedical-com.my.wvsom.edu:2443/book.aspx?bookID=2988#250593850',
#     'https://xyt.881309.com:2096/',
#     'https://5.253.86.213/room-service-2024/',
#     'http://5.189.136.22/21mph-upload-terbaru/',
]

In [201]:
# pprint(parse_url('https://purenudism.city/'))
print()
for url in urls:
    pprint(urlparse(url))
    print()
    pprint(parse_url(url))


ParseResult(scheme='https', netloc='www.governmentjobs.com', path='/careers/hawaii/jobs/4347832-0/corrections-education-specialist-iii-correctional-facilities-hawaii-island', params='', query='', fragment='')

{'dir': '/careers/hawaii/jobs/4347832-0/',
 'domain': 'governmentjobs',
 'file': 'corrections-education-specialist-iii-correctional-facilities-hawaii-island',
 'fragment': None,
 'path': '/careers/hawaii/jobs/4347832-0/corrections-education-specialist-iii-correctional-facilities-hawaii-island',
 'protocol': 'https',
 'query': None,
 'sub_domain': 'www.',
 'top_domain': 'com',
 'www': 'www'}
ParseResult(scheme='https', netloc='uk.com', path='/', params='', query='', fragment='')



AttributeError: 'NoneType' object has no attribute 'group'