In [105]:
import pandas as pd
import re

# Setup

In [118]:
def list2txt(arr,filename):
    with open(f'{filename}','w') as f:
        f.write('\n'.join(arr))

In [119]:
def replace_nums(label):
    if pd.isna(label):
        return None
    regex = r"[\s]?[0-9]+. "
    return re.sub(regex,"/",label)

In [120]:
def clean(df, dataset):
    if dataset == "manual":
        # 1. Remove NaN rows
        df = df.dropna(axis=0,how="all")
        
    elif dataset == "topics":
        # 1. Rename the pasted data from topics classifier
        df.rename({0:"Website URL", 1:"Unformatted Labels"},axis=1,inplace=True)
        df = df.set_index("Website URL")
        # 2. Remove the numbers and make format similar to manual
        df["Slash Labels"] = df["Unformatted Labels"].apply(replace_nums)
        
    # Make everything lowercase
    df = df.apply(lambda x: x.astype(str).str.lower())
    return df

In [121]:
def extract_labels(label_str):
    '''
    Extract labels from a singular cell
    '''
    if pd.isna(label_str):
        return []
    return label_str.split("/")[1:] # Remove the first empty one

def combine_labels(df, label_count):
    '''
    Returns new column with the distinct labels after theyve been extracted
    '''
    
    # Combine labels in all the rows
    label_list = df[f"Label 1"].apply(extract_labels)
    for i in range(2,label_count+1):
        label_list += df[f"Label {i}"].apply(extract_labels)
        
    label_list = label_list.apply(lambda arr : list(set(arr))) # Remove duplicates
    return label_list

In [122]:
manual_df = pd.read_csv("manual_labelling.csv",skiprows=3,index_col="Website URL")
manual_df

Unnamed: 0_level_0,Label 1,Label 2,Label 3
Website URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trade.atlantiscex.com,/Finance/Financial Planning & Management,/Finance/Investing/Currencies & Foreign Exchange,/Business & Industrial/Advertising & Marketing...
w2.brreg.no,,,
sadlierconnect.com,/Jobs & Education/Education,/Jobs & Education/Education/Colleges & Univers...,/Reference/Educational Resources
webcampus.fdu.edu,/Jobs & Education/Education,/Jobs & Education/Education/Colleges & Univers...,/Reference/Educational Resources
www.softbank.jp,/Finance/Investing/Currencies & Foreign Exchange,/Finance/Credit Cards,/Finance/Accounting & Auditing/Tax Preparation...
...,...,...,...
www.veepee.it,,,
www.sephora.de,,,
thothub.to,,,
www.freefilefillableforms.com,,,


In [123]:
topics_df = pd.read_csv("topics_labelling.csv",header=None)
topics_df

Unnamed: 0,0,1
0,trade.atlantiscex.com,
1,sadlierconnect.com,1. Arts & entertainment
2,webcampus.fdu.edu,229. Colleges & universities
3,www.softbank.jp,215. Internet & telecom 218. Phone service pro...
4,www.camif.fr,207. Home & garden
...,...,...
410,www.yannick.com.tw,1. Arts & entertainment 289. Shopping
411,www.langkahindopools.org,275. Reference
412,www.ngengesport.cd,243. News 299. Sports
413,doctruyen3q.site,100. Books & literature 1. Arts & entertainment


In [124]:
manual_df = clean(manual_df, "manual")
list2txt(list(manual_df.index), "urls.txt")

if "Labels" not in manual_df.columns:
    manual_df["Labels"] = combine_labels(manual_df, 3)
manual_df = manual_df[["Labels"]] # Dont need the other labels now

In [125]:
manual_df

Unnamed: 0_level_0,Labels
Website URL,Unnamed: 1_level_1
trade.atlantiscex.com,"[currencies & foreign exchange, sales, finance..."
sadlierconnect.com,"[educational resources, reference, jobs & educ..."
webcampus.fdu.edu,"[educational resources, reference, jobs & educ..."
www.softbank.jp,"[currencies & foreign exchange, finance, credi..."
www.camif.fr,"[home & garden, home improvement, home & inter..."
...,...
www.yannick.com.tw,"[shopping, cooking & recipes, food & drink]"
www.langkahindopools.org,"[finance, online communities, sports]"
www.ngengesport.cd,"[finance, sports, soccer]"
doctruyen3q.site,"[comics, entertainment industry, books & liter..."


In [126]:
topics_df = clean(topics_df,"topics")
if "Labels" not in topics_df.columns:
    topics_df["Labels"] = topics_df["Slash Labels"].apply(extract_labels)
topics_df = topics_df[["Labels"]]
topics_df

Unnamed: 0_level_0,Labels
Website URL,Unnamed: 1_level_1
trade.atlantiscex.com,[]
sadlierconnect.com,[arts & entertainment]
webcampus.fdu.edu,[colleges & universities]
www.softbank.jp,"[internet & telecom, phone service providers]"
www.camif.fr,[home & garden]
...,...
www.yannick.com.tw,"[arts & entertainment, shopping]"
www.langkahindopools.org,[reference]
www.ngengesport.cd,"[news, sports]"
doctruyen3q.site,"[books & literature, arts & entertainment]"


# Comparison

## Match counts

In [135]:
def correct_label(test, truth):
    '''
    Given a test list (topics label, usually), do all the labels appear in the truth list?
    If its empty, it is not a match. Flag as empty.
    '''
    if not test:
        return False, True # Not a match, empty test
    return all(i in truth for i in test), False
    

In [137]:
combined_df = topics_df
combined_df.rename({"Labels":"Topics"},axis=1,inplace=True)
combined_df["Manual"] = manual_df["Labels"]
combined_df

Unnamed: 0_level_0,Topics,Manual
Website URL,Unnamed: 1_level_1,Unnamed: 2_level_1
trade.atlantiscex.com,[],"[currencies & foreign exchange, sales, finance..."
sadlierconnect.com,[arts & entertainment],"[educational resources, reference, jobs & educ..."
webcampus.fdu.edu,[colleges & universities],"[educational resources, reference, jobs & educ..."
www.softbank.jp,"[internet & telecom, phone service providers]","[currencies & foreign exchange, finance, credi..."
www.camif.fr,[home & garden],"[home & garden, home improvement, home & inter..."
...,...,...
www.yannick.com.tw,"[arts & entertainment, shopping]","[shopping, cooking & recipes, food & drink]"
www.langkahindopools.org,[reference],"[finance, online communities, sports]"
www.ngengesport.cd,"[news, sports]","[finance, sports, soccer]"
doctruyen3q.site,"[books & literature, arts & entertainment]","[comics, entertainment industry, books & liter..."


In [153]:
combined_df[["Match", "Empty"]] = combined_df.apply(lambda x: correct_label(x["Topics"], x["Manual"]), axis=1,result_type='expand')
combined_df.to_csv("combined.csv")
combined_df

Unnamed: 0_level_0,Topics,Manual,Match,Empty
Website URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
trade.atlantiscex.com,[],"[currencies & foreign exchange, sales, finance...",False,True
sadlierconnect.com,[arts & entertainment],"[educational resources, reference, jobs & educ...",False,False
webcampus.fdu.edu,[colleges & universities],"[educational resources, reference, jobs & educ...",True,False
www.softbank.jp,"[internet & telecom, phone service providers]","[currencies & foreign exchange, finance, credi...",False,False
www.camif.fr,[home & garden],"[home & garden, home improvement, home & inter...",True,False
...,...,...,...,...
www.yannick.com.tw,"[arts & entertainment, shopping]","[shopping, cooking & recipes, food & drink]",False,False
www.langkahindopools.org,[reference],"[finance, online communities, sports]",False,False
www.ngengesport.cd,"[news, sports]","[finance, sports, soccer]",False,False
doctruyen3q.site,"[books & literature, arts & entertainment]","[comics, entertainment industry, books & liter...",True,False


In [149]:
combined_df["Match"].value_counts()

False    290
True     125
Name: Match, dtype: int64

In [151]:
print("Percentage match")
temp = combined_df["Match"].value_counts()
(temp[True])/(temp[True]+temp[False]) * 100

Percentage match


30.120481927710845

In [144]:
combined_df["Empty"].value_counts()

False    347
True      68
Name: Empty, dtype: int64

### Excluding empty ones

In [158]:
combined_df[~combined_df.Empty]["Match"].value_counts()

False    222
True     125
Name: Match, dtype: int64

In [159]:
print("Percentage match")
combined_wo_empty_df = combined_df[~combined_df.Empty]
temp = combined_wo_empty_df["Match"].value_counts()
(temp[True])/(temp[True]+temp[False]) * 100

Percentage match


36.023054755043226

In [162]:
combined_wo_empty_df[~combined_wo_empty_df["Match"]].to_csv("mismatches.csv")
combined_wo_empty_df

Unnamed: 0_level_0,Topics,Manual,Match,Empty
Website URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sadlierconnect.com,[arts & entertainment],"[educational resources, reference, jobs & educ...",False,False
webcampus.fdu.edu,[colleges & universities],"[educational resources, reference, jobs & educ...",True,False
www.softbank.jp,"[internet & telecom, phone service providers]","[currencies & foreign exchange, finance, credi...",False,False
www.camif.fr,[home & garden],"[home & garden, home improvement, home & inter...",True,False
www.livefans.jp,[arts & entertainment],"[humor, acting & theater, arts & entertainment...",True,False
...,...,...,...,...
www.yannick.com.tw,"[arts & entertainment, shopping]","[shopping, cooking & recipes, food & drink]",False,False
www.langkahindopools.org,[reference],"[finance, online communities, sports]",False,False
www.ngengesport.cd,"[news, sports]","[finance, sports, soccer]",False,False
doctruyen3q.site,"[books & literature, arts & entertainment]","[comics, entertainment industry, books & liter...",True,False


## Particular categories