In [1]:
import numpy as np
import networkx as nx
import pandas as pd
from tqdm import tqdm
import csv
import os

In [2]:
# Cached, language-filtered subset produced from the raw dump below.
out_path = "en.csv"
# Raw tab-separated assertions dump that the extraction step scans.
path = "assertions.csv"

In [3]:
def get_desired_subset(path, desired_lang="en", encoding="iso-8859-15"):
    """Collect the assertions whose source concept is in ``desired_lang``.

    Every line of ``path`` must hold exactly five tab-separated fields; the
    second, third and fourth are taken as relation, source concept and target
    concept. The language is segment 2 of the ``/``-separated source concept
    (``en`` in ``/c/en/abate``), and the concept must also carry a term at
    segment 3 to be considered well-formed.

    Parameters
    ----------
    path : str
        Tab-separated assertions file to scan.
    desired_lang : str, default "en"
        Language code the source concept must have for the row to be kept.
    encoding : str, default "iso-8859-15"
        Text encoding used to decode ``path``. The default is kept for
        backward compatibility; being a single-byte codec it turns each
        multi-byte UTF-8 character into several unrelated characters, so
        pass ``"utf-8"`` when the dump is UTF-8 encoded.

    Returns
    -------
    (desired, errors) : tuple of lists
        ``desired`` holds ``(relation, edge_from, edge_to)`` tuples for the
        matching rows; ``errors`` holds the raw text of every line that did
        not have five fields or whose source concept had fewer than four
        ``/``-separated segments.
    """
    desired = []
    errors = []
    with open(path, "r", encoding=encoding) as f:
        # Line numbers are 1-based and count malformed lines too.
        for line_no, line in enumerate(tqdm(f), start=1):
            fields = line.split("\t")
            if len(fields) != 5:
                errors.append(line)
                continue
            _, relation, edge_from, edge_to, _ = fields

            # Split once; a concept URI without a term segment is malformed.
            segments = edge_from.split("/")
            if len(segments) < 4:
                errors.append(line)
                continue

            lang = segments[2]
            if lang == desired_lang:
                desired.append((relation, edge_from, edge_to))
            elif (not lang.isalpha()) and (lang != "roa-opt"):
                # Diagnostic: surface unexpected non-alphabetic language codes.
                print(lang, line_no)

    return desired, errors


def write_to_csv(desired, out_path):
    """Write the rows in ``desired`` to ``out_path`` as a header-less CSV.

    Parameters
    ----------
    desired : iterable of sequences
        Rows to write; each item becomes one CSV record.
    out_path : str
        Destination file. It is created or overwritten.

    Notes
    -----
    The file is opened with ``newline=""`` so the csv writer controls line
    endings itself (otherwise Windows would emit ``\\r\\r\\n``), and with an
    explicit UTF-8 encoding so the result does not depend on the platform
    default and matches what ``pandas.read_csv`` expects by default.
    """
    with open(out_path, "w", newline="", encoding="utf-8") as out:
        csv.writer(out).writerows(desired)

In [4]:
# Scanning the full dump is slow, so it only happens when the cached subset
# file is absent. The list of unparseable lines (`errors`) is kept in the
# kernel for inspection but is not written anywhere.
if not os.path.exists(out_path):
    desired, errors = get_desired_subset(path, desired_lang="en")
    write_to_csv(desired=desired, out_path=out_path)

In [5]:
# Load the extracted subset. The file is written without a header row, so
# header=None makes pandas label the columns 0, 1, 2.
df = pd.read_csv(out_path, header=None)

In [6]:
# Peek at the raw rows: column 0 is the relation, 1 the source concept URI,
# 2 the target concept URI.
df.head()

Unnamed: 0,0,1,2
0,/r/Antonym,/c/en/0/n,/c/en/1
1,/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock
2,/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock
3,/r/Antonym,/c/en/5/n,/c/en/3
4,/r/Antonym,/c/en/a.c/n,/c/en/d.c


In [21]:
# Inspect one raw row (the output still shows column labels 0/1/2, i.e. the
# frame before preprocess_df). Its source URI carries extra segments after the
# term ("/v/wikt/en_1"), which is why preprocessing keeps only segment 3.
df.loc[25]

0                 /r/Antonym
1    /c/en/abate/v/wikt/en_1
2              /c/en/augment
Name: 25, dtype: object

In [7]:
def preprocess_df(df):
    """Turn the raw assertion frame into a clean Relation/From/To table.

    The input is not modified; a new frame is built and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with columns labelled ``0`` (relation URI such as
        ``/r/Antonym``), ``1`` (source concept URI) and ``2`` (target concept
        URI such as ``/c/en/augment``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Relation``, ``From`` and ``To``. ``/r/`` is stripped from
        the relation, rows whose relation is ``ExternalURL`` or starts with
        ``dbpedia`` are dropped, and each concept URI is reduced to its term
        (segment 3 when split on ``/``). Surviving rows keep their original
        index labels.

    Raises
    ------
    IndexError
        If a kept concept URI has fewer than four ``/``-separated segments.
    """

    def concept_term(uri):
        # "/c/en/abate/v/wikt/en_1" -> "abate": segment 3 is the term itself.
        return uri.split("/")[3]

    # Literal (non-regex) replacement, computed on a new Series so the
    # caller's frame is left untouched.
    relation = df[0].str.replace("/r/", "", regex=False)
    keep = ~(relation == "ExternalURL") & ~relation.str.startswith("dbpedia")

    # Build the result from the filtered columns directly instead of assigning
    # into a filtered slice, which would trigger SettingWithCopyWarning.
    return pd.DataFrame(
        {
            "Relation": relation[keep],
            "From": df.loc[keep, 1].map(concept_term),
            "To": df.loc[keep, 2].map(concept_term),
        }
    )

In [8]:
# Rebinds `df` to the cleaned frame. Afterwards the raw column labels 0/1/2
# no longer exist, so re-running this cell without reloading `df` first fails.
df = preprocess_df(df)

In [9]:
# Display the cleaned frame (pandas truncates the rendered rows).
df

Unnamed: 0,Relation,From,To
0,Antonym,0,1
1,Antonym,12_hour_clock,24_hour_clock
2,Antonym,24_hour_clock,12_hour_clock
3,Antonym,5,3
4,Antonym,a.c,d.c
...,...,...,...
6344010,UsedFor,zoom_lens,procure_better_shot
6344011,UsedFor,zoom_lens,see_things_bigger
6344012,UsedFor,zoom_lens,seeing_distant_object_more_closely
6344013,UsedFor,zoom_lens,take_pictures


In [10]:
# Persist the cleaned table. With the default index=True the row index is
# written as an unnamed first column.
# NOTE(review): terms such as "nan" or "null" would be parsed as missing values
# by a default pd.read_csv on this file — confirm downstream readers pass
# keep_default_na=False if those terms matter.
df.to_csv("processed_en.csv")