In [7]:
import pandas as pd
import rdflib

In [119]:
def csv_to_rdf(csv_file, rdf_file, dtype_mapping=None):
    """Convert a CSV file into an RDF graph and save it in Turtle format.

    Every CSV row becomes the subject ``http://example.org/row<index>``, every
    column becomes the predicate ``http://example.org/<column>``, and every
    cell becomes a literal object.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file to read with ``pandas.read_csv``.
    rdf_file : str or path-like
        Destination of the serialized Turtle graph.
    dtype_mapping : dict, optional
        Maps a column name to the datatype passed to ``rdflib.Literal`` for
        that column. Columns absent from the mapping (or all columns, when the
        mapping is omitted) get a literal built without an explicit datatype.
    """
    # The declared default is None; treat it as "no explicit datatypes" instead
    # of failing on the membership test below.
    if dtype_mapping is None:
        dtype_mapping = {}

    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Create an RDF graph
    g = rdflib.Graph()

    # Define a base URI for our RDF graph
    base_uri = rdflib.URIRef("http://example.org/")

    # Predicates depend only on the column, so build them once.
    columns = list(df.columns)
    predicates = [rdflib.URIRef(f"{base_uri}{column}") for column in columns]

    # Report untyped columns once each rather than once per cell.
    for column in columns:
        if column not in dtype_mapping:
            print(f"type missing in map: {column}")

    # itertuples keeps each column's own dtype. iterrows would upcast a whole
    # row to a single dtype, turning integer cells into floats such as 5.0
    # whenever the frame also holds float columns.
    for i, values in zip(df.index, df.itertuples(index=False, name=None)):
        row_uri = rdflib.URIRef(f"{base_uri}row{i}")
        for column, predicate, value in zip(columns, predicates, values):
            if column in dtype_mapping:
                object_value = rdflib.Literal(value, datatype=dtype_mapping[column])
            else:
                object_value = rdflib.Literal(value)
            g.add((row_uri, predicate, object_value))

    # Serialize the RDF graph to a file (Turtle format)
    g.serialize(destination=rdf_file, format="turtle")
    print(f"RDF graph has been saved to {rdf_file}")

In [115]:
def _row_sort_key(row_label):
    """Sort labels shaped like ``row<N>`` numerically; other labels sort after them."""
    suffix = row_label[3:] if row_label.startswith("row") else ""
    if suffix.isdecimal():
        return (0, int(suffix), row_label)
    return (1, 0, row_label)


def rdf_to_csv(rdf_file, csv_file):
    """Rebuild a CSV table from a Turtle RDF file.

    The last ``/``-separated segment of each subject URI is used as the row
    label and the last segment of each predicate URI as the column name; the
    triple's object is stored in that cell.

    Parameters
    ----------
    rdf_file : str or path-like
        Turtle file to parse.
    csv_file : str or path-like
        Destination CSV path. The row labels themselves are not written
        (``index=False``).
    """
    # Create an RDF graph and parse the RDF file
    g = rdflib.Graph()
    g.parse(rdf_file, format="turtle")

    # Row label -> {column name -> object}
    rows = {}

    # Iterate over the RDF triples to build the CSV structure
    for subject, predicate, obj in g:
        if isinstance(subject, rdflib.URIRef) and isinstance(predicate, rdflib.URIRef):
            # Extract the row label from the subject URI (e.g., "row0", "row1", ...)
            row_index = str(subject).split('/')[-1]
            column_name = str(predicate).split('/')[-1]

            # Create the row on first sight, then fill in the cell
            rows.setdefault(row_index, {})[column_name] = obj

    # Triples arrive in whatever order the graph yields them, so order the rows
    # by their numeric label to make the CSV deterministic (row0, row1, row2, ...
    # rather than row0, row1, row10, ...).
    ordered_rows = {label: rows[label] for label in sorted(rows, key=_row_sort_key)}

    # Convert the dictionary of rows into a DataFrame
    df = pd.DataFrame.from_dict(ordered_rows, orient='index')
    # Write the DataFrame to a CSV file
    df.to_csv(csv_file, index=False)
    print(f"CSV file has been saved to {csv_file}")

In [120]:
# XSD datatype to attach to the literals of each CSV column.
dmap = {
    "cap-diameter": rdflib.XSD.int,
    "cap-shape": rdflib.XSD.int,
    "gill-attachment": rdflib.XSD.int,
    "gill-color": rdflib.XSD.int,
    "stem-height": rdflib.XSD.float,
    "stem-width": rdflib.XSD.int,
    "stem-color": rdflib.XSD.int,
    "season": rdflib.XSD.float,
    "class": rdflib.XSD.int,
}
# Convert the CSV into a Turtle file using the datatypes above.
csv_to_rdf('mushroom_cleaned.csv', 'mushroom.ttl', dtype_mapping=dmap)

RDF graph has been saved to mushroom.ttl


In [85]:
# Convert the Turtle file back into a CSV file.
rdf_to_csv(rdf_file='mushroom.ttl', csv_file='mushroom_rdf.csv')

CSV file has been saved to mushroom_rdf.csv


In [91]:
def check_csv(left_side, right_side):
    """Return True when two CSV files hold the same table.

    Column order and row order are ignored. A column that is numeric in both
    files but parsed with different dtypes (e.g. ``5`` vs ``5.0``) is compared
    as float64 in both frames, so values are never truncated to force a match.

    Parameters
    ----------
    left_side, right_side : str or path-like
        CSV files to read with ``pandas.read_csv``.

    Returns
    -------
    bool
        True if both files have the same columns, the same number of rows and
        the same values; False otherwise.
    """
    left = pd.read_csv(left_side)
    right = pd.read_csv(right_side)

    # Different column sets or row counts can never describe the same table.
    columns = sorted(left.columns)
    if columns != sorted(right.columns) or len(left) != len(right):
        return False

    left = left.reindex(columns=columns)
    right = right.reindex(columns=columns)

    # Align dtypes before sorting. Widen both sides to float64 instead of
    # casting one side to the other's type: a float -> int cast would turn 5.7
    # into 5 and report a false match.
    is_numeric = pd.api.types.is_numeric_dtype
    widen = [
        col for col in columns
        if left[col].dtype != right[col].dtype
        and is_numeric(left[col])
        and is_numeric(right[col])
    ]
    if widen:
        casts = dict.fromkeys(widen, "float64")
        left = left.astype(casts)
        right = right.astype(casts)

    # Sort rows on every column so the original row order does not matter.
    left = left.sort_values(by=columns).reset_index(drop=True)
    right = right.sort_values(by=columns).reset_index(drop=True)
    return left.equals(right)

In [90]:
# Compare the original CSV with the one rebuilt from the RDF file.
print(check_csv(left_side='mushroom_cleaned.csv', right_side='mushroom_rdf.csv'))

True
