### Libraries

In [1]:
!pip install rdflib





In [5]:
import pandas as pd
import csv
import re
import rdflib
from rdflib import Graph, Namespace, Literal, RDF, RDFS

#### Generate the requirements file

In [6]:
# Record the exact library versions used in this run so the environment
# can be recreated later.
# write() is used because each entry is one string; writelines() expects an
# iterable of lines and would iterate a lone string character by character.
with open('requirements.txt', 'w', encoding='utf-8') as file:
    file.write(f'pandas=={pd.__version__}\n')
    file.write(f'rdflib=={rdflib.__version__}\n')

### Data Loading and pre-processing

In [7]:
# Load the data
# A forward slash is used as the separator: it works on every platform and
# avoids the invalid "\8" escape sequence that a backslash path produces
# inside a normal (non-raw) string literal.
df = pd.read_csv('data/8606-subtopics_Clean.csv')
df.head(10)

Unnamed: 0,Question,Answer,topic_name,topic_words,Unnamed: 4
0,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
1,How was the fertilization effect on phytoplank...,The fertilization effect on phytoplankton was ...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
2,How do rising temperatures affect Alpine lakes?,Rising temperatures increase mineral weatherin...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
3,How has an increase in phytoplankton biomass b...,Significant increase in phytoplankton biomass ...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
4,How do higher metabolic rates of organisms and...,Higher metabolic rates of organisms and longer...,Climatological Vegetation,"vegetation, plants, climatological, planting, ...",
5,How does atmospheric deposition represent a ke...,Atmospheric nitrogen deposition in many mounta...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
6,What has been shown to be the outcome of atmos...,Atmospheric deposition has been shown to alter...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
7,What is the natural environment of Tianchi Lake?,Tianchi Lake is an alpine lake located in a na...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
8,What is the purpose of this study?,The purpose of this study is to determine the ...,Researching Ecology,"researching, research, researched, researches,...",
9,What kind of data is being used in the study?,This study is using a dataset of atmospheric c...,Data Analysis Methods,"dataset, data, datasets, datastream, statistic...",


In [8]:
# Drop the stray 'Unnamed: 4' column and every row that has a missing value.
# errors='ignore' keeps this cell re-runnable: once the column has been
# removed, running the cell again no longer raises a KeyError.
df = df.drop(columns='Unnamed: 4', errors='ignore').dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8479 entries, 0 to 8478
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question     8479 non-null   object
 1   Answer       8479 non-null   object
 2   topic_name   8479 non-null   object
 3   topic_words  8479 non-null   object
dtypes: object(4)
memory usage: 331.2+ KB


In [9]:
# Persist the cleaned frame as 'data.csv'; the row index is not written out.
df.to_csv(path_or_buf='data.csv', index=False)

In [10]:
# Empty graph plus the namespace under which every term below is minted
g = Graph()
ex = Namespace("http://example.com/")

# Vocabulary: classes first, then the properties that link a question node
# to its answer, topic name and topic words
Question, Answer, Topic, Subtopic = ex.Question, ex.Answer, ex.Topic, ex.Subtopic
hasAnswer, hasTopic, hasSubtopic = ex.hasAnswer, ex.hasTopic, ex.hasSubtopic

# Load data from the cleaned CSV file and add one question node per row.
# encoding='utf-8' matches the default encoding DataFrame.to_csv writes with,
# instead of relying on the platform default; newline='' is what the csv
# module requires so that line breaks inside quoted fields are parsed correctly.
with open('data.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Extract data from CSV columns
        question = row['Question']
        answer = row['Answer']
        topic_name = row['topic_name']
        topic_words = row['topic_words']

        # Remove punctuation from the question (word characters and
        # whitespace are kept)
        plain_text = re.sub(r'[^\w\s]', '', question)

        # Build the question URI from the stripped text. Every whitespace
        # character (not only ' ') becomes '_', so tabs or newlines in a
        # question cannot end up inside the URI; space-only questions yield
        # exactly the same URI as before.
        # NOTE(review): the URI depends only on the question text, so rows whose
        # questions are identical after punctuation stripping share one node and
        # their answers/topics accumulate on it -- confirm this is intended.
        question_uri = ex[re.sub(r'\s', '_', plain_text)]

        # Add triples to the graph
        g.add((question_uri, RDF.type, Question))
        g.add((question_uri, RDFS.label, Literal(question)))
        g.add((question_uri, hasAnswer, Literal(answer)))
        g.add((question_uri, hasTopic, Literal(topic_name)))
        g.add((question_uri, hasSubtopic, Literal(topic_words)))

# Serialize the RDF graph to Turtle and N-Triples format
# NOTE(review): the text-mode writes below rely on serialize() returning str
# when no destination is given (rdflib >= 6 behaviour) -- confirm the
# installed version.
turtle_data = g.serialize(format='turtle')
nt_data = g.serialize(format='nt')

# Save the outputs to output.nt and output.ttl.
# The encoding is set explicitly so non-ASCII characters in the literals are
# written as UTF-8 regardless of the platform's default encoding.
with open('output.nt', 'w', encoding='utf-8') as nt_file, \
        open('output.ttl', 'w', encoding='utf-8') as ttl_file:
    nt_file.write(nt_data)
    ttl_file.write(turtle_data)