### Libraries

In [3]:
!pip install rdflib

Collecting rdflib
  Using cached rdflib-6.3.2-py3-none-any.whl (528 kB)
Collecting pyparsing<4,>=2.1.0
  Using cached pyparsing-3.1.1-py3-none-any.whl (103 kB)
Collecting importlib-metadata<5.0.0,>=4.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting isodate<0.7.0,>=0.6.0
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Installing collected packages: pyparsing, isodate, importlib-metadata, rdflib
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 6.7.0
    Uninstalling importlib-metadata-6.7.0:
      Successfully uninstalled importlib-metadata-6.7.0
Successfully installed importlib-metadata-4.13.0 isodate-0.6.1 pyparsing-3.1.1 rdflib-6.3.2


In [4]:
import pandas as pd
import csv
import re
import rdflib
from rdflib import Graph, Namespace, Literal, RDF, RDFS

#### Generate the requirements file

In [6]:
# Record the versions of the libraries imported above so the environment can be
# recreated. `writelines` expects an iterable of lines; passing it a bare string
# only works because a string iterates character by character.
with open('requirements.txt', 'w') as file:
    file.writelines([
        f'pandas=={pd.__version__}\n',
        f'rdflib=={rdflib.__version__}\n',
    ])

### Data Loading and pre-processing

In [2]:
# Load the data
# A forward slash is used as the separator: the previous 'data\8606...' literal
# relied on the invalid escape sequence '\8' and only resolved on Windows.
df = pd.read_csv('data/8606-subtopics_Clean.csv')
df.head(10)

Unnamed: 0,Question,Answer,topic_name,topic_words,Unnamed: 4
0,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
1,How was the fertilization effect on phytoplank...,The fertilization effect on phytoplankton was ...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
2,How do rising temperatures affect Alpine lakes?,Rising temperatures increase mineral weatherin...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
3,How has an increase in phytoplankton biomass b...,Significant increase in phytoplankton biomass ...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
4,How do higher metabolic rates of organisms and...,Higher metabolic rates of organisms and longer...,Climatological Vegetation,"vegetation, plants, climatological, planting, ...",
5,How does atmospheric deposition represent a ke...,Atmospheric nitrogen deposition in many mounta...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
6,What has been shown to be the outcome of atmos...,Atmospheric deposition has been shown to alter...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
7,What is the natural environment of Tianchi Lake?,Tianchi Lake is an alpine lake located in a na...,Bio-Fishery Ecosys.,"fisheries, fishing, zooplankton, bioclimatic, ...",
8,What is the purpose of this study?,The purpose of this study is to determine the ...,Researching Ecology,"researching, research, researched, researches,...",
9,What kind of data is being used in the study?,This study is using a dataset of atmospheric c...,Data Analysis Methods,"dataset, data, datasets, datastream, statistic...",


In [3]:
# Drop the unnamed column and nan values
# errors='ignore' keeps the cell re-runnable: once 'Unnamed: 4' has been
# removed, running the cell a second time would otherwise raise a KeyError.
df = df.drop(columns='Unnamed: 4', errors='ignore').dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8479 entries, 0 to 8478
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question     8479 non-null   object
 1   Answer       8479 non-null   object
 2   topic_name   8479 non-null   object
 3   topic_words  8479 non-null   object
dtypes: object(4)
memory usage: 331.2+ KB


In [11]:
# Save the clean file
# Written with a forward slash so the path matches the later
# pd.read_csv('data/data.csv') calls and does not depend on the invalid
# escape sequence '\d' or on Windows path handling.
df.to_csv('data/data.csv', index=False)

### Converting to RDF format

In [10]:
# Create an RDF graph and the namespace all of its terms are minted in
g = Graph()
ex = Namespace("http://example.com/")

# Define RDF classes and properties
Question = ex.Question
Answer = ex.Answer
Topic = ex.Topic
Subtopic = ex.Subtopic
hasAnswer = ex.hasAnswer
hasTopic = ex.hasTopic
hasSubtopic = ex.hasSubtopic

# Load data from the cleaned CSV file. The path matches the one the notebook
# reads elsewhere ('data/data.csv'). newline='' is what the csv module requires
# so that quoted fields containing line breaks are parsed intact, and the
# explicit UTF-8 encoding matches pandas' default for to_csv instead of relying
# on the platform's default encoding.
with open('data/data.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Extract data from CSV columns
        question = row['Question']
        answer = row['Answer']
        topic_name = row['topic_name']
        topic_words = row['topic_words']

        # Remove punctuation from the question (keeps word characters and whitespace)
        plain_text = re.sub(r'[^\w\s]', '', question)

        # Build the URI for the question. Every whitespace character (not just
        # the plain space) becomes an underscore so tabs or line breaks inside a
        # question cannot end up in the URI.
        # NOTE(review): two rows with the same question text map to the same
        # URI, so their triples are merged onto one node — confirm this is intended.
        question_uri = ex[re.sub(r'\s', '_', plain_text)]

        # Add triples to the graph: the question node, its label, and its
        # answer / topic / topic words as literals
        g.add((question_uri, RDF.type, Question))
        g.add((question_uri, RDFS.label, Literal(question)))
        g.add((question_uri, hasAnswer, Literal(answer)))
        g.add((question_uri, hasTopic, Literal(topic_name)))
        g.add((question_uri, hasSubtopic, Literal(topic_words)))

# Serialize the RDF graph to Turtle and N-Triples format
turtle_data = g.serialize(format='turtle')
nt_data = g.serialize(format='nt')

# Save the outputs to output.nt and output.ttl. The encoding is explicit so that
# non-ASCII characters in questions/answers do not fail on platforms whose
# default text encoding is not UTF-8.
with open('output.nt', 'w', encoding='utf-8') as nt_file, \
        open('output.ttl', 'w', encoding='utf-8') as ttl_file:
    nt_file.write(nt_data)
    ttl_file.write(turtle_data)

### Gephi data

In [5]:
# Read in the data
# Reloads the cleaned CSV into `df` (replacing any earlier frame of that name)
# and prints its column summary.
df = pd.read_csv('data/data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8479 entries, 0 to 8478
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question     8479 non-null   object
 1   Answer       8479 non-null   object
 2   topic_name   8479 non-null   object
 3   topic_words  8479 non-null   object
dtypes: object(4)
memory usage: 265.1+ KB


In [16]:
# Show the full, untruncated topic_words string of the first row
df['topic_words'].iloc[0]

'fisheries, fishing, zooplankton, bioclimatic, fish, fishery, fishes, biodiversity, phytoplankton, bioavailability, planktonic, lakes, ecotypes, biosphere, ecosystems, aquaculture, biospheric, anglerfish, bioavailable, snailfish, ecology, lake, ecosystem, biosourced, ecological, zoologii, biotic, climatology, biomass, oceanography, aquatic, bioproject, habitats, biogeography, biogeochemical, biofuels, bioenergy, bioorganic, climatological, biogas, habitat, biophysical, biofuel, freshwater, biobased, biomod, seawater, species, biomethane, ecologization'

In [11]:
# Display the topic_words column (pandas truncates the output for long Series)
df['topic_words']

0       fisheries, fishing, zooplankton, bioclimatic, ...
1       fisheries, fishing, zooplankton, bioclimatic, ...
2       fisheries, fishing, zooplankton, bioclimatic, ...
3       fisheries, fishing, zooplankton, bioclimatic, ...
4       vegetation, plants, climatological, planting, ...
                              ...                        
8474    energyplan, energy, energyplus, model, models,...
8475    energyplan, energy, energyplus, model, models,...
8476    energyplan, energy, energyplus, model, models,...
8477    optimization, optimizations, optimizing, optim...
8478    energyplan, thermodynamics, energy, energyplus...
Name: topic_words, Length: 8479, dtype: object

In [10]:
# Column layout of the edge table; only Id, Source and Target are filled here,
# the remaining columns are left empty.
columns = ['Source', 'Target', 'Id', 'Type', 'Label', 'timeset', 'Weight']

# List to store the rows
rows = []

# Running edge identifier. Deliberately not called `id`: assigning to that name
# at notebook level would shadow the builtin id() for the rest of the session.
edge_id = 1

# Iterate through the df, one record at a time
for topic, words, question, answer in zip(
    df['topic_name'], df['topic_words'], df['Question'], df['Answer']
):
    # One topic -> word edge per entry of the comma-separated topic_words
    for word in words.split(', '):
        rows.append({'Id': edge_id, 'Source': topic, 'Target': word})
        edge_id += 1

    # Add question-answer pair as source-target pair
    rows.append({'Id': edge_id, 'Source': question, 'Target': answer})
    edge_id += 1

# Create the new DataFrame from the list of dictionaries
df_new = pd.DataFrame(data=rows, columns=columns)


In [11]:
# Preview the first rows of the edge table
df_new.head(20)

Unnamed: 0,Source,Target,Id,Type,Label,timeset,Weight
0,Bio-Fishery Ecosys.,fisheries,1,,,,
1,Bio-Fishery Ecosys.,fishing,2,,,,
2,Bio-Fishery Ecosys.,zooplankton,3,,,,
3,Bio-Fishery Ecosys.,bioclimatic,4,,,,
4,Bio-Fishery Ecosys.,fish,5,,,,
5,Bio-Fishery Ecosys.,fishery,6,,,,
6,Bio-Fishery Ecosys.,fishes,7,,,,
7,Bio-Fishery Ecosys.,biodiversity,8,,,,
8,Bio-Fishery Ecosys.,phytoplankton,9,,,,
9,Bio-Fishery Ecosys.,bioavailability,10,,,,


In [5]:
# Count how many rows fall under each topic_name, most frequent first
df['topic_name'].value_counts()

Data Analysis Methods       504
Energy Research             459
Researching Ecology         412
Publications Titles         340
Thermal Efficiencies        300
                           ... 
Airflow Absorption           32
Authorship Contributions     32
Efficiency Optimization      31
Financial Influences         29
Warmest Climates             21
Name: topic_name, Length: 72, dtype: int64

In [8]:
# Column layout of the edge table
columns = ['Source', 'Target', 'Id', 'Type', 'Label', 'timeset', 'Weight']

# Start from an empty frame with that layout and fill in one edge per row of
# `df`: topic -> question, numbered from 1 and marked as undirected.
# Label, timeset and Weight are left unfilled.
df_new = pd.DataFrame(columns=columns).assign(
    Source=df['topic_name'],
    Target=df['Question'],
    Id=range(1, len(df) + 1),
    Type='undirected',
)

# Spot-check ten random edges
df_new.sample(10)

Unnamed: 0,Source,Target,Id,Type,Label,timeset,Weight
4536,Energy Plan Heat,What is a common header pipe used for?,4537,undirected,,,
1986,Optimal Optimization,What was the impact of the optimized models co...,1987,undirected,,,
680,Climate Typology,What linear quadratic model was used in the ap...,681,undirected,,,
6169,Modern Thermal Efficiencies,What was the maximum efficiency for the module?,6170,undirected,,,
1570,Bio-Fishery Ecosys.,What protocols are needed to effectively cope ...,1571,undirected,,,
5899,Energetic Percentages,What is the main question posed to the respond...,5900,undirected,,,
3457,Thermal Modeling,What is the most popular models of Japanese pu...,3458,undirected,,,
5272,Quantifying Coefficients,What is the Taylor's polynomial expression of v?,5273,undirected,,,
6799,Green Energy Synergy,What is the objective behind examining the hed...,6800,undirected,,,
36,Bio-Fishery Ecosys.,What is the main cycle of the air temperature ...,37,undirected,,,


In [9]:
# Write the edge table to gephi_edges.csv without the DataFrame index
df_new.to_csv('gephi_edges.csv', index=False)

### Neo4j data

In [12]:
# Reload the cleaned CSV into `df` (replacing any earlier frame of that name)
# and print its column summary
df = pd.read_csv('data/data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8479 entries, 0 to 8478
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question     8479 non-null   object
 1   Answer       8479 non-null   object
 2   topic_name   8479 non-null   object
 3   topic_words  8479 non-null   object
dtypes: object(4)
memory usage: 265.1+ KB


In [13]:
# Number the rows from 1 to len(df); modifies `df` in place
df['Id'] = range(1, len(df)+1)

In [15]:
# Expand each record of `df` into one dictionary per sub-topic word: the
# comma-separated topic_words string is split on ', ' and the Id, question,
# answer and topic are repeated alongside every resulting word.
rows = [
    {'Id': qid, 'Question': question, 'Answer': answer, 'topic': topic, 'sub-topic': word}
    for qid, question, answer, topic, words in zip(
        df['Id'], df['Question'], df['Answer'], df['topic_name'], df['topic_words']
    )
    for word in words.split(', ')
]
        

In [19]:
# Column order of the flattened table
columns = ['Id', 'Question', 'Answer', 'topic', 'sub-topic']

# Turn the list of row dictionaries into a DataFrame with that column order
df_new = pd.DataFrame(rows, columns=columns)

In [20]:
# Print the column summary of the flattened table
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423950 entries, 0 to 423949
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Id         423950 non-null  int64 
 1   Question   423950 non-null  object
 2   Answer     423950 non-null  object
 3   topic      423950 non-null  object
 4   sub-topic  423950 non-null  object
dtypes: int64(1), object(4)
memory usage: 16.2+ MB


In [21]:
# Preview the first rows of the flattened table
df_new.head(20)

Unnamed: 0,Id,Question,Answer,topic,sub-topic
0,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,fisheries
1,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,fishing
2,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,zooplankton
3,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,bioclimatic
4,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,fish
5,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,fishery
6,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,fishes
7,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,biodiversity
8,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,phytoplankton
9,1,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,Bio-Fishery Ecosys.,bioavailability


In [22]:
# Write the flattened table to neo4j_data.csv without the DataFrame index
df_new.to_csv('neo4j_data.csv', index=False)

In [23]:
# Show the last rows of the flattened table
df_new.tail()

Unnamed: 0,Id,Question,Answer,topic,sub-topic
423945,8479,What is the purpose of the paper by Bogdanov e...,The paper by Bogdanov et al. explores the role...,Energy Research,paper
423946,8479,What is the purpose of the paper by Bogdanov e...,The paper by Bogdanov et al. explores the role...,Energy Research,investigation
423947,8479,What is the purpose of the paper by Bogdanov e...,The paper by Bogdanov et al. explores the role...,Energy Research,efficient
423948,8479,What is the purpose of the paper by Bogdanov e...,The paper by Bogdanov et al. explores the role...,Energy Research,search
423949,8479,What is the purpose of the paper by Bogdanov e...,The paper by Bogdanov et al. explores the role...,Energy Research,heat
