# **Configurations**

In [None]:
# install required packages
!pip install neo4j datasets dotenv pandas

In [None]:
# import necessary libraries
import pandas as pd
from datasets import load_dataset
from neo4j import GraphDatabase
from itertools import combinations
from dotenv import load_dotenv
import re
import os
import json

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
# Load CAMeL Tools' built-in morphology database; it is passed to Analyzer(db) further down.
db = MorphologyDB.builtin_db()


In [None]:
# Read the Neo4j connection settings from the environment (.env is loaded first).
load_dotenv()
URI = os.getenv("NEO4J_URI")
# Prefer the NEO4J_-prefixed names: a bare USERNAME is already set by the OS on Windows,
# and load_dotenv() does not override existing variables by default, so the value from
# .env would be silently ignored. The un-prefixed names stay as a fallback so existing
# .env files keep working.
USERNAME = os.getenv("NEO4J_USERNAME") or os.getenv("USERNAME")
PASSWORD = os.getenv("NEO4J_PASSWORD") or os.getenv("PASSWORD")

# Fail early with a readable message instead of an opaque driver error.
if not URI:
    raise RuntimeError("NEO4J_URI is not set; define it in the environment or in .env")

neo4j_driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

In [None]:
def execute_query(query, parameters=None):
    """Run a Cypher query on the shared Neo4j driver and return all of its records.

    Args:
        query: Cypher text to execute.
        parameters: Optional mapping of query parameters; a falsy value is sent as ``{}``.

    Returns:
        A list with every record of the result, collected before the session is closed.
    """
    bound_parameters = parameters if parameters else {}
    with neo4j_driver.session() as session:
        records = list(session.run(query, bound_parameters))
    return records
# Connectivity check: count the nodes instead of returning (and rendering) the whole graph.
test_query = "MATCH (n) RETURN count(n) AS node_count"
execute_query(test_query)

# **Load Datasets**

In [None]:
# SAMER readability lexicon (tab-separated). It has to be loaded here: the preprocessing
# and graph-building cells below read SAMER_df and fail with NameError on a fresh kernel otherwise.
SAMER_df = pd.read_csv('data/raw/SAMER-Readability-Lexicon-v1.tsv', sep='\t')

# Alternative inputs, currently unused:
# df_dev = pd.read_csv('data/raw/dev.csv')
# data_set = pd.read_csv("hf://datasets/CAMeL-Lab/BAREC-Shared-Task-2025-sent/" + "train.csv")

# Test sentences; the Sentence nodes of the graph are built from this frame.
test_sent = pd.read_csv("data/raw/test_sent.csv")

# Data Preprocessing

In [None]:
# (Unused) equivalent clean-up for the training set, kept for reference:
# data_set_cleaned = data_set.drop(columns=["Word_Count", "Readability_Level_19", "Readability_Level_7", "Readability_Level_5", "Readability_Level_3", "Annotator", "Document", "Source", "Book", "Author"])
# data_set_cleaned = data_set_cleaned.drop_duplicates(subset='Sentence', keep='first')  # remove duplicate rows
# data_set_cleaned.to_csv("cleaned_data_set.csv", index=False)
# data_set_cleaned.head()

# Drop the metadata columns the graph does not use. errors="ignore" keeps the cell
# re-runnable: on a second run the columns are already gone and a plain drop()
# would raise KeyError.
test_sent = test_sent.drop(
    columns=["Word_Count", "Annotator", "Document", "Source", "Book", "Author", "ID"],
    errors="ignore",
)

# Discard rows whose sentence is the literal "#NAME?" placeholder.
test_sent = test_sent[test_sent["Sentence"] != "#NAME?"]
# test_sent.to_csv("cleaned_test_set.csv", index=False)

test_sent.head()

In [None]:
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritic marks removed.

    The characters removed are U+0617-U+061A, U+064B-U+0652 and U+0670;
    every other character is left untouched.
    """
    diacritic_pattern = re.compile(r'[\u0617-\u061A\u064B-\u0652\u0670]')
    return diacritic_pattern.sub('', text)

In [None]:
# Drop the lexicon columns that are not used in the graph
SAMER_df = SAMER_df.drop(
    columns=[
        'Hindawi (5594310)',
        'Giga (5594256)',
        'Answer1 - Egyptian',
        'Answer2 - Syrian',
        'Answer3 - Saudi Arabian',
    ]
)

# 'lemma#pos' is split on '#' into two new columns, then the combined column is dropped
SAMER_df[['lemma', 'pos']] = SAMER_df['lemma#pos'].str.split('#', expand=True)
SAMER_df = SAMER_df.drop(columns=['lemma#pos'])

# Strip diacritics from each lemma, then keep only the first row per resulting lemma
SAMER_df['lemma'] = SAMER_df['lemma'].map(remove_diacritics)
SAMER_df = SAMER_df.drop_duplicates(subset='lemma', keep='first')

# Persist the cleaned lexicon next to the notebook
SAMER_df.to_csv("cleaned_SAMER_df.csv", index=False)

SAMER_df.head()

# Graph Building

In [None]:
# Generating Lemmas
# One Lemma node is merged per row of the cleaned SAMER lexicon. Rows are sent in
# batches through UNWIND, so the driver opens one session per batch instead of one
# session (and one query) per lemma.
LEMMA_BATCH_SIZE = 1000

lemma_rows = (
    SAMER_df[["lemma", "pos", "readability (rounded average)", "Occurrences"]]
    .rename(columns={"readability (rounded average)": "avg_readability", "Occurrences": "freq"})
    .to_dict("records")
)

# Same MERGE / ON CREATE SET as a per-row query, applied to every element of $rows.
lemma_query = """
UNWIND $rows AS row
MERGE (l:Lemma {lemma: row.lemma})
ON CREATE SET l.pos = row.pos, l.avg_readability = row.avg_readability, l.freq = row.freq
"""

for start in range(0, len(lemma_rows), LEMMA_BATCH_SIZE):
    execute_query(lemma_query, {"rows": lemma_rows[start:start + LEMMA_BATCH_SIZE]})

In [None]:
# Generating Sentences
# For every row of test_sent: strip diacritics, tokenise, then MERGE the Sentence node,
# its Domain and Class nodes, and a HAS_LEMMA edge to each token that matches a Lemma node.

# lemma_set = set(SAMER_df['lemma'].astype(str))

# to store pairs of lemmas (consumed by the commented-out OCCUR_WITH cell)
# pairs_list = []

# The Cypher text is the same for every row, so it is built once before the loop.
sentence_query = """
		MERGE (S:Sentence {id: $id})
		ON CREATE SET S.text = $text

		WITH S
		MERGE (D:Domain {type: $domainType})
		MERGE (C:Class {type: $classType})
		MERGE (S)-[:IN_DOMAIN]->(D)
		MERGE (S)-[:IN_CLASS]->(C)

		WITH S
		UNWIND $lemmas AS lemma
		MATCH (L:Lemma {lemma: lemma})
		MERGE (S)-[r:HAS_LEMMA]->(L)
		ON CREATE SET r.count = 1
		ON MATCH SET r.count = r.count + 1
	"""

for i in range(len(test_sent)):
    row = test_sent.iloc[i]
    domain_type = row["Domain"]
    class_type = row["Text_Class"]

    sentence = remove_diacritics(row["Sentence"])

    # Keep word tokens longer than one character that are not purely digits
    sentence_to_lemma = [
        word
        for word in re.findall(r'\b[\w]+\b', sentence)
        if len(word) > 1 and not word.isdigit()
    ]
    t_sentence = " ".join(sentence_to_lemma)

    # pairs = [list(pair) for pair in combinations(set(sentence_to_lemma), 2)]
    # pairs_list.extend(pairs)

    # Sentence ids are 1-based positions in the filtered frame
    sentence_params = {
        "id": i + 1,
        "text": t_sentence,
        "domainType": domain_type,
        "classType": class_type,
        "lemmas": sentence_to_lemma,
    }

    execute_query(sentence_query, sentence_params)

In [None]:
# # Generating Pairs of Lemmas
# lemmas_pairs_query = """
# 		UNWIND $pairs AS pair
# 		MATCH (l1:Lemma {lemma: pair[0]})
# 		MATCH (l2:Lemma {lemma: pair[1]})

# 		MERGE (l1)-[r1:OCCUR_WITH]->(l2)
# 		ON CREATE SET r1.count = 1
# 		ON MATCH SET r1.count = r1.count + 1

# 		MERGE (l2)-[r2:OCCUR_WITH]->(l1)
# 		ON CREATE SET r2.count = 1
# 		ON MATCH SET r2.count = r2.count + 1"""

# lemmas_pairs_params = {"pairs": pairs_list}
# execute_query(lemmas_pairs_query, lemmas_pairs_params)

# Retrieving Nodes

In [None]:
# Export the text of every Sentence node to data/json/sentences.json.
sentence_query = """MATCH (s:Sentence) RETURN s AS Sentence"""
sentence_records = execute_query(sentence_query)

sentence_nodes = [
    {"sentence_text": record["Sentence"]["text"]}
    for record in sentence_records
]

# open() does not create missing directories, so make sure the target folder exists.
os.makedirs("data/json", exist_ok=True)
with open("data/json/sentences.json", "w", encoding="utf-8") as f:
    json.dump(sentence_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every Lemma node (lemma, pos, avg_readability, freq) to data/json/lemmas.json.
lemmas_query = """MATCH (l:Lemma) RETURN l AS Lemma"""
lemma_records = execute_query(lemmas_query)

# Report only the size of the result; printing the records themselves floods the output.
print(f"Retrieved {len(lemma_records)} Lemma nodes")

lemma_nodes = []
for record in lemma_records:
    lemma_node = record['Lemma']
    lemma_nodes.append({
        "lemma": lemma_node["lemma"],
        "pos": lemma_node["pos"],
        "avg_readability": lemma_node["avg_readability"],
        "freq": lemma_node["freq"],
    })

# open() does not create missing directories, so make sure the target folder exists.
os.makedirs("data/json", exist_ok=True)
with open("data/json/lemmas.json", "w", encoding="utf-8") as f:
    json.dump(lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:

# Export every (Sentence)-[:HAS_LEMMA]->(Lemma) edge to sentence_lemma.json, replacing the
# stored lemma text with the 'lex' of its first morphological analysis when one exists.

# Build the analyzer here: on a fresh kernel this cell runs before any other cell
# assigns `analyzer`, which would otherwise raise NameError below.
analyzer = Analyzer(db)

sentence_lemma_query = """MATCH (s:Sentence)-[r:HAS_LEMMA]->(l:Lemma) RETURN s AS sentence, r AS relation, l AS lemma"""
sentence_lemma_records = execute_query(sentence_lemma_query)

sentence_lemma_nodes = []
for record in sentence_lemma_records:
    sentence_part = record["sentence"]
    relation_part = record["relation"]
    lemma_part = record["lemma"]["lemma"]

    # Analyze once per record; fall back to the stored form when there is no analysis.
    analyses = analyzer.analyze(lemma_part)
    lemma = analyses[0]['lex'] if analyses else lemma_part

    lemma = remove_diacritics(lemma)

    sentence_lemma_nodes.append({
        "sentence_text": sentence_part["text"],
        "relation": relation_part.type,
        "lemma": lemma,
    })

with open("sentence_lemma.json", "w", encoding="utf-8") as f:
    json.dump(sentence_lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export the OCCUR_WITH edges between Lemma nodes to data/json/lemma_lemma.json.
# The WHERE clause keeps a single direction per pair (l1.lemma < l2.lemma).
lemma_lemma_query = """MATCH (l1:Lemma)-[r:OCCUR_WITH]->(l2:Lemma)
WHERE l1.lemma < l2.lemma 
RETURN l1 AS lemma1, r AS relation, l2 AS lemma2"""
lemma_lemma_records = execute_query(lemma_lemma_query)

lemma_lemma_nodes = [
    {
        "lemma1": record["lemma1"]["lemma"],
        "relation": record["relation"].type,
        "count": record["relation"]["count"],
        "lemma2": record["lemma2"]["lemma"],
    }
    for record in lemma_lemma_records
]

with open("data/json/lemma_lemma.json", "w", encoding="utf-8") as f:
    json.dump(lemma_lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every (Sentence)-[:IN_CLASS]->(Class) edge to sentence_class.json.
# The query has to be defined in this cell; without it the next line raises NameError.
# The aliases match the record keys read in the loop below; `class` is
# backtick-quoted so it is always parsed as a plain identifier.
sentence_class_query = """MATCH (s:Sentence)-[r:IN_CLASS]->(c:Class) RETURN s AS sentence, r AS relation, c AS `class`"""
sentence_class_records = execute_query(sentence_class_query)

sentence_class_nodes = []
for record in sentence_class_records:
    sentence_part = record["sentence"]
    relation_part = record["relation"]
    class_part = record["class"]
    sentence_class_nodes.append({
        "sentence_text": sentence_part["text"],
        "relation": relation_part.type,
        "class_type": class_part["type"],
    })

with open("sentence_class.json", "w", encoding="utf-8") as f:
    json.dump(sentence_class_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every (Sentence)-[:IN_DOMAIN]->(Domain) edge to sentence_domain.json.
sentence_domain_query = """MATCH (s:Sentence)-[r:IN_DOMAIN]->(d:Domain) RETURN s AS sentence, r AS relation, d AS domain"""
sentence_domain_records = execute_query(sentence_domain_query)

sentence_domain_nodes = [
    {
        "sentence_text": record["sentence"]["text"],
        "relation": record["relation"].type,
        "domain_type": record["domain"]["type"],
    }
    for record in sentence_domain_records
]

with open("sentence_domain.json", "w", encoding="utf-8") as f:
    json.dump(sentence_domain_nodes, f, ensure_ascii=False, indent=2)

In [None]:
import csv
import json

# Build the sentence -> domain records straight from the raw CSV, one record per row.
# NOTE(review): this writes to the same "sentence_domain.json" path as the Neo4j-based
# export cell earlier in the notebook, so it replaces that file — confirm this is intended.
with open("data/raw/test_sent.csv", newline='', encoding="utf-8") as csvfile:
    sentence_domain_nodes = [
        {
            "sentence_text": row["Sentence"],
            "relation": "IN_DOMAIN",
            "domain_type": row["Domain"],
        }
        for row in csv.DictReader(csvfile)
    ]

# Save to JSON
with open("sentence_domain.json", "w", encoding="utf-8") as f:
    json.dump(sentence_domain_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Build the CAMeL Tools morphological analyzer on top of the database held in `db`.
analyzer = Analyzer(db)

# Test analysis: analyze one sample word and print the 'lex' field of its first analysis.
# NOTE(review): analyses[0] raises IndexError if analyze() returns an empty result.
analyses = analyzer.analyze('الطلاب')
lemma = analyses[0]['lex']
print(lemma)

In [None]:
# Attach the domain of each entry of test_doc.json (looked up by ID in test_doc),
# normalise its sentence text and write the result to sentence_domain_doc.json.
# NOTE(review): `test_doc` (a DataFrame with "ID" and "Domain" columns) is not defined in
# the visible part of this notebook — it must be loaded before this cell runs.
with open("data/json/test_doc.json", "r", encoding="utf-8") as f:
    test_doc_data = json.load(f)

updated_items = []
for item in test_doc_data:
    # Named doc_id, not id: a top-level `id = ...` in a notebook replaces the builtin
    # id() for the rest of the kernel session.
    doc_id = item.get("ID")

    matching_row = test_doc[test_doc['ID'] == doc_id]
    if matching_row.empty:
        # Explicit message instead of the IndexError that .iloc[0] gives on an empty frame
        raise KeyError(f"ID {doc_id!r} from test_doc.json has no matching row in test_doc")
    domain = matching_row["Domain"].iloc[0]

    # Same normalisation as for the graph sentences: no diacritics, word tokens only,
    # dropping pure digits and single-character tokens.
    sentence = remove_diacritics(item.get("Sentence"))
    words = [
        word
        for word in re.findall(r'\b[\w]+\b', sentence)
        if not word.isdigit() and len(word) > 1
    ]
    t_sentence = " ".join(words)

    item['domain_type'] = domain
    item['Sentence'] = t_sentence
    item["relation"] = "IN_DOMAIN"

    updated_items.append(item)

# Save the updated data to a new JSON file
with open("sentence_domain_doc.json", "w", encoding="utf-8") as f:
    json.dump(updated_items, f, ensure_ascii=False, indent=2)


In [None]:
# Attach the text class of each entry of test_doc.json (looked up by ID in test_doc),
# normalise its sentence text and write the result to sentence_class_doc.json.
# NOTE(review): `test_doc` (a DataFrame with "ID" and "Text_Class" columns) is not defined
# in the visible part of this notebook — it must be loaded before this cell runs.
with open("data/json/test_doc.json", "r", encoding="utf-8") as f:
    test_doc_data = json.load(f)

updated_items = []
for item in test_doc_data:
    # Named doc_id, not id: a top-level `id = ...` in a notebook replaces the builtin
    # id() for the rest of the kernel session.
    doc_id = item.get("ID")

    matching_row = test_doc[test_doc['ID'] == doc_id]
    if matching_row.empty:
        # Explicit message instead of the IndexError that .iloc[0] gives on an empty frame
        raise KeyError(f"ID {doc_id!r} from test_doc.json has no matching row in test_doc")
    text_class = matching_row["Text_Class"].iloc[0]

    # Same normalisation as for the graph sentences: no diacritics, word tokens only,
    # dropping pure digits and single-character tokens.
    sentence = remove_diacritics(item.get("Sentence"))
    words = [
        word
        for word in re.findall(r'\b[\w]+\b', sentence)
        if not word.isdigit() and len(word) > 1
    ]
    t_sentence = " ".join(words)

    item['class_type'] = text_class
    item['Sentence'] = t_sentence
    item["relation"] = "IN_CLASS"

    updated_items.append(item)

# Save the updated data to a new JSON file
with open("sentence_class_doc.json", "w", encoding="utf-8") as f:
    json.dump(updated_items, f, ensure_ascii=False, indent=2)


In [None]:
# Give every sentence/lemma pair the ID of the document entry with the same sentence
# text and write the result to sentence_lemma_doc.json.
# NOTE(review): this reads data/json/sentence_lemma.json, while the export cell earlier in
# the notebook writes sentence_lemma.json to the working directory — confirm which copy is meant.
with open("data/json/sentence_lemma.json", "r", encoding="utf-8") as f:
    sentence_lemma_data = json.load(f)

with open("sentence_class_doc.json", "r", encoding="utf-8") as f:
    sentnece_class_doc_data = json.load(f)

# Index the document entries by sentence text once (the first entry wins on duplicates)
# instead of scanning the whole list for every sentence/lemma pair.
doc_by_sentence = {}
for doc in sentnece_class_doc_data:
    doc_by_sentence.setdefault(doc.get("Sentence"), doc)

updated_items = []
for item in sentence_lemma_data:
    sentence = item.get("sentence_text")  # sentence text as stored in sentence_lemma.json

    result = doc_by_sentence.get(sentence)
    if result is None:
        # Explicit message instead of "'NoneType' object is not subscriptable"
        raise KeyError(f"no entry in sentence_class_doc.json has Sentence == {sentence!r}")

    updated_items.append({
        "relation": "HAS_LEMMA",
        "ID": result["ID"],
        "Sentence": sentence,
        "lemma": remove_diacritics(item["lemma"]),
    })

# Save the updated data to a new JSON file
with open("sentence_lemma_doc.json", "w", encoding="utf-8") as f:
    json.dump(updated_items, f, ensure_ascii=False, indent=2)
