# **Configurations**

In [1]:
# install required packages
!pip install neo4j datasets




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# import necessary libraries
import pandas as pd
from datasets import load_dataset
from neo4j import GraphDatabase
from itertools import combinations
from dotenv import load_dotenv
import re
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Alternative secret source used previously (kept for reference):
# URI = userdata.get("NEO4J_URI")
# USERNAME = userdata.get("NEO4J_USERNAME")
# PASSWORD = userdata.get("NEO4J_PASSWORD")

# Prefer the NEO4J_-prefixed names, matching NEO4J_URI. A bare USERNAME is
# already defined by the operating system on Windows (it holds the login name)
# and load_dotenv() does not override existing variables by default, so a
# USERNAME entry in .env would be silently ignored there. The un-prefixed names
# remain as a fallback so existing .env files keep working.
URI = os.getenv("NEO4J_URI")
USERNAME = os.getenv("NEO4J_USERNAME") or os.getenv("USERNAME")
PASSWORD = os.getenv("NEO4J_PASSWORD") or os.getenv("PASSWORD")

# Fail early with a readable message instead of an obscure driver error.
if not URI:
    raise RuntimeError("NEO4J_URI is not set; define it in the environment or in the .env file")

# Shared driver used by every query cell in this notebook.
neo4j_driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

In [4]:
def execute_query(query, parameters=None):
    """Run a Cypher query on the shared driver and return all records as a list.

    Args:
        query: Cypher statement to execute.
        parameters: Optional mapping of query parameters; an empty mapping is
            sent when it is omitted or empty.

    Returns:
        A list with every record produced by the query, materialized before
        the session is closed.
    """
    bound_parameters = parameters if parameters else {}
    with neo4j_driver.session() as db_session:
        return list(db_session.run(query, bound_parameters))
# Quick connectivity check (uncomment to run):
# test_query = "MATCH (n) RETURN n"
# execute_query(test_query)

# **Load Datasets**

In [5]:
# SAMER readability lexicon (tab-separated, read from the local raw-data folder).
SAMER_df = pd.read_csv(
    'data/raw/SAMER-Readability-Lexicon-v1.tsv',
    sep='\t',
)

# Local development split.
df_dev = pd.read_csv('data/raw/dev.csv')

# BAREC sentence-level training split, read straight from the Hugging Face hub
# and wrapped in a DataFrame as before.
data_set = pd.DataFrame(
    pd.read_csv("hf://datasets/CAMeL-Lab/BAREC-Shared-Task-2025-sent/" + "train.csv")
)

# Data Preprocessing

In [6]:
# Build the cleaned sentence table in one chain:
#   1. drop the columns that are not used when building the graph,
#   2. remove duplicate rows, keeping the first occurrence of each sentence.
data_set_cleaned = (
    data_set
    .drop(columns=[
        "Word_Count",
        "Readability_Level_19",
        "Readability_Level_7",
        "Readability_Level_5",
        "Readability_Level_3",
        "Annotator",
        "Document",
        "Source",
        "Book",
        "Author",
    ])
    .drop_duplicates(subset='Sentence', keep='first')
)

# Persist the cleaned table next to the notebook.
data_set_cleaned.to_csv("cleaned_data_set.csv", index=False)

data_set_cleaned.head()

Unnamed: 0,ID,Sentence,Readability_Level,Domain,Text_Class
0,10100290001,مجلة كل الأولاد وكل البنات,7-zay,Arts & Humanities,Foundational
1,10100290002,ماجد,1-alif,Arts & Humanities,Foundational
2,10100290003,الأربعاء 21 يناير 1987,8-Ha,Arts & Humanities,Foundational
3,10100290004,الموافق 21 جمادى الأول 1407هــ,7-zay,Arts & Humanities,Foundational
4,10100290005,السنة الثامنة,5-ha,Arts & Humanities,Foundational


In [7]:
# Code points stripped by remove_diacritics: U+0617-U+061A, U+064B-U+0652 and
# U+0670. Compiled once here instead of on every call.
_ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652\u0670]')


def remove_diacritics(text):
    """Return ``text`` with Arabic diacritics removed.

    Args:
        text: The string to normalize. Non-string values (for example ``None``
            or a float NaN coming from a missing DataFrame cell) are returned
            unchanged instead of raising ``TypeError``.

    Returns:
        The input without any character in the ranges U+0617-U+061A,
        U+064B-U+0652 or the single code point U+0670.
    """
    if not isinstance(text, str):
        # Missing values pass through so the function is safe in Series.apply.
        return text
    return _ARABIC_DIACRITICS.sub('', text)

In [8]:
# Discard the lexicon columns that are not used when building the graph.
SAMER_df = SAMER_df.drop(
    columns=[
        'Hindawi (5594310)',
        'Giga (5594256)',
        'Answer1 - Egyptian',
        'Answer2 - Syrian',
        'Answer3 - Saudi Arabian',
    ]
)

# 'lemma#pos' holds "<lemma>#<pos>"; expose both halves as their own columns.
SAMER_df[['lemma', 'pos']] = SAMER_df['lemma#pos'].str.split('#', expand=True)

# Finish the cleanup in one chain:
#   - the combined 'lemma#pos' column is no longer needed,
#   - lemmas are stored without diacritics,
#   - only the first row is kept for each (undiacritized) lemma.
SAMER_df = (
    SAMER_df
    .drop(columns=['lemma#pos'])
    .assign(lemma=lambda frame: frame['lemma'].apply(remove_diacritics))
    .drop_duplicates(subset='lemma', keep='first')
)

# Persist the cleaned lexicon next to the notebook.
SAMER_df.to_csv("cleaned_SAMER_df.csv", index=False)

SAMER_df.head()

Unnamed: 0,Occurrences,Gloss,readability (rounded average),lemma,pos
0,335409,in,1,في,prep
1,270096,from,1,من,prep
2,181283,that,2,أن,conj_sub
3,178560,on;above#on_+_what/which,1,على,prep
4,157818,to;towards,1,إلى,prep


# Graph Building

In [None]:
# Generating Lemmas
#
# Creates one :Lemma node per lexicon row. Rows are sent to the database in
# batches through UNWIND instead of one query (and one session) per lemma;
# the resulting nodes and properties are the same.

# Number of lemma rows sent per query.
LEMMA_BATCH_SIZE = 1000

lemma_query = """
UNWIND $rows AS row
MERGE (l:Lemma {lemma: row.lemma})
ON CREATE SET l.pos = row.pos, l.avg_readability = row.avg_readability, l.freq = row.freq
"""

# Iterate the columns directly rather than calling SAMER_df.iloc[i] four times
# per row, which builds a temporary Series on every lookup.
lemma_rows = [
    {"lemma": lemma, "pos": pos, "avg_readability": avg_readability, "freq": freq}
    for lemma, pos, avg_readability, freq in zip(
        SAMER_df["lemma"],
        SAMER_df["pos"],
        SAMER_df["readability (rounded average)"],
        SAMER_df["Occurrences"],
    )
]

for batch_start in range(0, len(lemma_rows), LEMMA_BATCH_SIZE):
    lemma_params = {"rows": lemma_rows[batch_start:batch_start + LEMMA_BATCH_SIZE]}
    execute_query(lemma_query, lemma_params)

In [None]:
# Generating Sentences
#
# Creates one :Sentence node per cleaned sentence, links it to its :Domain and
# :Class nodes, and adds a HAS_LEMMA edge for every sentence word that is found
# verbatim in the lexicon. The lemma pairs co-occurring in each sentence are
# collected in `pairs_list` for the next cell.

# Lexicon lemmas (already stripped of diacritics) for O(1) membership tests.
lemma_set = set(SAMER_df['lemma'].astype(str))

# The query does not change between sentences, so it is defined once.
# NOTE(review): r.count is incremented on every match, so re-running this cell
# against a populated database inflates the HAS_LEMMA counts.
sentence_query = """
    MERGE (S:Sentence {id: $id})
    ON CREATE SET S.text = $text

    WITH S
    MERGE (D:Domain {type: $domainType})
    MERGE (C:Class {type: $classType})
    MERGE (S)-[:IN_DOMAIN]->(D)
    MERGE (S)-[:IN_CLASS]->(C)

    WITH S
    UNWIND $lemmas AS lemma
    MATCH (L:Lemma {lemma: lemma})
    MERGE (S)-[r:HAS_LEMMA]->(L)
    ON CREATE SET r.count = 1
    ON MATCH SET r.count = r.count + 1
"""

# to store pairs of lemmas
pairs_list = []

sentence_rows = zip(
    data_set_cleaned["Sentence"],
    data_set_cleaned["Domain"],
    data_set_cleaned["Text_Class"],
)
# Sentence ids are the 1-based row positions, as before.
for sentence_id, (sentence, domain_type, class_type) in enumerate(sentence_rows, start=1):
    # The lexicon lemmas carry no diacritics, and `\w` does not match Arabic
    # combining marks, so a diacritized word would be split into fragments and
    # never match its lemma. Normalize the text before tokenizing; the stored
    # sentence text itself is left untouched.
    words = re.findall(r'\b[\w]+\b', remove_diacritics(sentence))

    # Keep repeated words: each repetition bumps the HAS_LEMMA count.
    sentence_to_lemma = [word for word in words if word in lemma_set]

    # Every unordered pair of distinct lemmas seen in this sentence.
    pairs = [list(pair) for pair in combinations(set(sentence_to_lemma), 2)]
    pairs_list.extend(pairs)

    sentence_params = {
        "id": sentence_id,
        "text": sentence,
        "domainType": domain_type,
        "classType": class_type,
        "lemmas": sentence_to_lemma,
    }

    execute_query(sentence_query, sentence_params)

In [None]:
# Generating Pairs of Lemmas
#
# Adds OCCUR_WITH edges in both directions between lemmas that co-occur in a
# sentence; `count` is the number of collected pair occurrences. Occurrences
# are summed locally first and written in bounded batches, instead of sending
# every single occurrence to the server in one transaction. The final counts
# are the same.

# Number of distinct lemma pairs sent per query.
PAIR_BATCH_SIZE = 10000

# Sum occurrences per unordered pair; the key is ordered so that [a, b] and
# [b, a] collected from different sentences land in the same bucket.
pair_counts = {}
for first, second in pairs_list:
    pair_key = (first, second) if first <= second else (second, first)
    pair_counts[pair_key] = pair_counts.get(pair_key, 0) + 1

pair_rows = [
    {"first": first, "second": second, "weight": weight}
    for (first, second), weight in pair_counts.items()
]

lemmas_pairs_query = """
    UNWIND $pairs AS pair
    MATCH (l1:Lemma {lemma: pair.first})
    MATCH (l2:Lemma {lemma: pair.second})

    MERGE (l1)-[r1:OCCUR_WITH]->(l2)
    ON CREATE SET r1.count = pair.weight
    ON MATCH SET r1.count = r1.count + pair.weight

    MERGE (l2)-[r2:OCCUR_WITH]->(l1)
    ON CREATE SET r2.count = pair.weight
    ON MATCH SET r2.count = r2.count + pair.weight"""

for batch_start in range(0, len(pair_rows), PAIR_BATCH_SIZE):
    lemmas_pairs_params = {"pairs": pair_rows[batch_start:batch_start + PAIR_BATCH_SIZE]}
    execute_query(lemmas_pairs_query, lemmas_pairs_params)

# Retrieving Nodes

In [None]:
# Export the text of every :Sentence node to data/json/sentences.json.
sentence_query = """MATCH (s:Sentence) RETURN s AS Sentence"""
sentence_records = execute_query(sentence_query)

# One {"sentence_text": ...} entry per returned node.
sentence_nodes = [
    {"sentence_text": record["Sentence"]["text"]}
    for record in sentence_records
]

# ensure_ascii=False keeps the Arabic text readable in the output file.
with open("data/json/sentences.json", "w", encoding="utf-8") as f:
    json.dump(sentence_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every :Lemma node with its properties to data/json/lemmas.json.
lemmas_query = """MATCH (l:Lemma) RETURN l AS Lemma"""
lemma_records = execute_query(lemmas_query)

# Report how many records came back instead of printing all of them, which
# floods the cell output with one line per lemma.
print(f"Retrieved {len(lemma_records)} lemma records")

lemma_nodes = []
for record in lemma_records:
    lemma_node = record["Lemma"]
    lemma_nodes.append({
        "lemma": lemma_node["lemma"],
        "pos": lemma_node["pos"],
        "avg_readability": lemma_node["avg_readability"],
        "freq": lemma_node["freq"],
    })

# ensure_ascii=False keeps the Arabic text readable in the output file.
with open("data/json/lemmas.json", "w", encoding="utf-8") as f:
    json.dump(lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every (Sentence)-[HAS_LEMMA]->(Lemma) edge to data/json/sentence_lemma.json.
sentence_lemma_query = """MATCH (s:Sentence)-[r:HAS_LEMMA]->(l:Lemma) RETURN s AS sentence, r AS relation, l AS lemma"""
sentence_lemma_records = execute_query(sentence_lemma_query)

# Flatten each record into the sentence text, the relationship type and the lemma.
sentence_lemma_nodes = [
    {
        "sentence_text": row["sentence"]["text"],
        "relation": row["relation"].type,
        "lemma": row["lemma"]["lemma"],
    }
    for row in sentence_lemma_records
]

with open("data/json/sentence_lemma.json", "w", encoding="utf-8") as f:
    json.dump(sentence_lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export the OCCUR_WITH edges to data/json/lemma_lemma.json. The WHERE clause
# keeps only edges whose source lemma sorts before the target lemma.
lemma_lemma_query = """MATCH (l1:Lemma)-[r:OCCUR_WITH]->(l2:Lemma)
WHERE l1.lemma < l2.lemma 
RETURN l1 AS lemma1, r AS relation, l2 AS lemma2"""
lemma_lemma_records = execute_query(lemma_lemma_query)

# Flatten each record into both lemma strings, the relationship type and its count.
lemma_lemma_nodes = [
    {
        "lemma1": row["lemma1"]["lemma"],
        "relation": row["relation"].type,
        "count": row["relation"]["count"],
        "lemma2": row["lemma2"]["lemma"],
    }
    for row in lemma_lemma_records
]

with open("data/json/lemma_lemma.json", "w", encoding="utf-8") as f:
    json.dump(lemma_lemma_nodes, f, ensure_ascii=False, indent=2)

In [None]:
# Export every (Sentence)-[IN_CLASS]->(Class) edge to data/json/sentence_class.json.
sentence_class_query = """MATCH (s:Sentence)-[r:IN_CLASS]->(c:Class) RETURN s AS sentence, r AS relation, c AS class"""
sentence_class_records = execute_query(sentence_class_query)

# Flatten each record into the sentence text, the relationship type and the class type.
sentence_class_nodes = [
    {
        "sentence_text": row["sentence"]["text"],
        "relation": row["relation"].type,
        "class_type": row["class"]["type"],
    }
    for row in sentence_class_records
]

with open("data/json/sentence_class.json", "w", encoding="utf-8") as f:
    json.dump(sentence_class_nodes, f, ensure_ascii=False, indent=2)