In [1]:
!pip install kaggle transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Colle

In [None]:
# Mount Google Drive so that the files under /content/drive/MyDrive (Kaggle
# token, project folder) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !mkdir /content/drive/MyDrive/ontology
# !cp /content/drive/MyDrive/ontology/kaggle/kaggle.json ~/.kaggle

In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/ontology/kaggle/kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Work from the project folder on Drive; relative paths used later in the
# notebook (e.g. arx/...) are resolved against it.
%cd /content/drive/MyDrive/ontology

In [None]:
# !mkdir arx

In [None]:
# !unzip arxiv.zip -d arx

In [None]:
# !kaggle datasets download -d Cornell-University/arxiv

# Data analysis and preparation

In [None]:
import pandas as pd
import json
import logging
import dask.bag as db
from typing import Generator, List, Tuple, Optional, Any
import spacy
import nltk
import itertools
from nltk.corpus import stopwords
import pandas as pd
import networkx as nx
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import torch
from nltk import FreqDist

# Only let records of level ERROR or above through the "spacy" logger.
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

In [None]:
# docs = db.read_text('arx/arxiv-metadata-oai-snapshot.json').map(json.loads)


In [None]:
# docs.count().compute()

In [None]:
# Location of the arXiv metadata snapshot, relative to the working directory.
PATH_ = "arx/arxiv-metadata-oai-snapshot.json"
# Number of papers at which create_dataframe stops reading records.
NUM_PAPERS = 5000

def get_dataset_generator(path: str) -> Generator[Any, None, None]:
    """Lazily yield one parsed JSON value per line of a JSON-lines file.

    Args:
        path: Path of a text file holding one JSON document per line.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            # Skip blank / whitespace-only lines instead of letting
            # json.loads fail on them.
            if not line.strip():
                continue
            yield json.loads(line)

# Calling the generator function reads nothing yet; lines are parsed lazily
# as the generator is consumed.
dataset_generator = get_dataset_generator(
    path=PATH_
)
print(type(dataset_generator))

In [None]:
def create_dataframe(generator: Generator, limit: Optional[int] = None) -> pd.DataFrame:
    """Collect the leading records of ``generator`` into a DataFrame.

    Args:
        generator: Iterable of mapping-like records providing the keys
            ``title``, ``authors``, ``update_date``, ``abstract`` and
            ``categories``.
        limit: Maximum number of records to read. ``None`` (the default)
            falls back to the module-level ``NUM_PAPERS``; a negative value
            reads every record.

    Returns:
        A DataFrame with the columns ``title``, ``authors``, ``date``,
        ``abstract`` and ``categories`` (``date`` is filled from
        ``update_date``), one row per record read.

    Raises:
        KeyError: If a record lacks one of the expected keys.
    """
    if limit is None:
        limit = NUM_PAPERS
    # islice stops without pulling a record beyond the limit, so the generator
    # can still be resumed afterwards without losing a row.
    stop = limit if limit >= 0 else None

    columns = ["title", "authors", "date", "abstract", "categories"]
    records = [
        {
            "title": row["title"],
            "authors": row["authors"],
            "date": row["update_date"],
            "abstract": row["abstract"],
            "categories": row["categories"],
        }
        for row in itertools.islice(generator, stop)
    ]
    # Passing `columns` keeps the schema stable even when no record was read.
    return pd.DataFrame(records, columns=columns)


dataset_df = create_dataframe(dataset_generator)
# Parse the date column (filled from update_date) into pandas datetimes.
dataset_df["date"] = pd.to_datetime(dataset_df["date"])

In [None]:
# Preview the first rows of the loaded metadata.
dataset_df.head()

In [None]:
# Show every field of the first record.
dataset_df.loc[0]

In [None]:
# Work on a copy so that dataset_df keeps the metadata as loaded.
df = dataset_df.copy()

# Data preparation

## 1.1 Stopword filtering using spaCy and NLTK stopwords


In [None]:
nlp = spacy.load("en_core_web_sm")

# Fetch NLTK's English stop-word list and keep it as a set for fast lookups.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))


def _strip_stop_words(text):
    """Return ``text`` without its whitespace-separated stop words.

    Words are compared in lower case; the remaining words are re-joined with
    single spaces.
    """
    kept = (token for token in text.split() if token.lower() not in stop_words)
    return " ".join(kept)


df['abstract'] = df['abstract'].apply(_strip_stop_words)


## 1.2 POS tagging using huggingface pretrained POS tagger

In [None]:



# Run the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the checkpoint name says it is fine-tuned on CoNLL-03, i.e. it
# looks like a named-entity tagger rather than a POS tagger -- confirm that the
# 'entity' labels are what the "pos" columns below are meant to hold.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): nothing in this notebook reads this attribute -- confirm the
# tokenizer actually honours it.
tokenizer.do_word_tokenize = True
model = AutoModelForTokenClassification.from_pretrained(model_name)
pos_tagger = pipeline("token-classification", model=model, tokenizer=tokenizer, device=device)

# One list of token dicts per abstract, as returned by the pipeline.
df['abstract_tokens'] = df['abstract'].apply(lambda text: pos_tagger(text))
df['word'] = df['abstract_tokens'].apply(lambda tokens: [token['word'] for token in tokens])
df['pos_token'] = df['abstract_tokens'].apply(lambda tokens: [token['entity'] for token in tokens])

# Explode both list columns together so every row pairs a word with its own
# label. Exploding them one after the other pairs every word of an abstract
# with every label of that abstract (a cross product).
df = df.explode(['word', 'pos_token']).reset_index(drop=True)


In [None]:
# Dump the token-level frame to a CSV file in the working directory.
df.to_csv('tokenized_df_dump.csv')

## 1.3. Frequency analysis

In [None]:
# df holds one row per token and every row repeats its abstract's full token
# list, so take each paper's list once. Count the token text: the token dicts
# themselves are unhashable and cannot be counted by FreqDist.
tokens_per_abstract = df.drop_duplicates(subset=['title', 'abstract'])['abstract_tokens']
all_words = [token['word'] for tokens in tokens_per_abstract for token in tokens]
fdist = FreqDist(all_words)

## 1.4. Key terms extraction


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Minimum TF-IDF weight for a term to count as a keyword of its abstract.
TFIDF_KEYWORD_THRESHOLD = 0.5

# Number of token rows that carry each word.
df['word_frequencies'] = df.groupby('word')['word'].transform('count')

# df holds one row per token, so every abstract text is repeated many times.
# Work on the distinct abstracts so that neither the raw counts nor the IDF
# weights are inflated by that repetition.
unique_abstracts = df['abstract'].drop_duplicates().tolist()

whole_dataset_word_counts = Counter(
    word for text in unique_abstracts for word in text.split()
)
df['word_frequencies_in_dataset'] = df['word'].map(whole_dataset_word_counts)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_abstracts).tocsr()
vocabulary = tfidf_vectorizer.get_feature_names_out()

# Read each abstract's non-zero weights straight from the CSR buffers and pair
# them with the vocabulary term of their column. Densifying one vocabulary-
# sized vector per row would exhaust memory, and the dense positions do not
# correspond to the words of the row.
tfidf_by_abstract = {}
for i, abstract in enumerate(unique_abstracts):
    start, end = tfidf_matrix.indptr[i], tfidf_matrix.indptr[i + 1]
    tfidf_by_abstract[abstract] = {
        str(vocabulary[col]): float(score)
        for col, score in zip(tfidf_matrix.indices[start:end], tfidf_matrix.data[start:end])
    }

keywords_by_abstract = {
    abstract: [term for term, score in scores.items() if score > TFIDF_KEYWORD_THRESHOLD]
    for abstract, scores in tfidf_by_abstract.items()
}

# 'tfidf' holds a {term: weight} dict per row; 'keywords' the terms of the
# row's abstract whose weight exceeds the threshold.
df['tfidf'] = df['abstract'].map(tfidf_by_abstract)
df['keywords'] = df['abstract'].map(keywords_by_abstract)

df = df.reset_index(drop=True)

# HASTI

## 2.1. Extract synonyms using WordNet and Wiktionary


In [None]:
import nltk
from nltk.corpus import wordnet
import spacy
import numpy as np
from collections import Counter
from collections import defaultdict

# NOTE(review): en_core_web_sm is spaCy's small English pipeline -- confirm it
# provides what the similarity scores computed with `nlp` below rely on.
nlp = spacy.load("en_core_web_sm")
# Download the WordNet data queried by get_synonyms.
nltk.download('wordnet')

def get_synonyms(word):
    """Collect the lemma names of every WordNet synset found for ``word``.

    Names are returned in lookup order and are not de-duplicated, so an entry
    may appear more than once. The list is empty when WordNet returns no
    synset for ``word``.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]



## 2.2. Calculating Relations with context similarity between concepts


In [None]:
def calculate_similarity(concept1, concept2):
    """Return spaCy's similarity score between two pieces of text.

    Both arguments are run through the module-level ``nlp`` pipeline and the
    resulting documents are compared with ``Doc.similarity``.
    """
    first_doc, second_doc = nlp(concept1), nlp(concept2)
    return first_doc.similarity(second_doc)

## 2.3. Extract several preliminary versions of the domain corpus using different similarity thresholds


In [None]:
def extract_corpus_with_threshold(data, threshold):
    """Build a keyword -> abstracts corpus from keywords that pass ``threshold``.

    A keyword qualifies when at least one of its synonyms (``get_synonyms``)
    scores ``>= threshold`` against the keyword in ``calculate_similarity``.
    Every abstract that lists a qualifying keyword is filed under that keyword
    exactly once, however many synonyms pass and however many rows of ``data``
    repeat the same abstract.

    Args:
        data: DataFrame with an ``abstract`` column and a ``keywords`` column
            holding an iterable of keywords per row.
        threshold: Minimum similarity for a synonym to qualify its keyword.

    Returns:
        A ``defaultdict(list)`` mapping each qualifying keyword to the
        abstracts that list it, in first-seen order.
    """
    corpus = defaultdict(list)
    # Whether a keyword qualifies depends on the keyword alone, so decide it
    # once per keyword instead of once per row.
    qualifies = {}
    # (keyword, abstract) pairs already handled, so repeated rows do not
    # inflate the per-keyword document counts.
    filed = set()

    for abstract, keywords in zip(data['abstract'], data['keywords']):
        for keyword in keywords:
            pair = (keyword, abstract)
            if pair in filed:
                continue
            filed.add(pair)

            if keyword not in qualifies:
                qualifies[keyword] = any(
                    calculate_similarity(keyword, syn) >= threshold
                    for syn in get_synonyms(keyword)
                )

            if qualifies[keyword]:
                corpus[keyword].append(abstract)

    return corpus


In [None]:
# Build one candidate corpus per similarity threshold 0.0, 0.1, ..., 0.9.
corpuses = []
for i in range(10):
  threshold = i/10
  corpuses.append(extract_corpus_with_threshold(df, threshold))

## 3.1-2 Calculate the entropy of, and the information gain between, the preliminary versions of the domain corpus, choose the most appropriate corpus, and calculate the information gain while feeding in different documents to ensure the viability of the current corpus

In [None]:
def calculate_entropy(corpus):
    """Shannon entropy (in bits) of how documents are spread over concepts.

    Args:
        corpus: Mapping from concept to a sized collection of documents.

    Returns:
        ``-sum(p * log2(p))`` where ``p`` is each concept's share of all
        documents. Concepts without documents contribute nothing, and the
        result is ``0.0`` when the corpus holds no documents at all.
    """
    doc_counts = [len(docs) for docs in corpus.values()]
    total_documents = sum(doc_counts)
    if total_documents == 0:
        # Nothing to distribute; also avoids dividing by zero below.
        return 0.0

    entropy = 0.0
    for count in doc_counts:
        if count == 0:
            # p * log2(p) tends to 0 as p -> 0, but evaluating it at p == 0
            # yields nan, so skip empty concepts explicitly.
            continue
        probability = count / total_documents
        entropy -= probability * np.log2(probability)

    return entropy

def calculate_information_gain(corpus_before, corpus_after):
    """Entropy of ``corpus_before`` minus entropy of ``corpus_after``.

    The result is positive when ``corpus_after`` has the lower entropy.
    """
    return calculate_entropy(corpus_before) - calculate_entropy(corpus_after)


def calculate_all_information_gains(corpuses):
    """Information gain for every ordered pair of distinct corpora.

    Returns a list of ``(corpus_before, corpus_after, information_gain)``
    tuples in the order produced by ``itertools.permutations(corpuses, 2)``.
    """
    return [
        (first, second, calculate_information_gain(first, second))
        for first, second in itertools.permutations(corpuses, 2)
    ]

# Keep the (corpus_before, corpus_after, gain) triples as a plain list. Packed
# into a NumPy array they become objects, argmax then has to compare the
# corpus dicts themselves, and the index array it returns cannot index the
# `corpuses` list.
all_information_gains = calculate_all_information_gains(corpuses)

# Pick the ordered pair with the largest entropy reduction.
# NOTE(review): the lower-entropy member of that pair (corpus_after) is taken
# as the selected corpus -- confirm this is the intended selection rule.
best_before, corpus_mvp, best_gain = max(
    all_information_gains, key=lambda triple: triple[2]
)


## 4.1. Evaluate different concept groups using hierarchical clustering


In [None]:
# Graph whose nodes get tagged with cluster labels further down.
G = nx.Graph()

def hierarchical_clustering(corpus, num_clusters):
    """Assign each sample of ``corpus`` to one of ``num_clusters`` clusters.

    Uses agglomerative clustering with Ward linkage. ``corpus`` is handed to
    ``fit_predict`` unchanged -- presumably it has to be a numeric feature
    matrix; verify against the caller.

    Returns the cluster label of every sample.
    """
    model = AgglomerativeClustering(linkage='ward', n_clusters=num_clusters)
    return model.fit_predict(corpus)

num_clusters = 5
# NOTE(review): `corpus` is not assigned anywhere in the visible notebook (the
# corpus selected earlier is named corpus_mvp) -- confirm what should be
# clustered here.
cluster_labels = hierarchical_clustering(corpus, num_clusters)

# Add one node per clustered sample, tagged with the label it received.
for i, label in enumerate(cluster_labels):
    G.add_node(i, cluster=label)




## 4.2. Calculate the words most related to this concept-hierarchy level


In [None]:
def calculate_related_words(cluster_label, corpus, threshold):
    """List the words whose share of nodes in ``cluster_label`` reaches ``threshold``.

    For every word of ``corpus``, the ``cluster`` attribute of the nodes
    ``0 .. len(docs) - 1`` of the module-level graph ``G`` is collected, and
    the word is kept when the fraction of those labels equal to
    ``cluster_label`` is at least ``threshold``. Words with no matching label
    are never kept.

    NOTE(review): nodes are addressed by position within the word's document
    list -- confirm this is the intended node-to-document mapping.
    """
    selected = []

    for word, docs in corpus.items():
        labels = [G.nodes[idx]['cluster'] for idx in range(len(docs))]
        hits = labels.count(cluster_label)

        if hits == 0:
            continue

        if hits / len(labels) >= threshold:
            selected.append(word)

    return selected

# Cluster to inspect, and the minimum share of a word's nodes that must fall
# into it for the word to be reported.
target_cluster_label = 0  #
threshold = 0.5
# NOTE(review): `corpus` is not assigned anywhere in the visible notebook --
# confirm which corpus is meant here.
related_words = calculate_related_words(target_cluster_label, corpus, threshold)
print(f"Related words for cluster {target_cluster_label}: {related_words}")

# PROMINE

## 2.1; 2.3. Extract features from document, sentence, and word level


In [None]:
nlp = spacy.load("en_core_web_sm")

# df holds one row per token, so every abstract text is repeated many times;
# parse each distinct abstract once.
abstracts = df['abstract'].drop_duplicates().reset_index(drop=True)

# One entry per abstract in each list, in the order of `abstracts`.
document_features = []
sentence_features = []
word_features = []

for abstract in abstracts:
    doc = nlp(abstract)
    sentences = list(doc.sents)

    # Document level: sentence and token counts.
    document_features.append({
        "num_sentences": len(sentences),
        "num_words": len(doc),
    })

    # Sentence level: one dict per sentence, in document order.
    sentence_features.append([
        {
            "num_tokens": len(sent),
            "sentence_text": sent.text,
        }
        for sent in sentences
    ])

    # Word level: one dict per token. The dicts are built without rebinding
    # `word_features`, which must stay the accumulator list appended to here.
    word_features.append([
        {
            "word_text": token.text,
            "word_pos": token.pos_,
            "word_dep": token.dep_,
        }
        for token in doc
    ])



## 2.2. Provide a word-level dataset that carries the upper-level structure as attributes


In [None]:
word_level_dataset = []
for i, abstract in enumerate(abstracts):
    # sentence_features[i] is indexed by sentence, not by token. Repeat each
    # sentence's text once per token it contains, so that position j holds the
    # sentence of token j. This relies on the sentences being stored in
    # document order and jointly covering every token of the abstract.
    sentence_of_token = [
        sent["sentence_text"]
        for sent in sentence_features[i]
        for _ in range(sent["num_tokens"])
    ]

    for token_features, sentence_text in zip(word_features[i], sentence_of_token):
        word_level_dataset.append({
            "abstract_text": abstract,
            "sentence_text": sentence_text,
            "word_text": token_features["word_text"],
            "word_pos": token_features["word_pos"],
            "word_dep": token_features["word_dep"],
        })

word_level_df = pd.DataFrame(word_level_dataset)


## 3.1 Based on input-ontology apply current rules on the sentence structures and morphological analysis

In [None]:
def extract_conceptual_relational_knowledge(sentence, ontology):
    """Find the ontology concepts whose synonyms occur in ``sentence``.

    Args:
        sentence: Sentence text; it is split on whitespace into words.
        ontology: Mapping from concept to a collection of synonyms.

    Returns:
        A list of ``(concept, word)`` tuples, one for every word of the
        sentence that is listed among a concept's synonyms, in sentence order.
    """
    knowledge = []
    for word in sentence.split():
        for concept, synonyms in ontology.items():
            if word in synonyms:
                knowledge.append((concept, word))
    return knowledge

# Collect (sentence text, matched knowledge) pairs for every sentence of
# every abstract.
# NOTE(review): `ontology` is not defined anywhere in the visible notebook --
# it has to be supplied before this cell can run.
extracted_knowledge = []
for abstract in df['abstract']:
    doc = nlp(abstract)

    for sentence in doc.sents:
        sentence_text = sentence.text
        knowledge = extract_conceptual_relational_knowledge(sentence_text, ontology)
        extracted_knowledge.append((sentence_text, knowledge))



## 3.2. Create pre-ontologic-concepts (Ontels) based on this analysis


In [None]:
# Group the matched words by ontology concept: concept -> set of words.
ontels = {}
for sentence, knowledge in extracted_knowledge:
    for concept, word in knowledge:
        ontels.setdefault(concept, set()).add(word)

# Print only the first ontel as a sample. `break` is a statement, not a call.
# The words are sorted so the printed line is the same on every run.
for concept, words in ontels.items():
    print(f"Ontel: {concept}")
    print(f"Words: {', '.join(sorted(words))}")
    break

## 4.1. Apply lexicon manager

In [None]:
def apply_lexicon_manager(ontology, new_terms):
    """Merge ``new_terms`` into ``ontology`` in place.

    Concepts missing from ``ontology`` are created. A synonym already listed
    for its concept is not added again, so applying the same terms twice
    (e.g. when the cell is re-run) leaves the ontology unchanged.

    Args:
        ontology: Dict mapping each concept to a list of synonyms; modified
            in place.
        new_terms: Mapping from concept to an iterable of synonyms to add.

    Returns:
        None.
    """
    for concept, synonyms in new_terms.items():
        known = ontology.setdefault(concept, [])
        for synonym in synonyms:
            if synonym not in known:
                known.append(synonym)

# NOTE(review): neither `ontology` nor `new_terms_to_add` is defined in the
# visible notebook -- both have to be supplied before this cell can run.
apply_lexicon_manager(ontology, new_terms_to_add)

## 4.2. Apply ontology manager (not provided here yet)

In [None]:
print("Updated Ontology:")
# Print only the first concept as a sample. `break` is a statement, not a call.
for concept, synonyms in ontology.items():
    print(f"Concept: {concept}, Synonyms: {', '.join(synonyms)}")
    break