In [6]:
from arango import ArangoClient
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import json
import os
import random
import numpy as np
nltk.download('punkt')
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Stark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
client = ArangoClient(hosts='http://localhost:8529')
db = client.db('Database_Material_Science', username='root', password='praddy')
entities = db.collection('entities')
publications = db.collection('publications')
edges = db.collection('publications_entities_edges')

In [8]:
from matplotlib import pyplot as plt

### Fetching the Distinct enitity classes in the dataset:

In [9]:
QUERY_get_entity_classes = f"""
                            FOR entity IN entities
                                COLLECT entity_categories = entity.tag
                                return entity_categories
                            """

cursor = db.aql.execute(QUERY_get_entity_classes)
entity_classes = list(cursor)
entity_classes

['APL', 'CHM', 'CMT', 'PRO', 'SPL']

## Fetching all the `Entities` with the class `CHM` meaning Chemical/Material [Fetching all the chemicals in the database]

In [10]:
QUERY_get_chm = f"""
                            FOR entity IN entities
                                filter entity.tag == "CHM"
                                return entity.name
                            """

cursor = db.aql.execute(QUERY_get_chm)
chm_entities = list(cursor)
random.sample(chm_entities, 10)

['H3PO4',
 'Oxygen',
 'AsV',
 'Polymethyl Methacrylate',
 'Glycosyl',
 'Bioethanol',
 'Acetonide',
 'S2O32',
 'Silica',
 'Sodium']

## Fetching all the the `Entities` with the class `PRO` meaning Property [Fetching all the properties in the database]

In [11]:
QUERY_get_pro = f"""
                    FOR entity IN entities
                        filter entity.tag == "PRO"
                        return entity.name
                            """
cursor = db.aql.execute(QUERY_get_pro)
pro_entities = list(cursor)
random.sample(pro_entities, 10)

['Interaction Behavior',
 'Spectral Intensity',
 'Photodynamic Effect',
 'Coefficient',
 'LUMO Distribution',
 'Interface Conductivity',
 'Total Flux',
 'Critical Temperature T C',
 'Twinning Planes',
 'Adiabatic Temperature Change']

In [13]:
unique, counts = np.unique(pro_entities, return_counts=True)
top_indices = np.argsort(counts)[::-1][:2]
top_two_entries = unique[top_indices]
top_two_counts = counts[top_indices]

print("Top 2 occurring entries:")
for entry, count in zip(top_two_entries, top_two_counts):
    print(f"{entry}: {count}")

Top 2 occurring entries:
Г: 1
Failure Strength: 1


## Loading the 1990 dataset from Semantic-KG:

In [14]:
def read_semanticKG(PATH_dataset_dir:str):
    dataset_lines = []
    for file in ['train_norm.txt', 'val_norm.txt']:
        with open(os.path.join(PATH_dataset_dir, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
            dataset_lines.extend(lines)
    return dataset_lines

In [15]:
PATH_1990 = r"D:\College\Research\Prof_jamshid\Hypothesis_Generation_Active_Learning\datasets\1990"
semantic_KG_1990 = read_semanticKG(PATH_1990)
random.sample(semantic_KG_1990, 10)
print(len(semantic_KG_1990))

10789


In [16]:
random.sample(semantic_KG_1990, 10)

['Verapamil was found to reverse multidrug resistance but had no effect on intracellular pH while amiloride, which acidifies the cytoxol by blocking Na+/H+ antiport activity, did not cause reversal of drug resistance.\n',
 '"l\'Support in both large and small cells.\n',
 'Therefore, from the comparison of the experimental Tafel and decay slopes with the theoretical ones and from the dependence of the cathodic current density, at constant A#, on the (C/C) ratio, it is concluded that among the various mechanisms discussed above, the scheme V with step Vc as rate determining and m = 2 explains most of the kinetic parameters of the electrochemical reduction of the ammonium ion dissolved in DMSO on platinum.The participation of adsorption process explains the low Tafel slopes, of the order of -40 mV, as well as the slopes larger than -120 mV at 25 "C.The latter appear always related to a high degree of surface coverage.When the system is electrolyzed for a long time before kinetic measureme

## Checking how many occurances are there in the 1990 dataset that has `CHM` and `PRO` from `Mat-KG`

In [17]:
count_chm_occurences = 0
count_pro_occurences = 0
chm_pro_together = 0
for sent in semantic_KG_1990:
    both_present = [False, False]
    if any(chm.lower() in sent.lower() for chm in chm_entities):
        count_chm_occurences += 1
        both_present[0] = True
    if any(pro.lower() in sent.lower() for pro in pro_entities):
        count_pro_occurences += 1
        both_present[1] = True
    if both_present[0] and both_present[1]:
        chm_pro_together


In [18]:
print('[INFO] Number of Chm entities: ', count_chm_occurences)
print('[INFO] Number of Pro entities: ', count_pro_occurences)
print('[INFO] Number of entities with both: ', chm_pro_together)

[INFO] Number of Chm entities:  10715
[INFO] Number of Pro entities:  10779
[INFO] Number of entities with both:  0


# The `Hypothesis` Scenario

* After thinking a lot, I'm considering using something related to semi-conductors
* I foound the following terms that are in play when semiconductos are manifactured for any kind of LED:
    - Band Gap Energy
    - Quantum Well
    - Semi-Conductor
    - Wavelength
    - Energy Level
    
## I'll start by listing the candidates of these `Properties` in this dataset:


In [24]:
[pro for pro in pro_entities if pro.lower() in "band gap energy"]

['P',
 'ER',
 'Energy',
 'RG',
 'GAP Energy',
 'Band Gap Energy',
 'Band Gap',
 'AP',
 'Gap',
 'Rg',
 'GaP']

In [25]:
[pro for pro in pro_entities if pro.lower() in "quantum well"]

['T', 'M W', 'Q', 'L', 'U', 'EL', 'NT', 'WEL', 'El']

In [26]:
[pro for pro in pro_entities if pro.lower() in "semi conductor"]

['Tor', 'T', 'Ct', 'MI', 'U', 'Uc', 'OR', 'SE', 'Se', 'SEM', 'Co', 'CT']

In [27]:
[pro for pro in pro_entities if pro.lower() in "wavelength"]

['T', 'Length', 'L', 'EL', 'WA', 'El']

In [28]:
[pro for pro in pro_entities if pro.lower() in "energy level"]

['L', 'ER', 'Energy', 'RG', 'Energy Level', 'EL', 'Rg', 'EV', 'El']

## Searching the same in the dataset

In [38]:
sentences = []
word = "semiconductor"
for sent in semantic_KG_1990:
    if word in sent.lower():
        sentences.append(sent)

sentences

['Electrochem.11, 551 (1981).of Metals and Semiconductors(Edited by M. Froment), pp.\n',
 'Electrochem.11, 551 (1981).of Metals and Semiconductors(Edited by M. Froment), pp.\n',
 'We present Hall Effect and resistivity data which demonstrate that EuB 6 is a degenerate semiconductor transforming into a metal or semimetal below the ferromagnetic ordering temperature, Tc--13.7K.We also report an anomalously large, positive pressure dependence of T c, (1/Tc)(ATc/AP) = 4 x 10 -2 kbar -1 EuB 6 is a cubic material having B 6 octahedra at the corners of a simple cubic lattice and Eu at the body center.It orders ferromagnetlcally with a transition tempera-"\n',
 'We present Hall Effect and resistivity data which demonstrate that EuB 6 is a degenerate semiconductor transforming into a metal or semimetal below the ferromagnetic ordering temperature, Tc--13.7K.We also report an anomalously large, positive pressure dependence of T c, (1/Tc)(ATc/AP) = 4 x 10 -2 kbar -1 EuB 6 is a cubic material havi