# Imports

In [27]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter
import string

import requests as r

import sentence_transformers # sbert

# OpenAlex Data Setup

In [7]:
# OpenAlex work IDs for the papers compared in this notebook.
# The trailing comments give the titles the API returned for each ID.
selected_papers = [
    'W2769470793',  # Fake news detection using naive Bayes classifier
    'W2968213717',  # Size and mass prediction of almond kernels ...
    'W2789629017',  # Residential roof condition assessment system ...
    'W639708223',   # Faster R-CNN: Towards Real-Time Object Detection ...
    'W3130804511',  # Detecting URL Phishing Attacks Using ML & NLP ...
]

# OpenAlex concept IDs of interest (display names as returned by the API).
selected_concepts = [
    'C119857082',   # Machine learning
    'C108583219',   # Deep learning
    'C154945302',   # Artificial intelligence
    'C2522767166',  # Data science
    'C41008148',    # Computer science
]

In [16]:
# Sanity check: look up each selected work on OpenAlex and print its title.
for paper in selected_papers:
    work_record = r.get(f'https://api.openalex.org/{paper}').json()
    print(work_record['title'])

Fake news detection using naive Bayes classifier
Size and mass prediction of almond kernels using machine learning image processing
Residential roof condition assessment system using deep learning
Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks
Detecting URL Phishing Attacks Using Machine Learning & NLP Techniques


In [26]:
# Print the display name, hierarchy level and works count of each concept.
# The record is fetched ONCE per concept and reused for all three fields;
# previously the same URL was requested three times per concept, tripling
# the network traffic and allowing the fields to come from different
# snapshots of the record.
for concept in selected_concepts:
    concept_record = r.get(f'https://api.openalex.org/{concept}').json()
    print(concept_record['display_name'],
          concept_record['level'],
          concept_record['works_count'])

Machine learning 1 761448
Deep learning 2 146299
Artificial intelligence 1 4151946
Data science 1 1844470
Computer science 0 41229216


# Processing SBERT

In [28]:
# Load the pretrained SBERT checkpoint used below to embed the paper titles.
# On first use the weights are downloaded (see the progress output below).
model_sbert = sentence_transformers.SentenceTransformer(
    'all-MiniLM-L6-v2'
)

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.13MB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 191kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 5.23MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 306kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 57.2kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 1.02MB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:02<00:00, 31.2MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.4kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 3.35MB/s]
Downloading: 100%|██████████| 350/350 [00:00<00:00, 350kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 6.55MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 2.23MB/s]
Downloading: 100%|██████████| 349/349 [00:00<00:00, 174kB/s]


In [29]:
# Collect the OpenAlex title of every selected paper, preserving the order
# of `selected_papers` so list positions can be used as paper indices later.
title_lists = [
    r.get(f'https://api.openalex.org/{paper}').json()['title']
    for paper in selected_papers
]
title_lists

['Fake news detection using naive Bayes classifier',
 'Size and mass prediction of almond kernels using machine learning image processing',
 'Residential roof condition assessment system using deep learning',
 'Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks',
 'Detecting URL Phishing Attacks Using Machine Learning & NLP Techniques']

In [31]:
# Encode every title with the SBERT model loaded above.
embeddings = model_sbert.encode(title_lists)
# Display the shape of the result as the cell output.
embeddings.shape

(5, 384)

In [35]:
# Show the smallest and largest component of each title embedding.
for embed in embeddings:
    smallest, largest = embed.min(), embed.max()
    print(smallest, largest)

-0.13894033 0.15323018
-0.14417018 0.15038511
-0.16606373 0.14227939
-0.19193356 0.13964874
-0.18712273 0.19834355


In [37]:
# Cosine similarity of every title embedding against every other one.
cosine_scores = sentence_transformers.util.cos_sim(embeddings, embeddings)

# Walk only the upper triangle (i < j) so each unordered pair is recorded
# once and the self-similarity diagonal is skipped.
n_titles = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_titles - 1)
    for j in range(i + 1, n_titles)
]

# Order the pairs from most to least similar.
pairs.sort(key=lambda x: x['score'], reverse=True)

pairs # sorted by highest similarity (index = ith value in title_lists)

[{'index': [0, 4], 'score': tensor(0.3783)},
 {'index': [2, 3], 'score': tensor(0.2298)},
 {'index': [2, 4], 'score': tensor(0.1727)},
 {'index': [1, 2], 'score': tensor(0.1689)},
 {'index': [1, 4], 'score': tensor(0.1545)},
 {'index': [1, 3], 'score': tensor(0.1331)},
 {'index': [0, 3], 'score': tensor(0.1251)},
 {'index': [0, 1], 'score': tensor(0.0917)},
 {'index': [0, 2], 'score': tensor(0.0835)},
 {'index': [3, 4], 'score': tensor(0.0196)}]

In [47]:
# Report each pair: its similarity score, then the two titles being compared.
for pair in pairs:
    first_idx, second_idx = pair['index']
    print('Cosine Similarity: ', pair['score'])
    print(title_lists[first_idx])
    print(title_lists[second_idx], '\n')

Cosine Similarity:  tensor(0.3783)
Fake news detection using naive Bayes classifier
Detecting URL Phishing Attacks Using Machine Learning & NLP Techniques 

Cosine Similarity:  tensor(0.2298)
Residential roof condition assessment system using deep learning
Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks 

Cosine Similarity:  tensor(0.1727)
Residential roof condition assessment system using deep learning
Detecting URL Phishing Attacks Using Machine Learning & NLP Techniques 

Cosine Similarity:  tensor(0.1689)
Size and mass prediction of almond kernels using machine learning image processing
Residential roof condition assessment system using deep learning 

Cosine Similarity:  tensor(0.1545)
Size and mass prediction of almond kernels using machine learning image processing
Detecting URL Phishing Attacks Using Machine Learning & NLP Techniques 

Cosine Similarity:  tensor(0.1331)
Size and mass prediction of almond kernels using machine learning image proces

# Count Vectorize