In [1]:
import json
import random
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [2]:
def crawling(word):
    """Return the words relatedwords.org lists as related to ``word``.

    The page embeds its results as JSON inside
    ``<script id="preloadedDataEl">``; that payload is parsed and the
    ``"word"`` field of every entry under ``"terms"`` is returned, in page order.

    Args:
        word: The term to look up. It is percent-encoded before being placed
            in the URL path, so spaces, ``/``, ``?`` and ``#`` are safe.

    Returns:
        list[str]: The related words (possibly empty).

    Raises:
        ValueError: If the page does not contain the embedded data element, or
            its content is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
        KeyError: If the JSON payload has no ``"terms"`` entry.
        requests.RequestException: On network failure or timeout.
    """
    url = "https://relatedwords.org/relatedto/" + quote(word, safe="")
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    data_tag = soup.find("script", id="preloadedDataEl")
    if data_tag is None or data_tag.string is None:
        raise ValueError(
            f"No preloaded data found for {word!r} at {url} "
            f"(HTTP {response.status_code})"
        )

    # Read the tag's text directly instead of stripping the serialized
    # <script ...> wrapper, which depends on attribute order and quoting.
    payload = json.loads(data_tag.string)
    return [term["word"] for term in payload["terms"]]
    

# Smoke-test the scraper on one word; the notebook displays the returned list.
crawling("computer")

['calculator',
 'machine',
 'analog computer',
 'computing',
 'personal computer',
 'programmer',
 'central processing unit',
 'computer science',
 'peripheral',
 'hardware',
 'mainframe',
 'microprocessor',
 'laptop',
 'vacuum tube',
 'machine code',
 'computer program',
 'server',
 'computer programming',
 'memory',
 'integrated circuit',
 'turing machine',
 'charles babbage',
 'abacus',
 'electronics',
 'astrolabe',
 'cpu',
 'information',
 'home computer',
 'computer hardware',
 'internet',
 'pc',
 'slide rule',
 'computation',
 'supercomputer',
 'transistor',
 'turing-complete',
 'floppy disk',
 'imac',
 'compiler',
 'arithmetic',
 'processor',
 'john von neumann',
 'keyboard',
 'data',
 'monitor',
 'predictor',
 'digital computer',
 'digital communication',
 'bit',
 'data converter',
 'diskette',
 'control flow',
 'james thomson',
 'mechanical computer',
 'number cruncher',
 'visual display unit',
 'computer circuit',
 'chip',
 'information age',
 'expansion slot',
 'input device

In [127]:
# Maps each topic name to the list of related words crawled for it.
topic_modeling = dict()

In [142]:
# Seed topics; each one is expanded into a vocabulary by a short
# breadth-first crawl over the related-word pages.
topic = ["Computer Science", "Social", "Science", "Math", "Sports", "Art", "Music", "Economy", "Physics"]

MIN_RELATED_WORDS = 20    # results with this many words or fewer are discarded
SAMPLE_SIZE = 20          # follow-up words queued from each expanded result
EXPANDING_POSITIONS = 4   # only the first N dequeued words may queue follow-ups
CRAWL_SEED = 0            # fixed seed so the sampled follow-up words are repeatable

# A dedicated generator keeps the sampling reproducible without touching
# the global `random` state.
rng = random.Random(CRAWL_SEED)

for topic_name in topic:
    search_word = [topic_name]
    # Reset the topic's list so re-running the cell does not duplicate words.
    topic_modeling[topic_name] = []

    cnt = 0
    while search_word:
        current_word = search_word.pop(0)
        related_word = crawling(current_word)

        if len(related_word) > MIN_RELATED_WORDS:
            # Duplicates are kept on purpose: a word's repeat count per topic
            # is what the scoring cell later uses as its weight.
            topic_modeling[topic_name].extend(related_word)

            if cnt < EXPANDING_POSITIONS:
                sample_size = min(SAMPLE_SIZE, len(related_word))
                search_word.extend(rng.sample(related_word, sample_size))

        cnt += 1

In [143]:
# Inspect the words collected for the "Computer Science" topic.
topic_modeling["Computer Science"]

['algorithm',
 'mathematics',
 'computer engineering',
 'science',
 'ibm',
 'artificial intelligence',
 'peter j. denning',
 'code',
 'computer',
 'automata theory',
 'theory of computation',
 'computer architecture',
 'computational problem',
 'computer vision',
 'software engineering',
 'computational learning theory',
 'models of computation',
 'computational geometry',
 'logic',
 'ada lovelace',
 'mechanical calculator',
 'information',
 'integrated circuit',
 'abacus',
 'abstraction',
 'algorithmics',
 'machine',
 'numerical analysis',
 'computing',
 'eniac',
 'computation',
 'arithmometer',
 'information theory',
 'computer programming',
 'information processing',
 'control flow',
 'inter-process communication',
 'transistor',
 'practical disciplines',
 'mosfet',
 'computer graphics',
 'programming language theory',
 'complex systems',
 'human–computer interaction',
 'ubiquitous',
 'computer accessibility',
 'heuristic',
 'automated planning and scheduling',
 'digital computer',


In [144]:
import pandas as pd

In [145]:
# Parallel accumulators: word_list[k] belongs to the topic in topic_list[k].
word_list, topic_list = [], []

In [146]:
# Flatten the per-topic word lists into two parallel columns (word, topic).
# The accumulators are rebuilt from scratch here so that re-running this cell
# does not append a second copy of every row.
word_list = []
topic_list = []

topics = topic_modeling.keys()
for t, related in topic_modeling.items():
    word_list.extend(related)
    # One topic label per word keeps both lists the same length.
    topic_list.extend([t] * len(related))

In [147]:
# One row per (word, topic) pair; both lists have the same length by construction.
topic_modeling_df = pd.DataFrame({"word": word_list, "topic": topic_list})

In [148]:
# Display the assembled word/topic table.
topic_modeling_df

Unnamed: 0,word,topic
0,algorithm,Computer Science
1,mathematics,Computer Science
2,computer engineering,Computer Science
3,science,Computer Science
4,ibm,Computer Science
...,...,...
270497,third council of constantinople,Physics
270498,st. theodore the studite,Physics
270499,notitiae episcopatuum,Physics
270500,lampsacus treasure,Physics


In [149]:
# Persist the corpus. The row index is written too (as an unnamed first
# column); the cells that reload this file expect that extra column
# ("Unnamed: 0"), so do not add index=False here without updating them.
topic_modeling_df.to_csv("./topic_modeling.csv")

In [1]:
import pandas as pd

In [2]:
# Reload the saved corpus and discard the unnamed column that holds the
# row index written by to_csv, leaving only "word" and "topic".
topic_modeling_df = pd.read_csv("./topic_modeling.csv").drop(columns="Unnamed: 0")
topic_modeling_df.head()

Unnamed: 0,word,topic
0,algorithm,Computer Science
1,mathematics,Computer Science
2,computer engineering,Computer Science
3,science,Computer Science
4,ibm,Computer Science


In [3]:
# Every row for the word "algorithm", as an array of [word, topic] pairs.
is_algorithm = topic_modeling_df["word"] == "algorithm"
algorithm = topic_modeling_df.loc[is_algorithm].to_numpy()
print(algorithm)
# How many of those rows are labelled "Computer Science".
print(sum(1 for row in algorithm if row[1] == "Computer Science"))

[['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Computer Science']
 ['algorithm' 'Science']
 ['algorithm' 'Science']
 ['algorithm' 'Science']
 ['algorithm' 'Math']
 ['algorithm' 'Math']
 ['algorithm' 'Math']
 ['algorithm' 'Math']
 ['algorithm' 'Math']
 ['algorithm' 'Sports']
 ['algorithm' 'Art']]
8


In [4]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [6]:
# Score a sentence against every topic: each noun in the sentence adds, per
# topic, the number of corpus rows pairing that noun with the topic.
topic_count = {
    name: 0
    for name in ["Computer Science", "Social", "Science", "Math", "Sports", "Art", "Music", "Economy", "Physics"]
}
topics = topic_count.keys()
sentence = input("Input: ")

# Keep singular common/proper nouns only.
# NOTE(review): plural tags (NNS/NNPS) are not included - confirm that is intended.
# Tokens are lower-cased after tagging because the scraped vocabulary is
# lower-case (e.g. 'ibm', 'ada lovelace'); otherwise proper nouns never match.
words = [token.lower() for token, tag in pos_tag(word_tokenize(sentence)) if tag in ("NN", "NNP")]

for word in words:
    matched_topics = topic_modeling_df.loc[topic_modeling_df["word"] == word, "topic"]
    for name, hits in matched_topics.value_counts().items():
        # Ignore labels that are not among the scored topics.
        if name in topic_count:
            topic_count[name] += int(hits)  # plain int keeps the printed dict clean

print(topic_count)
topic = sorted(topic_count.items(), key=lambda x: x[1], reverse=True)

# Compute the denominator once; when nothing matched, report 0% for every
# topic instead of raising ZeroDivisionError.
total_hits = sum(count for _, count in topic)
for rank, (name, count) in enumerate(topic, start=1):
    share = count / total_hits * 100 if total_hits else 0.0
    print(f"{rank}. {name}: {share}")

Input: Neuralink's technology, to be clear, looks awesome. It's a brain implant the size of four dollar coins with more than 1,000 electrodes that will (someday) allow a person to wirelessly send neuroelectrical activity to anything digital, from prosthetic arms to Tesla autopilots to memory-recording cloud servers.
{'Computer Science': 9, 'Social': 18, 'Science': 37, 'Math': 17, 'Sports': 4, 'Art': 11, 'Music': 11, 'Economy': 27, 'Physics': 2}
1. Science: 27.205882352941174
2. Economy: 19.852941176470587
3. Social: 13.23529411764706
4. Math: 12.5
5. Art: 8.088235294117647
6. Music: 8.088235294117647
7. Computer Science: 6.61764705882353
8. Sports: 2.941176470588235
9. Physics: 1.4705882352941175


In [20]:
# Build a "Person" vocabulary with a short breadth-first crawl over the
# related-word pages, starting from the single seed word "Person".
MIN_RELATED_WORDS = 20    # results with this many words or fewer are discarded
SAMPLE_SIZE = 20          # follow-up words queued from each expanded result
EXPANDING_POSITIONS = 4   # only the first N dequeued words may queue follow-ups
CRAWL_SEED = 0            # fixed seed so the sampled follow-up words are repeatable

# A dedicated generator keeps the sampling reproducible without touching
# the global `random` state.
rng = random.Random(CRAWL_SEED)

search_word = ["Person"]
person_word = []

cnt = 0
while search_word:
    current_word = search_word.pop(0)
    related_word = crawling(current_word)

    if len(related_word) > MIN_RELATED_WORDS:
        # Duplicates are kept; repeated words simply appear more than once.
        person_word.extend(related_word)

        if cnt < EXPANDING_POSITIONS:
            sample_size = min(SAMPLE_SIZE, len(related_word))
            search_word.extend(rng.sample(related_word, sample_size))

    cnt += 1

person_word

['people',
 'someone',
 'child',
 'adult',
 'worker',
 'individual',
 'reason',
 'personality',
 'subject',
 'philosophy',
 'anyone',
 'man',
 'woman',
 'one',
 'patient',
 'victim',
 'language',
 'female',
 'mammal',
 'primate',
 'somebody',
 'soul',
 'mortal',
 'chimpanzee',
 'orangutan',
 'brain',
 'native',
 'self',
 'applicant',
 'inhabitant',
 'animal',
 'appointee',
 'else',
 'everyone',
 'thing',
 'nobody',
 'she',
 'morality',
 'face',
 'black',
 'white',
 'gorilla',
 'property',
 'being',
 'leader',
 'friend',
 'legs',
 'family',
 'idea',
 'emotion',
 'community',
 'human',
 'pelvis',
 'another',
 'customs',
 'life',
 'thought',
 'not',
 'any',
 'how',
 'couple',
 'having',
 'good',
 'art',
 'literature',
 'guy',
 'music',
 'organism',
 'employee',
 'creature',
 'user',
 'entity',
 'player',
 'type',
 'those',
 'law',
 'character',
 'customer',
 'genetics',
 'hominid',
 'chad',
 'car',
 'europe',
 'hunter-gatherer',
 'war',
 'death',
 'extinct',
 'species',
 'latin',
 'order'

In [19]:
import pandas as pd
import numpy as np

In [13]:
# Load the saved corpus as a NumPy object array (despite the `_df` name).
# The CSV's unnamed index column is kept here, so each row is
# [index, word, topic].
topic_modeling_df = pd.read_csv("./topic_modeling.csv").to_numpy()
topic_modeling_df

array([[0, 'algorithm', 'Computer Science'],
       [1, 'mathematics', 'Computer Science'],
       [2, 'computer engineering', 'Computer Science'],
       ...,
       [270499, 'notitiae episcopatuum', 'Physics'],
       [270500, 'lampsacus treasure', 'Physics'],
       [270501, 'titular see', 'Physics']], dtype=object)

In [21]:
# Turn every crawled word into an [index, word, "Person"] row so it lines up
# with the three columns of the loaded corpus array. Indices continue after
# the corpus' last index (270501).
person_word = [
    np.array([270501 + position + 1, term, "Person"])
    for position, term in enumerate(person_word)
]
person_word

[array(['270502', 'people', 'Person'], dtype='<U11'),
 array(['270503', 'someone', 'Person'], dtype='<U11'),
 array(['270504', 'child', 'Person'], dtype='<U11'),
 array(['270505', 'adult', 'Person'], dtype='<U11'),
 array(['270506', 'worker', 'Person'], dtype='<U11'),
 array(['270507', 'individual', 'Person'], dtype='<U11'),
 array(['270508', 'reason', 'Person'], dtype='<U11'),
 array(['270509', 'personality', 'Person'], dtype='<U11'),
 array(['270510', 'subject', 'Person'], dtype='<U11'),
 array(['270511', 'philosophy', 'Person'], dtype='<U11'),
 array(['270512', 'anyone', 'Person'], dtype='<U11'),
 array(['270513', 'man', 'Person'], dtype='<U11'),
 array(['270514', 'woman', 'Person'], dtype='<U11'),
 array(['270515', 'one', 'Person'], dtype='<U11'),
 array(['270516', 'patient', 'Person'], dtype='<U11'),
 array(['270517', 'victim', 'Person'], dtype='<U11'),
 array(['270518', 'language', 'Person'], dtype='<U11'),
 array(['270519', 'female', 'Person'], dtype='<U11'),
 array(['270520', '

In [22]:
# Check the shape of a single converted row.
person_word[0].shape

(3,)

In [26]:
# Append the "Person" rows to the corpus. Both inputs are flattened first,
# so the result is a single 1-D array of index, word, topic, index, word, ...
# values that has to be reshaped back into rows afterwards.
topic_modeling_df = np.concatenate(
    (np.ravel(topic_modeling_df), np.ravel(person_word))
)

In [28]:
# Inspect the array's shape after the append (np.append without `axis`
# returns a flattened 1-D array).
topic_modeling_df.shape

(922506,)

In [37]:
# Restore the (rows, 3) layout of [index, word, topic]. Using -1 lets NumPy
# infer the row count, and keeps the cell safe to re-run on an array that is
# already two-dimensional.
topic_modeling_df = topic_modeling_df.reshape(-1, 3)

In [38]:
# Wrap the combined array back into a DataFrame; "Indexs" holds the old
# row-index values carried along from the CSV and the appended rows.
topic_modeling_df = pd.DataFrame(topic_modeling_df, columns=["Indexs", "word", "topic"])
topic_modeling_df

Unnamed: 0,Indexs,word,topic
0,0,algorithm,Computer Science
1,1,mathematics,Computer Science
2,2,computer engineering,Computer Science
3,3,science,Computer Science
4,4,ibm,Computer Science
...,...,...,...
307497,307497,job house,Person
307498,307498,sell off,Person
307499,307499,merchant bank,Person
307500,307500,grey market,Person


In [41]:
# Remove the carried-over index column, leaving only "word" and "topic".
topic_modeling_df = topic_modeling_df.drop(columns=["Indexs"])

In [42]:
# Display the table after the "Indexs" column has been dropped.
topic_modeling_df

Unnamed: 0,word,topic
0,algorithm,Computer Science
1,mathematics,Computer Science
2,computer engineering,Computer Science
3,science,Computer Science
4,ibm,Computer Science
...,...,...
307497,job house,Person
307498,sell off,Person
307499,merchant bank,Person
307500,grey market,Person


In [43]:
# Overwrite the corpus file with the extended table. The row index is written
# as an unnamed first column, matching what the loading cells expect.
topic_modeling_df.to_csv("./topic_modeling.csv")