### Steps
1. Tokenization of sentences. Removing punctuations.
2. Create dictionary and mini-corpus using BOW
3. Save generated corpus
4. Compute TF IDF
5. Interpret the results
6. Apply N-Gramming

### Topics
How developers see AI \
How ai is affecting our industry \
AI being used on education

### Saving urls from Google search results

In [18]:
import pprint
from dotenv import dotenv_values
from googleapiclient.discovery import build

env = dotenv_values(".env")

service = build(
    "customsearch", "v1", developerKey=env['DEVELOPER_KEY']
)

offset=0
urls = []
while offset < 100:
    res = (
        service.cse()
        .list(
            q="AI artificial intelligence homeworks assignment education advantages disadvantages pros cons",
            cx=env['SEARCH_ENGINE_ID'],
            num=10,
            start=1+offset
        )
        .execute()
    )
    for item in res['items']:
        urls.append(item['link'])
    offset+=10

In [19]:
print(len(urls))
print(urls[:5])

100
['https://www.nea.org/nea-today/all-news-articles/chatgpt-enters-classroom-teachers-weigh-pros-and-cons', 'https://www.rochester.edu/newscenter/chatgpt-artificial-intelligence-ai-chatbots-education-551522/', 'https://www.forbes.com/sites/theyec/2023/02/21/ai-in-the-classroom-pros-cons-and-the-role-of-edtech-companies/', 'https://elearningindustry.com/pros-and-cons-of-using-ai-in-learning-chatgpt-helping-or-hindering-learning-outcomes', 'https://www.hurix.com/chat-gpt-pros-and-cons-of-using-chatgpt-in-higher-education/']


In [21]:
with open("urls.txt", "w") as file:
    for url in urls:
        file.write(url + "\n")

### Scraping from given urls

In [2]:
urls = []
with open("urls.txt", "r", encoding="utf-8") as file:
    urls = file.readlines()

In [30]:
len(urls)

100

In [5]:
import json

def scrape_url(url, client):
    return client.get(url, 
        params={
            # Block ads on the page you want to scrape	
            'block_ads': True,
            # Block images and CSS on the page you want to scrape	
            'block_resources': True,
            # Control the device the request will be sent from	
            'device': 'desktop',
            # Use some data extraction rules
            'extract_rules': {
                # 'text': 'body'
                'text': {
                    'selector': 'p',
                    'type': 'list'
                }
            },
            # Wrap response in JSON
            'json_response': True,
            # Use premium proxies to bypass difficult to scrape websites (10-25 credits/request)
            'premium_proxy': False,
            # Execute JavaScript code with a Headless Browser (5 credits/request)
            'render_js': True,
            # Return the original HTML before the JavaScript rendering	
            'return_page_source': False,
            # Return page screenshot as a png image
            'screenshot': False,
            # Take a full page screenshot without the window limitation
            'screenshot_full_page': False,
            # Transparently return the same HTTP code of the page requested.
            'transparent_status_code': False,
            # Wait, in miliseconds, before returning the response
            'wait': 0,
            # Wait for CSS selector before returning the response, ex ".title"
            'wait_for': '',
            'wait_browser': 'load',
            # Set the browser window width in pixel
            'window_width': 1080,
            # Set the browser window height in pixel
            'window_height': 720
        },
        headers={
            # Forward custom headers to the target website
            # "key": "value"
        },
        cookies={
            # Forward custom cookies to the target website
            # "name": "value"
        }
    )

def extract_text(json_content):
    content = json.loads(json_content)
    try:
        str = ''.join(content['body']['text'])
    except:
        str = ""
    return str


In [3]:
from dotenv import dotenv_values
from scrapingbee import ScrapingBeeClient

env = dotenv_values(".env")

client = ScrapingBeeClient(api_key=env["API_KEY"])

documents = []
content = ""
for i in range(30):
    response = scrape_url(urls[i].strip(), client)
    str = extract_text(response.content)
    
    print(f"{i}: {len(str)}")

    documents.append(str)

0: 8162
1: 12100
2: 6101
3: 8031
4: 8662
5: 4388
6: 5122
7: 1270
8: 5504
9: 7633
10: 9522
11: 367770
12: 10563
13: 7418
14: 43
15: 7067
16: 28657
17: 9097
18: 1343
19: 4153
20: 7949
21: 0
22: 8831
23: 281890
24: 8653
25: 7302
26: 34986
27: 8107
28: 39072
29: 13152


In [6]:
len(documents[21])

0

In [27]:
fail_response = scrape_url(urls[31].strip(), client)
fail_str = extract_text(response.content)
documents.append(fail_str)
len(fail_str)

8662

In [28]:
for i in range(len(documents)):
    print(f"{i}: {len(documents[i])}")

0: 8162
1: 12100
2: 6101
3: 8031
4: 8662
5: 4388
6: 5122
7: 1270
8: 5504
9: 7633
10: 9522
11: 367770
12: 10563
13: 7418
14: 43
15: 7067
16: 28657
17: 9097
18: 1343
19: 4153
20: 7949
21: 8831
22: 281890
23: 8653
24: 7302
25: 34986
26: 8107
27: 39072
28: 13152
29: 8662


In [31]:
with open("docs.txt", "w", encoding="utf-8") as file:
    for doc in documents:
        file.write(doc + "\n")

### Create mini corpus

In [32]:
documents = []
with open("docs.txt", "r", encoding="utf-8") as file:
    for line in file:
        documents.append(line)

In [75]:
from gensim import corpora
import spacy
from spacy.tokens import Token

nlp = spacy.load('en_core_web_sm')

# Function to check if a token represents a month
def is_month(token):
    # Define a list of month names
    month_names = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
    return token.text.lower() in month_names
# Add the custom attribute to the Token class
Token.set_extension('is_month', getter=is_month, force=True)

texts = []
for document in documents:
    text = []
    doc = nlp(document)
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.is_space and not w.is_currency \
            and not w.like_num and not w.like_email and not w.like_url \
            and not w._.is_month and not len(w.text) <= 2 \
            and w.text[0].isalpha() and w.text[-1].isalpha():
            text.append(w.lemma_)
    texts.append(text)
#texts is a mini-corpus specifically for 
print(texts)

[['OpenAI', 'artificial', 'intelligence', 'research', 'laboratory', 'launch', 'ChatGPT', 'transformative', 'program', 'ChatGPT', 'impact', 'education', 'center', 'heated', 'debate', 'recent', 'survey', 'conduct', 'online', 'learning', 'platform', 'show', 'percent', 'educator', 'feel', 'program', 'job', 'difficult', 'nearly', 'teacher', 'predict', 'life', 'easy', 'hand', 'educator', 'fear', 'program', 'threaten', 'academic', 'integrity', 'encourage', 'new', 'method', 'cheating', 'plagiarism', 'program', 'simplicity', 'accessibility', 'convenience', 'student', 'generate', 'answer', 'homework', 'entire', 'essay', 'claim', 'chatbot', 'writing', 'ChatGPT', 'simple', 'design', 'brainstorm', 'capability', 'appeal', 'educator', 'potential', 'improve', 'education', 'teacher', 'time', 'real', 'impact', 'increase', 'cheating', 'revitalization', 'lesson', 'plan', 'classroom', 'instruction', 'say', 'Cherie', 'Shields', 'high', 'school', 'english', 'teacher', 'Sandy', 'Oregon', 'good', 'way', 'learn

Apply bi-gramming to our mini corpus

In [76]:
import gensim
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
texts

[['OpenAI',
  'artificial_intelligence',
  'research',
  'laboratory',
  'launch',
  'ChatGPT',
  'transformative',
  'program',
  'ChatGPT',
  'impact',
  'education',
  'center',
  'heated',
  'debate',
  'recent',
  'survey',
  'conduct',
  'online',
  'learning',
  'platform',
  'show',
  'percent',
  'educator',
  'feel',
  'program',
  'job',
  'difficult',
  'nearly',
  'teacher',
  'predict',
  'life',
  'easy',
  'hand',
  'educator',
  'fear',
  'program',
  'threaten',
  'academic',
  'integrity',
  'encourage',
  'new',
  'method',
  'cheating',
  'plagiarism',
  'program',
  'simplicity',
  'accessibility',
  'convenience',
  'student',
  'generate',
  'answer',
  'homework',
  'entire',
  'essay',
  'claim',
  'chatbot',
  'writing',
  'ChatGPT',
  'simple',
  'design',
  'brainstorm',
  'capability',
  'appeal',
  'educator',
  'potential',
  'improve',
  'education',
  'teacher',
  'time',
  'real',
  'impact',
  'increase',
  'cheating',
  'revitalization',
  'lesson_p

Creating a BOW representation of the mini-corpus

In [77]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3)
print(dictionary.token2id)

{'ChatGPT': 0, 'New': 1, 'OpenAI': 2, 'Press': 3, 'able': 4, 'academic': 5, 'accessibility': 6, 'accessible': 7, 'accord': 8, 'account': 9, 'acknowledge': 10, 'add': 11, 'address': 12, 'adjust': 13, 'affect': 14, 'age': 15, 'align': 16, 'article': 17, 'ask': 18, 'assignment': 19, 'author': 20, 'available': 21, 'away': 22, 'ban': 23, 'believe': 24, 'biased': 25, 'big': 26, 'body': 27, 'book': 28, 'brainstorm': 29, 'build': 30, 'calculator': 31, 'capability': 32, 'center': 33, 'character': 34, 'chat': 35, 'chatbot': 36, 'chatgpt': 37, 'cheat': 38, 'cheating': 39, 'check': 40, 'choice': 41, 'claim': 42, 'class': 43, 'company': 44, 'compare': 45, 'complete': 46, 'concept': 47, 'concern': 48, 'conduct': 49, 'conversational': 50, 'correctly': 51, 'course': 52, 'create_new': 53, 'creator': 54, 'critical': 55, 'database': 56, 'debate': 57, 'design': 58, 'detail': 59, 'detection': 60, 'determine': 61, 'different': 62, 'difficult': 63, 'discuss': 64, 'easy': 65, 'edit': 66, 'educational': 67, 'e

Using the doc2bow method, which, as the name suggests, helps convert our document to bag-of-words.

In [78]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 16),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 3),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 3),
  (19, 7),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 4),
  (36, 7),
  (37, 6),
  (38, 2),
  (39, 2),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 2),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 3),
  (63, 1),
  (64, 1),
  (65, 4),
  (66, 2),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 2),
  (72, 1),
  (73, 1),
  (74, 9),
  (75, 2),
  (76, 1),
  (77, 6),
  (78, 1),
  (79, 1),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 2),
  (87, 1),
  (88, 3),
  (89, 2),
  (90, 1),
  (91, 1

Saving corpus

In [79]:
corpora.MmCorpus.serialize('./tmp/corpus.mm', corpus)

Converting Bag-of-Words to TF-IDF representation

In [80]:
from gensim import models
tfidf = models.TfidfModel(corpus)

for document in tfidf[corpus]:
       print(document)

[(0, 0.2633043808044327), (1, 0.049718613112184284), (2, 0.04744361414201365), (3, 0.049718613112184284), (4, 0.02166382017195058), (5, 0.05935507599907288), (6, 0.038688600960586815), (7, 0.028540038514451128), (8, 0.038688600960586815), (9, 0.028540038514451128), (10, 0.049718613112184284), (11, 0.034751819222604285), (12, 0.02166382017195058), (13, 0.049718613112184284), (14, 0.04350683240403112), (15, 0.038688600960586815), (16, 0.049718613112184284), (17, 0.016456523800277043), (18, 0.05935507599907288), (19, 0.12639691134670525), (20, 0.038688600960586815), (21, 0.034751819222604285), (22, 0.04350683240403112), (23, 0.04350683240403112), (24, 0.025996806041177458), (25, 0.034751819222604285), (26, 0.028540038514451128), (27, 0.049718613112184284), (28, 0.034751819222604285), (29, 0.04350683240403112), (30, 0.016456523800277043), (31, 0.049718613112184284), (32, 0.014966793889579992), (33, 0.034751819222604285), (34, 0.049718613112184284), (35, 0.1740273296161245), (36, 0.13849517

Highest TFIDF

In [81]:
x = 1
for doc in tfidf[corpus]:
    max_a = None
    max_b = float('-inf')
    for a, b in doc:
        if b > max_b:
            max_a = a
            max_b = b
    word = next(key for key, value in dictionary.token2id.items() if value == a)
    print(f"Doc# {x}: {word} ({a}) {b}")
    x += 1

Doc# 1: writing (209) 0.07116542121302048
Doc# 2: weigh (406) 0.030327185528362093
Doc# 3: valuable (511) 0.04798262450576521
Doc# 4: wave (610) 0.05279177560290786
Doc# 5: workforce (730) 0.044350512635040196
Doc# 6: workload (767) 0.07057311231972106
Doc# 7: traffic (804) 0.07562437959369156
Doc# 8: section (809) 0.160150902835234
Doc# 9: wrong (860) 0.0611475788380617
Doc# 10: visit (921) 0.05111475126442086
Doc# 11: trend (954) 0.051122640418675334
Doc# 12: stream (955) 1.0
Doc# 13: widely (1003) 0.053391887966946716
Doc# 14: virtual_assistant (1027) 0.05567658558207399
Doc# 15: enable (259) 1.0
Doc# 16: workshop (1044) 0.065277730880679
Doc# 17: watch (1087) 0.08335173791617342
Doc# 18: wide_range (1104) 0.04467990755671582
Doc# 19: dilemma (1105) 0.340390330000262
Doc# 20: young (1109) 0.18080848818628315
Doc# 21: tone (1117) 0.04408908123832287
Doc# 22: regardless (1120) 0.05053728102935418
Doc# 23: final (1015) 0.013886323009141418
Doc# 24: keyword (1124) 0.06589784897712136
Do