In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

import json
import csv
import urllib.request

from selenium.webdriver.firefox.options import Options

Some useful links on scraping with Selenium:
https://selenium-python.readthedocs.io/locating-elements.html
https://www.scrapingbee.com/blog/selenium-python/

In [2]:
# Name of the CSV file that the scraping cells below create and append book rows to.
csv_filename = 'tactile_books.csv'

In [3]:
# Create an empty CSV file that contains only the header row, before running the scraper.
# WARNING: mode 'w' truncates the file, so re-running this cell with the same
# filename erases every row that was scraped earlier.

# Column order of the CSV; the scraping loop reuses `fieldnames` when appending rows.
fieldnames = ['title','subtitle','date','txt','gif','pdf','openURL', 'index']

# UTF-8 matches the encoding used when rows are appended and when the file is read back.
# The `with` statement closes the file, so no explicit close() call is needed.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()

In [4]:
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium import webdriver

# Start a visible (non-headless) Firefox session and open the archive.org search results.
options = Options()
options.headless = False

firefox_binary_path = 'C:/Program Files/Mozilla Firefox/firefox.exe'  # Update with the correct path to your Firefox binary

# Raw string: the Windows path contains backslashes (`\A`, `\g`) that are not valid
# escape sequences and trigger a warning in a normal string literal.
geckodriver_path = r'D:\APP\geckodriver-v0.33.0-win32\geckodriver'  # Update with the correct path to your geckodriver

firefox_binary = FirefoxBinary(firefox_binary_path)
# NOTE(review): `firefox_binary=` and `executable_path=` are deprecated in Selenium 4
# in favour of a Service object — confirm the installed Selenium version before upgrading.
driver = webdriver.Firefox(firefox_binary=firefox_binary, executable_path=geckodriver_path, options=options)
driver.get('https://archive.org/details/texts?query=tactile&and[]=mediatype%3A%22texts%22&and[]=languageSorter%3A%22English%22&and[]=lending___status%3A%22is_readable%22')

In [5]:
def _href_or_none(driver, partial_text):
    """Return the href of the first link whose text contains `partial_text`, or None if the lookup fails."""
    try:
        return driver.find_element(By.PARTIAL_LINK_TEXT, partial_text).get_attribute('href')
    except Exception:
        return None


def _text_or_none(driver, xpath):
    """Return the text of the element located by `xpath`, or None if the lookup fails."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        return None


# Scrape the result list in 20 rounds. Each round visits the last 75 result links
# currently present on the page, appends one CSV row per book, then scrolls to the
# bottom of the page and waits before the next round.
for i in range(20):
    print(i)

    books = driver.find_elements(By.CLASS_NAME, 'item-ttl.C.C2')
    book_links = [b.find_element(By.TAG_NAME, 'a').get_attribute('href') for b in books][-75:]

    for b in book_links:
        driver.get(b)

        # Reset per book so a book without downloads never inherits the previous
        # book's links (and the first such book does not raise NameError).
        txt = gif = pdf = None

        try:
            driver.find_element(By.PARTIAL_LINK_TEXT, 'SHOW ALL').click()
            downloads = True
        except Exception:
            # No "SHOW ALL" link (or it could not be clicked): stay on the current page.
            downloads = False

        if downloads:
            txt = _href_or_none(driver, '.txt')
            gif = _href_or_none(driver, '.gif')
            pdf = _href_or_none(driver, '.pdf')

            # Undo the navigation caused by the click. Only done when the click
            # happened; otherwise the metadata below would be read from the wrong page.
            driver.back()

        title = _text_or_none(driver, "/html/body/div[1]/main/div[4]/div/div/div[2]/h1/span")
        subtitle = _text_or_none(driver, "/html/body/div[1]/main/div[4]/div/div/div[2]/dl/dd/span/a")
        date = _text_or_none(driver, "/html/body/div[1]/main/div[5]/div/div/div[1]/div[2]/dl[1]/dd")

        # The last URL segment is used to build the local text file name.
        iName = b.split('/')[-1]
        index = 'OL'+iName+'.txt'

        book = {'title':title,
                'subtitle': subtitle,
                'date': date,
                'txt': txt,
                'gif': gif,
                'pdf': pdf,
                'openURL':b,
                'index': index
               }

        # Append immediately so rows already scraped survive a crash later in the run.
        with open(csv_filename, 'a', newline = '', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
            writer.writerow(book)

        # Return to the result list.
        driver.back()

    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(3)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [6]:
import pprint
import nltk.corpus
import random
import os
import gensim

The following part is based on this Gensim tutorial:
https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

In [7]:
# Download the NLTK 'words' package; nltk.corpus.words is read later in the notebook.
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\itsfr\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [8]:
# Read every row of the scraped CSV back into memory as a list of dicts keyed by the header names.
csv_filename = 'tactile_books.csv'  # Replace with your filename

with open(csv_filename, newline='', encoding='utf-8') as csvfile:
    read_books = list(csv.DictReader(csvfile))

In [9]:
# Show the second row read from the CSV as a sanity check.
read_books[1]

{'title': 'Miniaturization of a microcontroller for the tactile situational awareness system.',
 'subtitle': 'Wood, Terrence L.',
 'date': '1999-06-01',
 'txt': 'https://archive.org/download/miniaturizationo00wood/miniaturizationo00wood_djvu.txt',
 'gif': 'https://archive.org/download/miniaturizationo00wood/miniaturizationo00wood.gif',
 'pdf': 'https://archive.org/download/miniaturizationo00wood/miniaturizationo00wood.pdf',
 'openURL': 'https://archive.org/details/miniaturizationo00wood',
 'index': 'OLminiaturizationo00wood.txt'}

In [10]:
# Number of rows read from the CSV.
len(read_books)

520

In [11]:
import os
import urllib.request

def save_text(book, directory):
    """Download the text file of `book` and store it under `directory`.

    Parameters
    ----------
    book : dict
        Must provide 'txt' (URL of the text file, may be empty) and 'index'
        (file name used for the saved copy).
    directory : str
        Target directory; created, including parents, if it does not exist.

    Returns
    -------
    None. When book['txt'] is empty a message is printed and nothing is written.

    Notes
    -----
    Every downloaded line is stored as ``str(line)`` of a bytes object, i.e. its
    ``b'...'`` repr. The paragraph-splitting cell later in the notebook parses
    exactly this format, so it is kept on purpose.
    """
    # Nothing to download when the 'txt' field is empty.
    if not book['txt']:
        print(f"No 'txt' URL found for book: {book['index']}")
        return

    # The context manager closes the response even if reading fails part-way.
    with urllib.request.urlopen(book['txt']) as response:
        text = "".join(str(line) for line in response)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    with open(os.path.join(directory, book['index']), "w") as f:
        f.write(text)

In [12]:
def isEnglish(book, word_dictionary, directory='books', sample_size=1000):
    """Heuristically decide whether the saved text of `book` contains English words.

    Up to `sample_size` randomly chosen space-separated tokens of the file are
    looked up in `word_dictionary`; a single hit is enough to return True.

    Parameters
    ----------
    book : dict
        Must provide 'index', the file name of the saved text.
    word_dictionary : container of str
        Known words; passing a set makes each lookup constant-time.
    directory : str, optional
        Directory that holds the saved text files (default 'books').
    sample_size : int, optional
        Maximum number of tokens to test (default 1000).

    Returns
    -------
    bool
        True if at least one sampled token is in `word_dictionary`, else False.

    Raises
    ------
    OSError
        If the text file cannot be opened (e.g. it was never downloaded).
    """
    filename = os.path.join(directory, book['index'])

    # Close the file promptly instead of leaking the handle.
    with open(filename, 'r') as f:
        text = f.read()

    words = text.split(' ')

    # Draw the random subset directly rather than shuffling the whole token list.
    sample = random.sample(words, min(sample_size, len(words)))

    return any(w in word_dictionary for w in sample)

In [13]:
# Download the text of every scraped book into the 'books' directory.
for b in read_books:
    try:
        # save_text returns nothing, so its result is not assigned.
        save_text(b, 'books')
    except OSError as err:
        # urllib's URLError/HTTPError and file-system errors derive from OSError;
        # report the failure and continue so one bad URL does not abort the batch.
        print(f"Could not save text for book {b['index']}: {err}")

In [14]:
# Keep only the books whose saved text passes the isEnglish check.
read_english_books = []
word_dictionary = nltk.corpus.words.words()
# Set copy of the word list so each membership test inside isEnglish is constant-time.
word_set = set(word_dictionary)
for b in read_books:
    # save_text writes no file for books without a 'txt' URL; skip those instead
    # of letting isEnglish fail on a missing file.
    if not os.path.exists(os.path.join('books', b['index'])):
        continue
    if isEnglish(b, word_set):
        read_english_books.append(b)

In [15]:
# Report how many books were read and how many passed the English filter, one count per line.
print(len(read_books), len(read_english_books), sep='\n')

520
500


In [16]:
# Split every English book into paragraphs. The two lists are kept parallel:
# paragraph_book_index[k] is the 'index' of the book that combined_paragraphs[k] came from.
combined_paragraphs = []
paragraph_book_index = []

for b in read_english_books:
    filename = os.path.join('books',b['index'])
    # Close each file as soon as it is read instead of leaking one handle per book.
    with open(filename, "r") as f:
        text = f.read()

    # The files contain the repr of each downloaded bytes line, so a blank source
    # line appears as the literal text b'\n'; that marks paragraph boundaries.
    paragraphs = text.split("b'\\n'")
    # Strip the remaining bytes-repr wrappers (b' / b" prefixes, \n' / \n" suffixes).
    paragraphs2 = [x.replace("b'","").replace('b"', "").replace("\\n'","").replace('\\n"',"") for x in paragraphs]
    # Drop leftover backslashes and carets.
    paragraphs3 = [x.replace("\\", "").replace('^', "") for x in paragraphs2]
    # Keep only paragraphs longer than 100 characters.
    paragraphs4 = [x for x in paragraphs3 if len(x)>100]

    combined_paragraphs.extend(paragraphs4)
    paragraph_book_index.extend([b['index']] * len(paragraphs4))

In [17]:
# Total number of paragraphs collected across all books.
len(combined_paragraphs)

492960

In [18]:
# Show the first collected paragraph as a sanity check.
combined_paragraphs[0]

"This thesis examines the effects of underwater submersion and prolonged underwater submersion on a diver's tactile sensitivity. The method of constant stimulus is used to determine size discrimination thresholds. The stimuli used are squares of hard acrylic plastic into which holes of varying diameters have been drilled. "

In [19]:
# Turn the paragraphs into token lists and build the corpus for gensim.
stoplist = set('for a of the and to in'.split(' '))

# Normalisation and filtering are deliberately two separate passes: combining
# them (as was done by mistake in the lecture) let words through in a form
# they should not have had.

# Pass 1: lowercase, split on whitespace, strip full stops and commas.
texts = [[word.replace(".","").replace(",","") for word in document.lower().split()]
         for document in combined_paragraphs]

# Pass 2: drop stop words and tokens of two characters or fewer.
texts = [[word for word in text if (word not in stoplist and len(word)>2)]
         for text in texts]

# Keep only purely alphabetic tokens; paragraphs left with fewer than 20 of
# them are marked for removal.
to_delete = []
for i, t in enumerate(texts):
    test = [w for w in t if w.isalpha()]
    if len(test) < 20:
        to_delete.append(i)
    else:
        texts[i] = test

# Remove the marked paragraphs from all three parallel lists in one pass each.
# (Deleting items one by one from lists of this size is quadratic.)
# Slice assignment keeps the existing list objects, as the in-place deletes did.
doomed = set(to_delete)
texts = [t for i, t in enumerate(texts) if i not in doomed]
combined_paragraphs[:] = [p for i, p in enumerate(combined_paragraphs) if i not in doomed]
paragraph_book_index[:] = [x for i, x in enumerate(paragraph_book_index) if i not in doomed]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus[0])
        

['this',
 'thesis',
 'examines',
 'effects',
 'underwater',
 'submersion',
 'prolonged',
 'underwater',
 'submersion',
 'tactile',
 'sensitivity',
 'method',
 'constant',
 'stimulus',
 'used',
 'determine',
 'size',
 'discrimination',
 'thresholds',
 'stimuli',
 'used',
 'are',
 'squares',
 'hard',
 'acrylic',
 'plastic',
 'into',
 'which',
 'holes',
 'varying',
 'diameters',
 'have',
 'been',
 'drilled']


In [20]:
import pickle

# Save the processed_corpus object to a specific directory
# NOTE(review): the path is absolute and machine-specific; the open() call does not
# create the F:/TxtDataset directory, so it has to exist already.
with open('F:/TxtDataset/processed_corpus.pkl', 'wb') as f:
    pickle.dump(processed_corpus, f)

In [21]:
# Load the processed_corpus object from a specific directory
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open('F:/TxtDataset/processed_corpus.pkl', 'rb') as f:
    loaded_corpus = pickle.load(f)