# 1. Read science excel sheet for text IDs
- each sheet represents one corpus genre (Magazine, Newspaper, Academic)
- all entries(row) are specfications on sci-related texts 
- the actual text data is stored in a different folder and will be matched with their IDs in step 2

In [1]:
import xlrd
workbook = xlrd.open_workbook(r'D:\COCA & GloWbE\COCA\COCA Science Texts.xlsx')
# Print out sheet name
for i, sheet in enumerate(workbook.sheets()):
    print("Sheet",i+1 ,":" , sheet.name)

Sheet 1 : Magazine
Sheet 2 : Academic
Sheet 3 : Newspaper


The excel sheet is already sorted for science-related texts, so we only need the text IDs (1st column from each sheet)

In [121]:
ID_to_title_list = []
for sheet in workbook.sheets():
    
    # List of Xlrd.cells
    first_col = sheet.col(0) 
    sixth_col = sheet.col(5) 
    
    # Change cells to string, index from [1:] to skip header
    first_col = [str(cell) for cell in first_col[1:]]
    sixth_col = [str(cell) for cell in sixth_col[1:]]
    
    # Zip two columns and store item as list in a temporal list
    ID_title_pair = [list(item) for item in zip( first_col, sixth_col ) ] 

    # append list of lists to main list
    ID_to_title_list.append( ID_title_pair ) 
    
    

Resulting data structure is a list of list, as there are three sheets<br>
each item is a length 2 list with text ID & title

In [108]:
print(type(ID_to_title_list ))
ID_to_title_list 

<class 'list'>


[[['number:2024318.0', "text:'Crime bytes back. (cover story)'"],
  ['number:2024319.0', "text:'Trouble in paradise.'"],
  ['number:2024320.0', "text:'The catch.'"],
  ['number:2024322.0', "text:'Once upon a time in America.'"],
  ['number:2024278.0', "text:'The big ear. (cover story)'"],
  ['number:2024279.0', "text:'Snapshots from the butterfly plague.'"],
  ['number:2024280.0', "text:'Field of dreams? (cover story)'"],
  ['number:2024282.0', "text:'Stories from the freedom trail.'"],
  ['number:2024284.0', "text:'The dead love you.'"],
  ['number:2024286.0',
   'text:"What the government isn\'t saying about UFO\'s. (cover story)"'],
  ['number:2024288.0', "text:'The alien almanac. (cover story)'"],
  ['number:2024325.0', "text:'Voyage to a far planet. (cover story)'"],
  ['number:2024327.0', 'text:"Einstein\'s Law."'],
  ['number:2024329.0', "text:'Latter-day Martian chronicles. (cover story)'"],
  ['number:2024331.0', "text:'Mosquito.'"],
  ['number:2024290.0', "text:'Making fun. (

#### Because the data was extracted from Excel, xlrd.cell returns data with redundant labels:
The text ID (number) and its' corresponding title (text)<br>
We filter out the data with regEx

In [109]:
print( # List of list
    type(ID_to_title_list [0]), "\n",
           ID_to_title_list[0][0:4], "\n\n",
       # list inside sublists
      type(ID_to_title_list [0][0]), "\n",
           ID_to_title_list[0][0])

<class 'list'> 
 [['number:2024318.0', "text:'Crime bytes back. (cover story)'"], ['number:2024319.0', "text:'Trouble in paradise.'"], ['number:2024320.0', "text:'The catch.'"], ['number:2024322.0', "text:'Once upon a time in America.'"]] 

 <class 'list'> 
 ['number:2024318.0', "text:'Crime bytes back. (cover story)'"]


#### The pattern we are trying to remove is the bold & italic part:
- [ ***number:***2024322***.0***, ***text:'***Once upon a time in America.***'*** ]<br> 
- note the apostrophes in the second item

In [122]:
import re

pattern1 = re.compile(r'number:')
pattern2 = re.compile(r'[.]0')
pattern3 = re.compile(r"text:[']")

for subList in ID_to_title_list:
    
    for item in subList:
            
        item[0] = pattern1.sub('', str(item[0]))
        item[0] = pattern2.sub('', str(item[0]))
        
        item[1] = pattern3.sub('', str(item[1]))
        item[1] = item[1][:-1] # to drop the additional apostrophe at the end

In [123]:
ID_to_title_list

[[['2024318', 'Crime bytes back. (cover story)'],
  ['2024319', 'Trouble in paradise.'],
  ['2024320', 'The catch.'],
  ['2024322', 'Once upon a time in America.'],
  ['2024278', 'The big ear. (cover story)'],
  ['2024279', 'Snapshots from the butterfly plague.'],
  ['2024280', 'Field of dreams? (cover story)'],
  ['2024282', 'Stories from the freedom trail.'],
  ['2024284', 'The dead love you.'],
  ['2024286',
   'text:"What the government isn\'t saying about UFO\'s. (cover story)'],
  ['2024288', 'The alien almanac. (cover story)'],
  ['2024325', 'Voyage to a far planet. (cover story)'],
  ['2024327', 'text:"Einstein\'s Law.'],
  ['2024329', 'Latter-day Martian chronicles. (cover story)'],
  ['2024331', 'Mosquito.'],
  ['2024290', 'Making fun. (cover story)'],
  ['2024292', 'Tower of Babylon.'],
  ['2024293', 'Truth & consequences.'],
  ['2024295', 'The mechanics of mysticism.'],
  ['2024297', 'Apres moi.'],
  ['2024301',
   'text:"Why can\'t a man be more like a woman...and vice ver

- text_IDs[0] = Magazine
- text_IDs[1] = Academic
- text_IDs[2] = Newspaper

# 2. Finding matching texts using their IDs

In [124]:
mag_dir = "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Doc2Vec Docs - Mag\\"
acad_dir = "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Doc2Vec Docs - Acad\\"
news_dir = "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Doc2Vec Docs - News\\" 

directory_list = [mag_dir, acad_dir, news_dir]

destination_dir = "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\" 


In [132]:
import re
import os

file_number = 1

for num, directory in enumerate(directory_list): # loops three times (three dirs)
    
    # Get according sheet
    ID_list = [item[0] for item in ID_to_title_list[num] ]
    title_list = [item[1] for item in ID_to_title_list[num] ]
    
    # title name
    if num == 0:
        file_genre = "mag_"
    elif num == 1:
        file_genre = "acad_"
    elif num == 2: 
        file_genre = "news_"
    
    
    for file in os.listdir(directory): # chekcs all file in each dir
        
        with open(directory + file) as f: # better than simple open() & close(), as it accounts for errors and f.close() on its own 
            text = f.read()
        ID_of_text = text[3:10] # get the ID numbers, which are at the front part of a text file
        
        for index, ID in enumerate(ID_list): 
            if (ID_of_text == ID): # check for ID in ID list

                text = text[10:] # rids the ID number
                title = "%&% "+ title_list[index] + "%&%" # title with delimters
                text = title + text # title at the beginning

                with open(destination_dir + file_genre + index + '.txt', 'w') as new_file:
                    new_file.write(text)
                    file_number +=1

                if (file_number % 500 == 0):
                    print(file_number, "files written" )        

500 files written
1000 files written
1500 files written


FileNotFoundError: [Errno 2] No such file or directory: "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\textWhen Empires Strike Back [Corrected 021204]; Here's hoping this month's release of the Hollywood sea-fighting epic Master and Commander will do justice to those magnificent men and their sailing machines. On these pages, the mightiest ships of then and now..txt"