> **Talk about (1) and outliers**

In [1]:
#import libraries 
import re
import json 
import gzip
import glob
import nltk
import string
import zipfile
import numpy as np
import pandas as pd

from nltk.corpus import brown
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader

# Data Cleaning

In [2]:
# read file        
# fname = "ProQuestDocuments-2020-10-160_2017_1857295624".split("_")[0]
# with open('2010-2020 unformatted/'+fname+'.txt', 'r') as f:
#     print(f.read())

## 1) Categorize on years (1960-1979)

> Since the text itself has no 'year' section, only the files listed in the CSV are read, and the year is taken from the CSV. This leaves 90 files missing.

In [3]:
!rm -r CLEANSED/
!mkdir CLEANSED
!mkdir CLEANSED/1950-1959/
!mkdir CLEANSED/1960-1969/
!mkdir CLEANSED/1970-1979/
!mkdir CLEANSED/1980-1989/
!mkdir CLEANSED/1990-1999/
!mkdir CLEANSED/2000-2009/
!mkdir CLEANSED/2010-2019/
!mkdir CLEANSED/2020-2029/
!mkdir CLEANSED/outlier/

In [4]:
# Copy every 1960-1979 raw dump into the CLEANSED folder of its decade.
content = {}
counter_1960_1969 = 0
counter_1970_1979 = 0
counter_1960_1979_outlier = 0
files = glob.glob('RAW/1960-1979_dump/*.txt')

# The year of each document is taken from the metadata CSV, keyed by StoreId.
df = pd.read_csv("METADATA/1960-1979.csv", usecols=['StoreId', 'year'])
id_year = dict(zip(df['StoreId'], df['year']))

# Pair every path with its base name once instead of re-splitting it for each StoreId.
named_files = [(path, path.split("/")[-1]) for path in files]

for store_id, year in id_year.items():
    store_prefix = str(store_id)
    for file_id, file_name in named_files:
        # A file belongs to a StoreId when its name starts with that id.
        if not file_name.startswith(store_prefix):
            continue

        with open(file_id, 'r') as f:
            data = f.read()

        if 1960 <= int(year) <= 1969:
            prefix = '1960-1969/'
            counter_1960_1969+=1
        elif 1970 <= int(year) <= 1979:
            prefix = '1970-1979/'
            counter_1970_1979+=1
        else:
            prefix = "outlier/"
            # Previously incremented `counter_outlier`, which is never defined,
            # so the first out-of-range year raised NameError.
            counter_1960_1979_outlier+=1

        with open("CLEANSED/"+prefix+file_name, 'w') as file:
            file.write(data)
        # Only the first matching file is copied for each StoreId.
        break

print("FINISHED")
print("Total no.of files in source directory: {}".format(len(files)))
print("Total no.of files in 1960-1969 directory: {}".format(counter_1960_1969))
print("Total no.of files in 1970-1979 directory: {}".format(counter_1970_1979))
print("Total no.of files in outlier directory: {}".format(counter_1960_1979_outlier))
print("Number of missing files: {}".format(len(files)-(counter_1960_1969+counter_1970_1979+counter_1960_1979_outlier)))

FINISHED
Total no.of files in source directory: 4248
Total no.of files in 1960-1969 directory: 2900
Total no.of files in 1970-1979 directory: 1258
Total no.of files in outlier directory: 0
Number of missing files: 90


## 1980-2020

In [6]:
# Lower-cased field names of a ProQuest export. The extraction cells pass this tuple
# to str.startswith() to detect the line that ends the "full text:" section.
sections = (
    'title',
    'publication title',
    'publication year',
    'document url',
    'links',
    'section',
    'publication subject',
    'issn',
    'copyright',
    'abstract',
    'publication info',
    'last updated',
    'place of publication',
    'location',
    'author',
    'publisher',
    'identifier / keyword',
    'source type',
    'proquest document id',
    'country of publication',
    'language of publication',
    'publication date',
    'subject',
    'database',
    'document type',
)

### 2) Extract text and categorize on years (1980-2009)

In [7]:
# Pull the full text out of every 1980-2009 ProQuest export and file it by decade.
content = {}
counter_1980_1989 = 0
counter_1990_1999 = 0
counter_2000_2009 = 0
counter_1980_2009_outlier = 0
files = glob.glob('RAW/1980-2009_raw/*.txt')

for file_name in files:
    # Reset per file so a document that lacks a header line can never inherit the
    # year / id parsed from the previous file (or hit NameError on the first file).
    year = None
    document_id = None
    data = ""
    flag = False  # True while the current line is inside the "full text:" section
    with open(file_name, 'r') as f:
        for line in f:
            line = line.lower()
            if line.startswith('proquest document id:'):
                document_id = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('publication year:'):
                year = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('full text:'):
                flag=True
                data=line.split('full text:')[1].replace("\n", "")
            # NOTE(review): this is a prefix test, so a body line that merely starts
            # with one of the field names (e.g. "authorities ...") also ends the
            # capture - confirm against the raw files whether a trailing ':' is needed.
            elif line.strip().startswith(sections):
                flag=False
            elif flag:
                # NOTE(review): lines are joined without a separator; verify that
                # words are not glued together across line breaks.
                data+=line.replace("\n", "")

    if not year or not document_id:
        # Not counted below, so it shows up in "Number of missing files".
        print("Skipping {}: missing publication year or ProQuest document id".format(file_name))
        continue
    content[file_name.split("/")[-1][:-4] + "_" + year + "_" + document_id] = data.strip()

for key, value in content.items():
    # Split from the right so underscores inside the source file name are preserved.
    file_name, year, document_id = key.rsplit("_", 2)
    if 1980<= int(year) <=1989:
        prefix = '1980-1989/'
        counter_1980_1989+=1
    elif 1990<= int(year) <=1999:
        prefix = '1990-1999/'
        counter_1990_1999+=1
    elif 2000<= int(year) <=2009:
        prefix = '2000-2009/'
        counter_2000_2009+=1
    else:
        prefix = "outlier/"
        counter_1980_2009_outlier+=1

    with open("CLEANSED/"+prefix+file_name+"_"+document_id+'.txt', 'w') as file:
        file.write(value)

print("FINISHED")
print("Total no.of files in source directory: {}".format(len(files)))
print("Total no.of files in 1980-1989 directory: {}".format(counter_1980_1989))
print("Total no.of files in 1990-1999 directory: {}".format(counter_1990_1999))
print("Total no.of files in 2000-2009 directory: {}".format(counter_2000_2009))
print("Total no.of files in outlier directory: {}".format(counter_1980_2009_outlier))
print("Number of missing files: {}".format(len(files)-(counter_1980_1989+counter_1990_1999+counter_2000_2009+counter_1980_2009_outlier)))

FINISHED
Total no.of files in source directory: 5141
Total no.of files in 1980-1989 directory: 1771
Total no.of files in 1990-1999 directory: 1520
Total no.of files in 2000-2009 directory: 1850
Total no.of files in outlier directory: 0
Number of missing files: 0


### 3) Extract text for years 2010-2019

In [9]:
# Pull the full text out of every 2010-2019 ProQuest export; other years go to outlier/.
content = {}
counter_2010_2019 = 0
counter_2010_2019_outlier = 0
files = glob.glob('RAW/2010-2019_raw/*.txt')

for file_name in files:
    # Reset per file so a document that lacks a header line can never inherit the
    # year / id parsed from the previous file (or hit NameError on the first file).
    year = None
    document_id = None
    data = ""
    flag = False  # True while the current line is inside the "full text:" section
    with open(file_name, 'r') as f:
        for line in f:
            line = line.lower()
            if line.startswith('proquest document id:'):
                document_id = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('publication year:'):
                year = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('full text:'):
                flag=True
                data=line.split('full text:')[1].replace("\n", "")
            # NOTE(review): prefix test - a body line starting with a field name
            # (e.g. "authorities ...") also ends the capture; confirm intent.
            elif line.strip().startswith(sections):
                flag=False
            elif flag:
                data+=line.replace("\n", "")

    if not year or not document_id:
        # Not counted below, so it shows up in "Number of missing files".
        print("Skipping {}: missing publication year or ProQuest document id".format(file_name))
        continue
    content[file_name.split("/")[-1][:-4] + "_" + year + "_" + document_id] = data.strip()

for key, value in content.items():
    # Split from the right so underscores inside the source file name are preserved.
    file_name, year, document_id = key.rsplit("_", 2)
    if 2010<= int(year) <=2019:
        prefix = '2010-2019/'
        counter_2010_2019+=1
    else:
        prefix = "outlier/"
        counter_2010_2019_outlier+=1

    with open("CLEANSED/"+prefix+file_name+"_"+document_id+'.txt', 'w') as file:
        file.write(value)
print("FINISHED")
print("Total no.of files in source directory: {}".format(len(files)))
print("Total no.of files in 2010-2019 directory: {}".format(counter_2010_2019))
print("Total no.of files in outlier directory: {}".format(counter_2010_2019_outlier))
print("Number of missing files: {}".format(len(files)-(counter_2010_2019+counter_2010_2019_outlier)))

FINISHED
Total no.of files in source directory: 3012
Total no.of files in 2010-2019 directory: 2852
Total no.of files in outlier directory: 160
Number of missing files: 0


### 4) Extract text for years 2020-2029

In [10]:
# Pull the full text out of every 2020-2029 ProQuest export; other years go to outlier/.
content = {}
counter_2020_2029 = 0
counter_2020_2029_outlier = 0
files = glob.glob('RAW/2020-2029_raw/*.txt')

for file_name in files:
    # Reset per file so a document that lacks a header line can never inherit the
    # year / id parsed from the previous file (or hit NameError on the first file).
    year = None
    document_id = None
    data = ""
    flag = False  # True while the current line is inside the "full text:" section
    with open(file_name, 'r') as f:
        for line in f:
            line = line.lower()
            if line.startswith('proquest document id:'):
                document_id = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('publication year:'):
                year = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('full text:'):
                flag=True
                data=line.split('full text:')[1].replace("\n", "")
            # NOTE(review): prefix test - a body line starting with a field name
            # (e.g. "authorities ...") also ends the capture; confirm intent.
            elif line.strip().startswith(sections):
                flag=False
            elif flag:
                data+=line.replace("\n", "")

    if not year or not document_id:
        # Not counted below, so it shows up in "Number of missing files".
        print("Skipping {}: missing publication year or ProQuest document id".format(file_name))
        continue
    content[file_name.split("/")[-1][:-4] + "_" + year + "_" + document_id] = data.strip()

for key, value in content.items():
    # Split from the right so underscores inside the source file name are preserved.
    file_name, year, document_id = key.rsplit("_", 2)
    if 2020<= int(year) <=2029:
        prefix = '2020-2029/'
        counter_2020_2029+=1
    else:
        prefix = "outlier/"
        counter_2020_2029_outlier+=1

    with open("CLEANSED/"+prefix+file_name+"_"+document_id+'.txt', 'w') as file:
        file.write(value)
print("FINISHED")
print("Total no.of files in source directory: {}".format(len(files)))
print("Total no.of files in 2020-2029 directory: {}".format(counter_2020_2029))
print("Total no.of files in outlier directory: {}".format(counter_2020_2029_outlier))
print("Number of missing files: {}".format(len(files)-(counter_2020_2029+counter_2020_2029_outlier)))

FINISHED
Total no.of files in source directory: 412
Total no.of files in 2020-2029 directory: 412
Total no.of files in outlier directory: 0
Number of missing files: 0


### 5) Upload files to 1950-1959

In [21]:
# Count the entries in CLEANSED/1950-1959.
# NOTE(review): no cell visible here writes to this folder - presumably it is filled
# by a manual upload; confirm and record where the files come from.
!ls "CLEANSED/1950-1959" | wc -l

1051


# Exploring the outliers

In [11]:
# Re-parse the 2010-2019 exports and list the publication years outside 2010-2019.
content = {}
files = glob.glob('RAW/2010-2019_raw/*.txt')

for file_name in files:
    # Reset per file so a document that lacks a header line can never inherit the
    # year / id parsed from the previous file (or hit NameError on the first file).
    year = None
    document_id = None
    data = ""
    flag = False  # True while the current line is inside the "full text:" section
    with open(file_name, 'r') as f:
        for line in f:
            line = line.lower()
            if line.startswith('proquest document id:'):
                document_id = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('publication year:'):
                year = "".join(line.replace("\n", "").split(":")[1].split())

            if line.startswith('full text:'):
                flag=True
                data=line.split('full text:')[1].replace("\n", "")
            elif line.strip().startswith(sections):
                flag=False
            elif flag:
                data+=line.replace("\n", "")

    if not year or not document_id:
        print("Skipping {}: missing publication year or ProQuest document id".format(file_name))
        continue
    content[file_name.split("/")[-1][:-4] + "_" + year + "_" + document_id] = data.strip()

years = []
for key, value in content.items():
    # Split from the right so underscores inside the source file name are preserved.
    file_name, year, document_id = key.rsplit("_", 2)
    if not 2010<= int(year) <=2019:
        years.append(year)
print("FINISHED")
print(set(years))

FINISHED
{'2009'}


> Moved all the outlier files to the 2000-2009 folder.

# Tokenizing Text Files for all years

In [4]:
# Different functions: bigrams,trigrams, convert string to counts, update metadata from csv, data update
def convert_tuple_bigrams(tuples_to_convert):
    """Join the first two items of each tuple into a space-separated bigram string.

    Returns a list with one string per input tuple, in input order.
    """
    return [f'{pair[0]} {pair[1]}' for pair in tuples_to_convert]

def convert_tuple_trigrams(tuples_to_convert):
    """Join the first three items of each tuple into a space-separated trigram string.

    Returns a list with one string per input tuple, in input order.
    """
    return [
        f'{triple[0]} {triple[1]} {triple[2]}'
        for triple in tuples_to_convert
    ]

def convert_strings_to_counts(string_grams):
    """Count how often each n-gram string occurs.

    Returns a plain dict mapping each distinct item of ``string_grams`` to its
    number of occurrences.
    """
    return dict(Counter(string_grams))

In [5]:
def update_metadata_from_csv():
    """Look up the extra metadata fields of one record in the metadata DataFrame.

    Reads the module-level ``df`` and ``identifier`` (both must be defined before
    the call) and fetches each field with ``df.loc[identifier, <column>]``.
    Unused fields can be commented out of ``field_names`` below.

    Previously every value was bound to a local variable and discarded, so the
    function had no effect; the values are now returned so a caller can merge
    them into its record, e.g. ``data.update(update_metadata_from_csv())``.

    Returns:
        dict: column name -> value for the row selected by ``identifier``.
    """
    field_names = (
        'Title',
        'Publicationtitle',
        'Publicationyear',
        'DocumentURL',
        'Fulltext',
        'Links',
        'Section',
        'Publicationsubject',
        'ISSN',
        'Abstract',
        'Publicationinfo',
        'Lastupdated',
        'Placeofpublication',
        'Location',
        'Author',
        'Publisher',
        'Identifierkeyword',
        'Sourcetype',
        'ProQuestdocumentID',
        'Countryofpublication',
        'Languageofpublication',
        'Publicationdate',
        'Subject',
        'Database',
        'Documenttype',
    )
    # These columns were wrapped in str() by the original code; keep that conversion.
    stringified = {'Publicationyear', 'Publicationsubject', 'ISSN', 'ProQuestdocumentID'}

    metadata = {}
    for field in field_names:
        value = df.loc[identifier, field]
        metadata[field] = str(value) if field in stringified else value
    return metadata

#data.update([   
#        ('Title', identifier),
#        ('Publicationtitle', identifier),
#        ('Publicationyear', Publicationyear),
#        ('DocumentURL', DocumentURL),
#        ('Fulltext', Fulltext),
#        ('Links', Links),
#        ('Section', Section),
#        ('Publicationsubject', Publicationsubject),
#        ('ISSN', ISSN),
#        ('Abstract', Abstract),
#        ('Publicationinfo', Publicationinfo),
#        ('Lastupdated', Lastupdated),
#        ('Placeofpublication', Placeofpublication),
#        ('Location', Location),
#        ('Author', Author),
#        ('Publisher', Publisher),
#        ('Identifierkeyword', Identifierkeyword),
#        ('Sourcetype', Sourcetype),
#        ('ProQuestdocumentID', ProQuestdocumentID),
#        ('Countryofpublication', Countryofpublication),
#        ('Languageofpublication', Languageofpublication),
#        ('Publicationdate', Publicationdate),
#        ('Subject', Subject),
#        ('Database', Database),
#        ('Documenttype', identifier),
#    ])

In [19]:
!rm -rf `find -type d -name .ipynb_checkpoints`
!rm -r TRANSFORMED/
!mkdir TRANSFORMED/
!mkdir TRANSFORMED/1950-1959
!mkdir TRANSFORMED/1960-1969
!mkdir TRANSFORMED/1970-1979
!mkdir TRANSFORMED/1980-1989
!mkdir TRANSFORMED/1990-1999
!mkdir TRANSFORMED/2000-2009
!mkdir TRANSFORMED/2010-2019
!mkdir TRANSFORMED/2020-2029

In [21]:
# One CLEANSED input folder per decade, oldest first.
cleansed_folders = [
    'CLEANSED/1950-1959',
    'CLEANSED/1960-1969',
    'CLEANSED/1970-1979',
    'CLEANSED/1980-1989',
    'CLEANSED/1990-1999',
    'CLEANSED/2000-2009',
    'CLEANSED/2010-2019',
    'CLEANSED/2020-2029',
]
# Hand-picked punctuation tokens (including multi-character quote tokens),
# followed by every single character of string.punctuation.
punctuations = [
    '""', '"', "''", "``", ",", ".", "'", ";", ":",
    "[", "]", "(", ")", "^", "{", "}", "=",
    "<", ">", "!", "/", "?", "+", "|", "-", "_", "%", "*",
] + list(string.punctuation)
# stopwords from nltk, de-duplicated and lower-cased
stop_words = [word.lower() for word in set(stopwords.words('english'))]
# Word list named `prepositions`; the two links below are the sources noted by the author.
# NOTE(review): the list also contains 'a', 'an', 'the' and 'is', which are not
# prepositions - presumably kept on purpose as extra filter words; confirm.
# https://gist.github.com/anubsinha/e65538585a5630a936a426667a807269
# https://github.com/bahamas10/prepositions/blob/master/prepositions.json
prepositions = ['a', 'abaft', 'aboard', 'about', 'above', 'absent', 'across', 'afore', 'after', 'against', 'along', 
                'alongside', 'amid', 'amidst', 'among', 'amongst', 'an', 'anenst', 'apropos', 'apud', 'around', 'as', 
                'aside', 'astride', 'at', 'athwart', 'atop', 'barring', 'before', 'behind', 'below', 'beneath', 'beside', 
                'besides', 'between', 'beyond', 'but', 'by', 'circa', 'concerning', 'despite', 'down', 'during', 'except', 
                'excluding', 'failing', 'following', 'for', 'forenenst', 'from', 'given', 'in', 'including', 'inside', 
                'into', 'is', 'lest', 'like', 'mid', 'midst', 'minus', 'modulo', 'near', 'next', 'notwithstanding', 'of', 
                'off', 'on', 'onto', 'opposite', 'out', 'outside', 'over', 'pace', 'past', 'per', 'plus', 'pro', 'qua', 
                'regarding', 'round', 'sans', 'save', 'since', 'than', 'the', 'through', 'throughout', 'till', 'times', 
                'to', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'unto', 'up', 'upon', 'versus', 'via', 
                'vice', 'with', 'within', 'without', 'worth']
# Lower-case English pronouns; the two links below are the sources noted by the author.
# https://gist.github.com/mohataher/837a1ed91aab7ab6c8321a2bae18dc3e
# https://github.com/witch-house/pronoun.is/blob/master/resources/pronouns.tab
pronouns = ['he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'i', 'it', 'its', 'itself', 'mine', 'my', 'myself', 
            'our', 'ours', 'ourselves', 'she', 'their', 'theirs', 'them', 'themself', 'themselves', 'they', 'us', 'we', 
            'you', 'your', 'yours', 'yourself', 'yourselves']
# https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa-no-swears-short.txt
common_words = ['huge', 'fly', 'usgs', 'lies', 'thee', 'smtp', 'tb', 'band', 'bulk', 'mood', 's', 'cork', 'mo', 'stay', 
                'ada', 'had', 'com', 'wear', 'dot', 'cz', 'eur', 'corp', 'fan', 'jp', 'sam', 'fuel', 'dc', 'fast', 
                'was', 'past', 'stem', 'big', 'bios', 'use', 'rf', 'fcc', 'gr', 'fg', 'body', 'sur', 'sell', 'knit', 
                'jobs', 'saw', 'sky', 'alt', 'pg', 'well', 'reid', 'ya', 'ed', 'hull', 'la', 'turn', 'line', 'fits', 
                'yn', 'van', 'pass', 'phil', 'lady', 'ours', 'five', 'bbs', 'dig', 'rosa', 'whom', 'cold', 'tips', 
                'dumb', 'via', 'km', 'rome', 'iraq', 'est', 'ink', 'hub', 'neon', 'ten', 'sm', 'sept', 'the', 'asn', 
                'pas', 'junk', 'weed', 'fire', 'web', 'bomb', 'inch', 'gang', 'buy', 'beer', 'cad', 'pr', 'mv', 'jj', 
                'lows', 'land', 'ryan', 'c', 'fit', 'disk', 'ssl', 'eve', 'ati', 'look', 'cave', 'alto', 'xml', 'cr', 
                'sec', 'coat', 'node', 'flip', 'oh', 'da', 'pro', 'sand', 'h', 'dk', 'laid', 'own', 'z', 'fix', 'prep', 
                'ser', 'wc', 'army', 'arms', 'ef', 'mud', 'st', 'kb', 'boot', 'dpi', 'find', 'l', 'lose', 'wb', 'utc', 
                'is', 'kits', 'ie', 'jill', 'zero', 'cat', 'ta', 'opt', 'faq', 'sen', 'uv', 'th', 'ons', 'rx', 'hist', 
                'eds', 'dept', 'yrs', 'cal', 'rm', 'arm', 'les', 'md', 'oz', 'dirt', 'us', 'bowl', 'mens', 'talk', 'rr', 
                'pays', 'bids', 'lace', 'mars', 'doc', 'kept', 'bass', 'dsc', 'jack', 'dts', 'unto', 'der', 'see', 'file', 
                'once', 'gary', 'sas', 'yea', 'wife', 'ian', 'dame', 'cho', 'toy', 'bmw', 'take', 'gg', 'down', 'ben', 
                'ur', 'poor', 'sims', 'room', 'aug', 'glad', 'ace', 'spin', 'muze', 'beta', 'fans', 'ntsc', 'vip', 'day', 
                'pgp', 'mw', 'pie', 'got', 'kyle', 'mf', 't', 'bt', 'buck', 'ca', 'born', 'age', 'rl', 'todd', 'ana', 
                'href', 'guam', 'jake', 'pal', 'year', 'host', 'font', 'aud', 'feed', 'vg', 'jm', 'nice', 'park', 'jpeg', 
                'li', 'harm', 'vcr', 'visa', 'isa', 'send', 'bd', 'glen', 'exec', 'pads', 'jose', 'jump', 'chip', 'wow', 
                'e', 'very', 'fl', 'tr', 'good', 'mary', 'zone', 'key', 'jr', 'pf', 'ease', 'temp', 'earl', 'q', 'cite', 
                'hold', 'voip', 'jan', 'test', 'polo', 'lg', 'memo', 'dome', 'oval', 'rn', 'be', 'cn', 'dm', 'with', 
                'nose', 'rat', 'eva', 'ave', 'pe', 'flu', 'hope', 'duck', 'aj', 'pet', 'doe', 'af', 'gets', 'tape', 
                'ski', 'pan', 'sofa', 'seem', 'sara', 'pest', 'ia', 'cams', 'oman', 'eos', 'hart', 'tion', 'deny', 
                'gage', 'bond', 'kate', 'adds', 'mb', 'yarn', 'thou', 'echo', 'trap', 'live', 'lord', 'nl', 'amy', 
                'mpg', 'sci', 'dive', 'hey', 'sent', 'rom', 'now', 'week', 'side', 'due', 'do', 'grew', 'per', 'hiv', 
                'del', 'mind', 'kong', 'mods', 'lone', 'il', 'inc', 'des', 'msg', 'gzip', 'aged', 'east', 'tall', 'rap', 
                'ni', 'jews', 'debt', 'biz', 'ste', 'snow', 'wi', 'viii', 'air', 'clay', 'lion', 'die', 'leaf', 'chat', 
                'hose', 'shoe', 'bit', 'dial', 'ray', 'll', 'blah', 'next', 'been', 'apps', 'only', 'utah', 'son', 'what', 
                'uri', 'cia', 'sign', 'reef', 'bag', 'oclc', 'zoom', 'rich', 'ist', 'does', 'logs', 'que', 'nm', 'mpeg', 
                'tech', 'holy', 've', 'duo', 'im', 'hint', 'its', 'liz', 'put', 'mono', 'mall', 'vpn', 'r', 'lat', 'dee', 
                'ld', 'vt', 'nick', 'gpl', 'tmp', 'hawk', 'tabs', 'bin', 'mark', 'firm', 'ghz', 'gst', 'bo', 'para', 
                'drew', 'tv', 'ford', 'api', 'neo', 'egg', 'pour', 'ut', 'epa', 'out', 'each', 'bull', 'cas', 'ss', 'mini', 
                'foot', 'exp', 'jpg', 'spas', 'joe', 'main', 'hans', 'gba', 'nb', 'lip', 'coin', 'tom', 'near', 'v', 're', 
                'tags', 'inn', 'cord', 'wild', 'aa', 'feof', 'alot', 'mid', 'gray', 'i', 'guys', 'abc', 'gsm', 'ne', 'off', 
                'llc', 'warm', 'glow', 'sc', 'over', 'fx', 'rip', 'yo', 'sure', 'docs', 'luke', 'halo', 'four', 'pink', 
                'amp', 'tale', 'head', 'atom', 'pct', 'bp', 'bias', 'such', 'nike', 'tube', 'many', 'seen', 'fail', 'gulf', 
                'abu', 'rp', 'ml', 'pat', 'gas', 'vhs', 'sw', 'hwy', 'html', 'xbox', 'full', 'bi', 'qc', 'grow', 'tea', 
                'cup', 'jim', 'wan', 'fm', 'cnet', 'rope', 'nuke', 'adsl', 'ibm', 'sys', 'aaa', 'jvc', 'stop', 'cups', 
                'sql', 'nz', 'nhs', 'deep', 'neil', 'le', 'evil', 'mls', 'libs', 'ds', 'sale', 'fax', 'obj', 'bent', 
                'arts', 'ky', 'wa', 'come', 'arch', 'eric', 'hd', 'vids', 'rt', 'drum', 'belt', 'oak', 'game', 'tin', 
                'hair', 'dump', 'math', 'duty', 'cuba', 'peru', 'crew', 'ppc', 'mad', 'fig', 'burn', 'den', 'chi', 
                'dem', 'oo', 'laws', 'bali', 'felt', 'busy', 'pi', 'self', 'swap', 'bay', 'une', 'grab', 'ages', 'dow', 
                'wind', 'semi', 'tba', 'au', 'dist', 'nato', 'box', 'nut', 'caps', 'crop', 'nfl', 'om', 'want', 'edit', 
                'az', 'iron', 'tee', 'las', 'dont', 'dude', 'id', 'sony', 'ali', 'wu', 'held', 'rpm', 'comp', 'ips', 
                'two', 'fist', 'lock', 'lift', 'nc', 'jul', 'beds', 'owns', 'home', 'my', 'peak', 'wet', 'rice', 'ruby', 
                'dash', 'en', 'john', 'loop', 'zoo', 'ice', 'bars', 'sega', 'pn', 'case', 'cod', 'ones', 'fred', 'sv', 
                'cos', 'no', 'ct', 'lf', 'san', 'tank', 'blue', 'ad', 'boom', 'lime', 'anti', 'hb', 'usda', 'dust', 'she', 
                'bug', 'lane', 'surf', 'joy', 'urls', 'sub', 'bet', 'ns', 'tu', 'gaps', 'why', 'map', 'marc', 'mix', 
                'keep', 'dvds', 'um', 'gmc', 'cent', 'wv', 'fy', 'wine', 'tied', 'eye', 'mit', 'leg', 'have', 'lack', 
                'gold', 'swim', 'qui', 'bite', 'hire', 'ups', 'pl', 'sink', 'zinc', 'rug', 'eat', 'seat', 'ha', 'ids', 
                'hole', 'aye', 'j', 'upon', 'or', 'bad', 'bare', 'dies', 'vi', 'bee', 'feet', 'cvs', 'pack', 'cash', 'ks', 
                'lean', 'span', 'maps', 'ind', 'gig', 'suse', 'call', 'wed', 'cove', 'poly', 'sip', 'hit', 'soup', 'np', 
                'dec', 'dog', 'fw', 'watt', 'sea', 'tc', 'soma', 'spot', 'mia', 'arc', 'bind', 'am', 'tony', 'bio', 'aus', 
                'ah', 'boys', 'pole', 'toll', 'date', 'cake', 'clip', 'kurt', 'ja', 'pins', 'icq', 'jets', 'tn', 'fear', 
                'col', 'ripe', 'doug', 'eh', 'es', 'chan', 'sa', 'rh', 'spy', 'tm', 'mat', 'dx', 'lake', 'flat', 'dt', 
                'some', 'scan', 'thai', 'hook', 'et', 'lo', 'draw', 'reed', 'bone', 'wage', 'rb', 'fate', 'plot', 'knew', 
                'rc', 'bat', 'cu', 'add', 'ez', 'mins', 'bc', 'dale', 'arg', 'fact', 'when', 'med', 'ny', 'joan', 'lou', 
                'path', 'hs', 'oc', 'atm', 'baby', 'pics', 'leon', 'sr', 'owen', 'sk', 'lcd', 'mic', 'dock', 'lawn', 'wp', 
                'g', 'push', 'ps', 'pci', 'tool', 'flux', 'rico', 'help', 'php', 'rim', 'ugly', 'gays', 'lite', 'sf', 
                'tgp', 'seo', 'foul', 'dos', 'lamp', 'enb', 'tel', 'boat', 'rage', 'nba', 'face', 'nano', 'step', 'tie', 
                'tone', 'yang', 'lung', 'fine', 'rss', 'view', 'blow', 'load', 'crm', 'wire', 'bg', 'vote', 'free', 
                'fuji', 'role', 'jury', 'true', 'bugs', 'disc', 'cnn', 'pac', 'lets', 'bra', 'hits', 'mali', 'show', 
                'neck', 'sail', 'pull', 'mill', 'pt', 'tri', 'gift', 'port', 'mint', 'midi', 'ways', 'same', 'erik', 
                'an', 'oops', 'nh', 'lazy', 'ball', 'acre', 'eg', 'bed', 'ai', 'divx', 'note', 'fc', 'sl', 'fiji', 'jon', 
                'drop', 'arab', 'twin', 'acne', 'type', 'img', 'tf', 'gs', 'spec', 'cw', 'wait', 'und', 'jc', 'nv', 'mice', 
                'loan', 'love', 'team', 'mart', 'dl', 'hp', 'won', 'trip', 'tour', 'bell', 'town', 'told', 'cart', 'carl', 
                'wax', 'lc', 'ware', 'biol', 'me', 'desk', 'pete', 'uw', 'pace', 'pork', 'dad', 'ka', 'sans', 'io', 'heel', 
                'gay', 'ev', 'eau', 'rock', 'base', 'hon', 'psi', 'spam', 'rate', 'zip', 'okay', 'fin', 'misc', 'fund', 
                'suit', 'him', 'miss', 'kim', 'jam', 'kent', 'dev', 'rush', 'mi', 'sat', 'ann', 'df', 'ag', 'mj', 'thu', 
                'fee', 'palm', 'pub', 'mate', 'cost', 'barn', 'star', 'cpu', 'solo', 'pit', 'pale', 'ic', 'pam', 'hack', 
                'cats', 'meta', 'road', 'nova', 'yet', 'byte', 'kai', 'calm', 'used', 'man', 'sao', 'boss', 'hunt', 'lp', 
                'a', 'ide', 'dj', 'pump', 'cool', 'milk', 'edge', 'slow', 'fog', 'nat', 'teen', 'wake', 'soon', 'bold', 
                'form', 'hugh', 'on', 'wall', 'inf', 'fo', 'sue', 'hugo', 'usd', 'acm', 'proc', 'all', 'dan', 'oven', 'ads', 
                'dh', 'bid', 'vice', 'mph', 'pa', 'by', 'mode', 'os', 'beam', 'min', 'made', 'mag', 'say', 'gt', 'race', 
                'wiki', 'exam', 'treo', 'grad', 'rep', 'ltd', 'lt', 'bolt', 'rj', 'fda', 'nt', 'op', 'nest', 'wool', 'text', 
                'gcc', 'shop', 'mae', 'navy', 'cute', 'drug', 'demo', 'incl', 'asin', 'acts', 'ing', 'max', 'mile', 'zen', 
                'sole', 'has', 'blog', 'zus', 'lab', 'fake', 'sim', 'u', 'till', 'cure', 'bk', 'aw', 'hop', 'par', 'pmc', 
                'uni', 'casa', 'ho', 'rg', 'cd', 'bs', 'cube', 'cast', 'poll', 'apr', 'una', 'rule', 'eu', 'ba', 'usps', 
                'n', 'penn', 'kind', 'and', 'pm', 'sir', 'cet', 'dvd', 'gbp', 'bang', 'fall', 'o', 'vs', 'fa', 'tap', 'gps', 
                'sort', 'bob', 'his', 'isbn', 'wise', 'chad', 'dr', 'he', 'ye', 'weak', 'uses', 'mlb', 'bald', 'oral', 'xp', 
                'cop', 'scsi', 'rid', 'rest', 'erp', 'ou', 'di', 'rna', 'page', 'ty', 'rpg', 'plc', 'work', 'pay', 'yeah', 
                'pick', 'mine', 'nsw', 'shot', 'reel', 'not', 'sb', 'lamb', 'cm', 'grey', 'cruz', 'wins', 'zu', 'low', 'yu', 
                'pvc', 'rod', 'doom', 'kid', 'gui', 'shaw', 'tail', 'bon', 'win', 'dead', 'wash', 'josh', 'gene', 'quit', 
                'save', 'gods', 'sie', 'oct', 'reno', 'pty', 'jar', 'dogs', 'ci', 'fwd', 'your', 'hash', 'but', 'keys', 
                'gis', 'tiny', 'hour', 'go', 'term', 'hip', 'we', 'pda', 'tx', 'rows', 'phd', 'fell', 'http', 'kiss', 'iv', 
                'auto', 'buf', 'rand', 'vary', 'tft', 'gif', 'fd', 'in', 'pst', 'soap', 'pin', 'var', 'you', 'bar', 'eval', 
                'hz', 'mail', 'ala', 'ira', 'site', 'folk', 'tim', 'og', 'gd', 'idle', 'sold', 'back', 'kg', 'sg', 'eden', 
                'eyes', 'mom', 'roy', 'rrp', 'pdt', 'ata', 'sum', 'ipaq', 'isle', 'from', 'top', 'mega', 'prot', 'foo', 
                'blvd', 'luck', 'nu', 'cfr', 'may', 'cape', 'worm', 'hl', 'poem', 'maui', 'null', 'at', 'zope', 'make', 
                'quad', 'copy', 'can', 'ga', 'bank', 'gore', 'acer', 'hall', 'nov', 'pd', 'rats', 'know', 'sq', 'need', 
                'pc', 'guy', 'qty', 'ken', 'cafe', 'tide', 'cb', 'mike', 'west', 'ciao', 'len', 'fame', 'aid', 'lisa', 
                'ncaa', 'mh', 'sync', 'dual', 'java', 'nor', 'bus', 'urw', 'wifi', 'p', 'new', 'cms', 'else', 'ill', 
                'thus', 'joke', 'ln', 'her', 'oils', 'mem', 'ccd', 'gc', 'war', 'tons', 'york', 'soc', 'quiz', 'nam', 
                'law', 'gmbh', 'sing', 'ea', 'res', 'm', 'fees', 'word', 'ak', 'tire', 'pmid', 'carb', 'gym', 'moon', 
                'beth', 'ant', 'hear', 'last', 'legs', 'gi', 'lynn', 'wt', 'er', 'ham', 'trim', 'ph', 'met', 'nr', 'info', 
                'lips', 'ping', 'lb', 'logo', 'thru', 'ends', 'ri', 'said', 'like', 'pool', 'jazz', 'dir', 'spa', 'keno', 
                'con', 'tend', 'gmt', 'read', 'item', 'ra', 'vat', 'rent', 'bm', 'vb', 'ws', 'vast', 'ix', 'wave', 'goto', 
                'six', 'adam', 'set', 'rob', 'lay', 'clan', 'cio', 'post', 'tent', 'slip', 'bool', 'user', 'asks', 'rw', 
                'wx', 'duke', 'ext', 'ro', 'xi', 'nn', 'vc', 'str', 'ob', 'dave', 'bags', 'walk', 'ear', 'odd', 'lol', 
                'shed', 'mar', 'cuts', 'pope', 'done', 'se', 'fool', 'lil', 'psp', 'pop', 'qt', 'lan', 'foto', 'mc', 'aka', 
                'sn', 'judy', 'fare', 'sage', 'laos', 'wr', 'ron', 'ol', 'chen', 'aqua', 'vic', 'tear', 'mean', 'de', 'gnu', 
                'pre', 'dsl', 'std', 'size', 'worn', 'iowa', 'wm', 'uk', 'list', 'eyed', 'andy', 'went', 'more', 'pros', 
                'mil', 'than', 'kde', 'sets', 'stat', 'meal', 'ti', 'pic', 'grid', 'lm', 'mass', 'aims', 'babe', 'por', 
                'none', 'troy', 'earn', 'rare', 'dui', 'thy', 'wy', 'july', 'aim', 'src', 'el', 'life', 'chem', 'tan', 
                'ton', 'audi', 'try', 'goal', 'gp', 'play', 'club', 'seed', 'sick', 'frog', 'em', 'yale', 'slim', 'qld', 
                'wide', 'peer', 'leu', 'dns', 'sake', 'ab', 'jet', 'it', 'cry', 'dice', 'ot', 'sol', 'upc', 'asp', 'shut', 
                'wood', 'icon', 'as', 'job', 'bush', 'undo', 'sons', 'bb', 'mhz', 'gave', 'd', 'mere', 'cdt', 'ride', 
                'app', 'red', 'cf', 'hats', 'b', 'mr', 'kay', 'here', 'pair', 'girl', 'norm', 'iran', 'pdf', 'aol', 'pk', 
                'thin', 'grip', 'bath', 'cars', 'cage', 'ict', 'did', 'ban', 'acc', 'dans', 'slot', 'sep', 'cg', 'kit', 
                'give', 'das', 'inns', 'f', 'toys', 'tue', 'seek', 'dark', 'ctrl', 'horn', 'gear', 'gl', 'vol', 'roof', 
                'up', 'org', 'mon', 'dawn', 'doll', 'row', 'usc', 'av', 'jeff', 'je', 'bike', 'wto', 'oem', 'pray', 'los', 
                'til', 'co', 'hq', 'gap', 'soul', 'nasa', 'ward', 'mas', 'walt', 'tvs', 'long', 'funk', 'half', 'pen', 
                'pens', 'hood', 'ment', 'that', 'plan', 'cbs', 'sox', 'pos', 'hh', 'pure', 'cole', 'salt', 'sms', 'asus', 
                'our', 'flow', 'odds', 'ever', 'hu', 'pdas', 'pix', 'pike', 'ash', 'cir', 'mx', 'trio', 'deck', 'lang', 
                'ii', 'book', 'hdtv', 'irs', 'beef', 'yes', 'root', 'tex', 'vid', 'male', 'xl', 'mask', 'data', 'benz', 
                'avi', 'wav', 'yard', 'lots', 'hr', 'eq', 'oil', 'hide', 'css', 'snap', 'bbc', 'rec', 'bras', 'camp', 
                'poet', 'mess', 'best', 'code', 'cds', 'card', 'so', 'are', 'pcs', 'yen', 'ago', 'jay', 'link', 'pj', 
                'king', 'rose', 'deer', 'bill', 'even', 'diy', 'www', 'pad', 'ins', 'nil', 'plug', 'nec', 'old', 'tune', 
                'labs', 'ht', 'anna', 'jean', 'door', 'wars', 'ooo', 'llp', 'cs', 'leo', 'raw', 'pod', 'ru', 'su', 'meat', 
                'pipe', 'soil', 'lbs', 'few', 'du', 'lid', 'onto', 'goat', 'mtv', 'isp', 'act', 'area', 'tt', 'gem', 'toe', 
                'plus', 'ko', 'lost', 'avg', 'if', 'most', 'wrap', 'usa', 'news', 'eggs', 'dry', 'ion', 'pb', 'sh', 'tp', 
                'chef', 'gun', 'ec', 'ap', 'hi', 'ate', 'guns', 'moss', 'boc', 'less', 'int', 'vp', 'rs', 'ship', 'real', 
                'comm', 'rear', 'dat', 'ohio', 'cgi', 'mesh', 'mold', 'ipod', 'heat', 'hero', 'espn', 'von', 'bl', 'rica', 
                'eng', 'fish', 'msn', 'gm', 'sad', 'tray', 'end', 'brad', 'uc', 'art', 'nbc', 'noon', 'ch', 'song', 'hurt', 
                'gb', 'open', 'taxi', 'oecd', 'were', 'dear', 'dom', 'dip', 'axis', 'car', 'cl', 'herb', 'gate', 'sp', 
                'will', 'hill', 'mu', 'rv', 'cv', 'net', 'acid', 'usb', 'pp', 'sri', 'tub', 'bend', 'fri', 'jun', 'gone', 
                'wit', 'kick', 'bear', 'zum', 'runs', 'phi', 'tier', 'ted', 'w', 'golf', 'uh', 'cab', 'lib', 'bw', 'sees', 
                'jail', 'guru', 'ebay', 'dis', 'msie', 'gain', 'meet', 'hung', 'tip', 'able', 'iii', 'pond', 'def', 'avon', 
                'lead', 'nw', 'era', 'urge', 'tile', 'hong', 'hot', 'sean', 'rail', 'ppm', 'mrna', 'prev', 'non', 'epic', 
                'flag', 'get', 'fed', 'took', 'ge', 'hat', 'pose', 'ruth', 'gen', 'karl', 'task', 'ross', 'gras', 'much', 
                'feb', 'ram', 'skin', 'nj', 'part', 'bean', 'seq', 'greg', 'cut', 'val', 'mba', 'let', 'loss', 'rugs', 
                'ring', 'tops', 'mm', 'cst', 'fork', 'rise', 'kw', 'lap', 'fp', 'fat', 'kill', 'drag', 'nuts', 'must', 
                'join', 'mt', 'how', 'nhl', 'paul', 'lie', 'nyc', 'ip', 'loud', 'pets', 'stan', 'jane', 'cant', 'wal', 
                'of', 'roll', 'lu', 'ww', 'bow', 'ui', 'both', 'te', 'also', 'mats', 'tax', 'mac', 'menu', 'hay', 'sic', 
                'euro', 'edt', 'puts', 'nd', 'cam', 'conf', 'sku', 'cult', 'issn', 'fe', 'feat', 'jade', 'fur', 'ft', 
                'pee', 'sig', 'hang', 'move', 'url', 'ar', 'them', 'unix', 'db', 'geo', 'risk', 'raid', 'lee', 'jd', 'ep', 
                'gale', 'too', 'cdna', 'stud', 'rio', 'ts', 'ties', 'irc', 'hand', 'ieee', 'mod', 'ls', 'pine', 'anne', 'mg', 
                'ul', 'easy', 'nav', 'dg', 'pts', 'alex', 'goes', 'cc', 'lens', 'k', 'cj', 'usr', 'gel', 'log', 'ok', 'ftp', 
                'dv', 'exit', 'peas', 'phys', 'mel', 'nine', 'rd', 'dose', 'foam', 'way', 'na', 'diet', 'away', 'rfc', 'fort', 
                'oe', 'late', 'po', 'says', 'dna', 'don', 'jeep', 'film', 'eco', 'dish', 'tell', 'yoga', 'run', 'wn', 'va', 
                'alan', 'cap', 'soa', 'for', 'wolf', 'ng', 'div', 'dp', 'ma', 'paso', 'just', 'cell', 'gdp', 'feel', 'ears', 
                'fi', 'paid', 'pubs', 'hk', 'si', 'silk', 'deaf', 'dd', 'wing', 'luis', 'bits', 'ent', 'td', 'mp', 'mime', 
                'emma', 'pig', 'rack', 'left', 'knee', 'cow', 'nail', 'ac', 'sit', 'rca', 'apt', 'hc', 'ee', 'gtk', 'dean', 
                'sap', 'fbi', 'ce', 'time', 'abs', 'ms', 'sd', 'food', 'kids', 'etc', 'dam', 'corn', 'sin', 'hard', 'led', 
                'died', 'rev', 'skip', 'cook', 'ver', 'rick', 'men', 'beat', 'buzz', 'univ', 'idea', 'kirk', 'faqs', 'dana', 
                'mn', 'za', 'mrs', 'joel', 'intl', 'safe', 'lit', 'mug', 'lot', 'then', 'br', 'buys', 'perl', 'aids', 'flex', 
                'keen', 'ir', 'fr', 'char', 'hate', 'mesa', 'high', 'ask', 'dod', 'tab', 'rays', 'iso', 'ran', 'cons', 
                'expo', 'prix', 'seal', 'core', 'sun', 'fun', 'amd', 'hrs', 'pill', 'trek', 'loc', 'al', 'yr', 'dell', 'who', 
                'fair', 'juan', 'mild', 'any', 'pot', 'tcp', 'dare', 'ceo', 'one', 'dim', 'city', 'tar', 'fill', 'pain', 
                'name', 'into', 'un', 'volt', 'ex', 'matt', 'to', 'vii', 'farm', 'fold', 'js', 'rank', 'rely', 'oaks', 
                'geek', 'seas', 'idol', 'days', 'fu', 'reg', 'levy', 'punk', 'ae', 'coal', 'ddr', 'fs', 'x', 'care', 'lucy', 
                'unit', 'june', 'ff', 'asia', 'came', 'bye', 'boy', 'y', 'myth', 'bird', 'moms', 'this', 'cp', 'wish', 
                'rain', 'wma', 'fox', 'mai', 'diff', 'deal', 'cope', 'they', 'void', 'ref', 'far', 'tree', 'soft', 'tag']
# Final stop list: concatenation of the four word lists built earlier.
stop = stop_words + pronouns + prepositions + common_words
# Set copy used for membership tests: the list holds thousands of entries and
# is probed once per token, so a hash lookup avoids a linear scan each time.
# `stop` itself is left as a list.
stop_lookup = set(stop)

for folder in cleansed_folders:
    files = glob.glob(folder+'/*.txt')
    for file_name in files:
        # Read and lowercase the document; the file is closed before processing.
        with open(file_name, 'r') as f:
            data = f.read().lower()

        # Drop every non-ASCII character.
        data = re.sub(r'[^\x00-\x7f]', r'', data)

        # Remove each configured punctuation string.
        for punctuation in punctuations:
            data = data.replace(punctuation, "")

        # Keep only tokens longer than two characters that are not in the stop
        # list; 'ai' is always kept. Joining the result of split() with single
        # spaces already yields text with no leading, trailing or repeated
        # whitespace, so no further space clean-up is required.
        data = ' '.join(
            token for token in data.split()
            if token == 'ai' or (len(token) > 2 and token not in stop_lookup)
        )

        # Mirror the decade sub-folder and file name under TRANSFORMED/.
        with open("TRANSFORMED/"+folder.split("/")[1]+"/"+file_name.split("/")[-1], 'w') as file:
            file.write(data)
print("*"*20+"FINISHED"+"*"*20)

********************FINISHED********************


In [22]:
transformed_folders = ['TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979', 'TRANSFORMED/1980-1989', 
           'TRANSFORMED/1990-1999', 'TRANSFORMED/2000-2009','TRANSFORMED/2010-2019', 'TRANSFORMED/2020-2029']

# Decades whose identifier is the part of the file name before the first "_".
# For every other decade it is the part after the last "_", minus the
# four-character extension.
prefix_id_folders = {'TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979'}

for corpus_root in transformed_folders:
    # Build a corpus from all the text files in the decade folder
    corpus = PlaintextCorpusReader(corpus_root, '.*txt')
    # File IDs in the corpus (text file names)
    text_list = corpus.fileids()
    # One JSON-lines output file per decade, next to the decade folder
    output_filename = corpus_root+'.jsonl'
    # The naming scheme is fixed per decade, so decide it once per corpus
    id_is_prefix = corpus_root in prefix_id_folders

    # Open the output once per decade in write mode. Re-running this cell then
    # rewrites the file instead of appending a duplicate copy of every record.
    with open(output_filename, 'w') as outfile:
        for text in text_list:
            # Create identifier from filename
            if id_is_prefix:
                identifier = text.split("_")[0]
            else:
                identifier = text.split("_")[-1][:-4]

            # Compute unigrams
            unigrams = corpus.words(text)
            unigramCount = convert_strings_to_counts(unigrams)

            # Compute bigrams
            tuple_bigrams = list(nltk.bigrams(unigrams))
            string_bigrams = convert_tuple_bigrams(tuple_bigrams)
            bigramCount = convert_strings_to_counts(string_bigrams)

            # Compute trigrams
            tuple_trigrams = list(nltk.trigrams(unigrams))
            string_trigrams = convert_tuple_trigrams(tuple_trigrams)
            trigramCount = convert_strings_to_counts(string_trigrams)

            # Read the full text of the document
            with open(corpus_root+'/'+text, 'r') as file:
                fullText = file.read()

            # Total number of tokens = sum of all unigram counts
            wordCount = sum(unigramCount.values())

            # Record for this document: id, outputFormat, wordCount, fullText
            # and the three n-gram count mappings.
            data = {
                'id': identifier,
                'outputFormat': ['unigram', 'bigram', 'trigram', 'fullText'],
                'wordCount': wordCount,
                'fullText': fullText,
                'unigramCount': unigramCount,
                'bigramCount': bigramCount,
                'trigramCount': trigramCount,
            }

            # Add additional metadata if there is a metadata.csv available
            #     df = pd.read_csv(corpus_root+'.csv')
            #     df.set_index('id', inplace=True)
            #     # Update Metadata
            #     update_metadata_from_csv()

            # Write the document as one line of the json-lines file
            json.dump(data, outfile)
            outfile.write('\n')

    print(f'{len(text_list)} items written to {output_filename}.')

1051 items written to TRANSFORMED/1950-1959.jsonl.
2900 items written to TRANSFORMED/1960-1969.jsonl.
1258 items written to TRANSFORMED/1970-1979.jsonl.
1771 items written to TRANSFORMED/1980-1989.jsonl.
1520 items written to TRANSFORMED/1990-1999.jsonl.
2010 items written to TRANSFORMED/2000-2009.jsonl.
2852 items written to TRANSFORMED/2010-2019.jsonl.
412 items written to TRANSFORMED/2020-2029.jsonl.


## Validation

In [23]:
cleansed_folders = [
    'CLEANSED/1950-1959',
    'CLEANSED/1960-1969',
    'CLEANSED/1970-1979',
    'CLEANSED/1980-1989',
    'CLEANSED/1990-1999',
    'CLEANSED/2000-2009',
    'CLEANSED/2010-2019',
    'CLEANSED/2020-2029',
]
transformed_folders = [
    'TRANSFORMED/1950-1959',
    'TRANSFORMED/1960-1969',
    'TRANSFORMED/1970-1979',
    'TRANSFORMED/1980-1989',
    'TRANSFORMED/1990-1999',
    'TRANSFORMED/2000-2009',
    'TRANSFORMED/2010-2019',
    'TRANSFORMED/2020-2029',
]

# Report the number of .txt files per decade folder: the CLEANSED tree first,
# then the TRANSFORMED tree, so the two sets of counts can be compared.
for folder in cleansed_folders + transformed_folders:
    files = glob.glob(folder+'/*.txt')
    print("Folder {} has {} files".format(folder, len(files)))

Folder CLEANSED/1950-1959 has 1051 files
Folder CLEANSED/1960-1969 has 2900 files
Folder CLEANSED/1970-1979 has 1258 files
Folder CLEANSED/1980-1989 has 1771 files
Folder CLEANSED/1990-1999 has 1520 files
Folder CLEANSED/2000-2009 has 2010 files
Folder CLEANSED/2010-2019 has 2852 files
Folder CLEANSED/2020-2029 has 412 files
Folder TRANSFORMED/1950-1959 has 1051 files
Folder TRANSFORMED/1960-1969 has 2900 files
Folder TRANSFORMED/1970-1979 has 1258 files
Folder TRANSFORMED/1980-1989 has 1771 files
Folder TRANSFORMED/1990-1999 has 1520 files
Folder TRANSFORMED/2000-2009 has 2010 files
Folder TRANSFORMED/2010-2019 has 2852 files
Folder TRANSFORMED/2020-2029 has 412 files


In [8]:
# Total number of transformed documents across all decades, counted from disk
# instead of re-typing the per-folder counts by hand.
sum(len(glob.glob(folder+'/*.txt')) for folder in transformed_folders)

13774