# NLP Algorithms

## SpaCy
SpaCy is fast and agile. It’s designed to bring cutting-edge NLP into everyday use by making it practical and accessible. It works with other well-known libraries such as Gensim and scikit-learn. Written in Python and Cython, it’s optimized for performance and gives developers a more natural path to advanced NLP tasks such as named entity recognition.

# Preparing Bibliography
The bibliography file is parsed first; this is necessary to locate the files attached to the entries in the Zotero library.

In [None]:
from pybtex import database

In [None]:
class Library:
    """Bibliography file parsed with pybtex, exposing its entries as a list.

    Attributes:
        path: Location of the bibliography file that was parsed.
        library: The object returned by ``database.parse_file``.
        entries: The parsed entry objects, in iteration order.
    """

    def __init__(self, path, format='bibtex'):
        """Parse the bibliography at *path* and collect its entries.

        Args:
            path: Path of the bibliography file.
            format: Passed to ``database.parse_file`` as ``bib_format``.
        """
        self.path = path
        self.library = database.parse_file(path, bib_format=format)

        # Iterating the parsed entries yields their keys; look each key up
        # to obtain the entry object itself.
        parsed_entries = self.library.entries
        self.entries = [parsed_entries[key] for key in parsed_entries]

In [None]:
# Parse the thesis bibliography (default 'bibtex' format) into a Library.
library = Library('/Users/paul/Desktop/FOM_MSc_Thesis.bib')

In [None]:
# Peek at the last parsed entry (a bare expression: only shows a value when
# evaluated interactively).
library.entries[-1]

In [None]:
class Document:
    """A bibliography entry together with the location of its attached file.

    Attributes:
        entry: The bibliography entry wrapped by this document.
        title: Value of the entry's ``title`` field.
        fields: Key view over the entry's field names.
        file: Attachment path relative to the Zotero storage folder, or
            ``None`` when the entry has no ``file`` field or the field does
            not point into the storage folder.
    """

    # Text surrounding the relative attachment path inside a ``file`` field.
    _STORAGE_PREFIX = '/Users/paul/Zotero/storage/'
    _MIME_MARKER = ':application/'

    def __init__(self, entry):
        """Read title and attachment path from *entry*.

        Args:
            entry: Object exposing a ``fields`` mapping that contains at
                least a ``title`` key.

        Raises:
            KeyError: If the entry has no ``title`` field.
        """
        self.entry = entry
        self.title = self.entry.fields['title']
        self.fields = self.entry.fields.keys()
        self.file = None

        if 'file' in self.fields:
            # Keep what follows the storage prefix, up to the MIME marker.
            # A field that does not contain the prefix leaves ``file`` as
            # None instead of raising IndexError and aborting the load.
            pieces = self.entry.fields['file'].split(self._STORAGE_PREFIX)
            if len(pieces) > 1:
                self.file = pieces[1].split(self._MIME_MARKER)[0]

    def get_path(self):
        """Return the entry's ``path`` field, or ``None`` if it has none."""
        # ``self.fields`` is only a view of the keys, so the value has to be
        # read from the underlying field mapping.
        if 'path' in self.fields:
            return self.entry.fields['path']
        return None

In [None]:
# Wrap every bibliography entry in a Document.
documents = [Document(entry) for entry in library.entries]

In [None]:
# Print the title and file path of every document from the BibTeX file,
# then the number of documents that have a file path.
for document in documents:
    print(document.title, document.file)

counter = sum(1 for doc in documents if doc.file is not None)
print(counter)


# Extracting Text from PDFs

In [None]:
from pdfminer.high_level import extract_text
import re


In [None]:
# Folder that the relative attachment paths are resolved against.
base_path = '/Users/paul/Zotero/storage/'
# Relative paths of all documents that have an attached file.
file_paths = [path for path in (doc.file for doc in documents) if path is not None]
file_paths[:5]

In [None]:
# Extract the text of the first attached file only.
text = extract_text(base_path+file_paths[0])

In [None]:
# Split where whitespace follows a '.' or '?', unless the text before it looks
# like "x.y." or a capital+lowercase abbreviation such as "Dr."; also split on
# runs of two or more newlines. The newline alternative is a capturing group,
# so the result contains extra None / '\n' items that must be discarded.
raw_chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s|(\n){2,}',text)
ignorable = (None, '\n', '', ' ', '  ')

sentences = []
for chunk in raw_chunks:
    if chunk in ignorable:
        continue
    flattened = chunk.replace('\n',' ')
    # Drop chunks that contain no ASCII letter at all.
    if re.match(r'^[^a-zA-Z]*$', flattened):
        continue
    sentences.append(flattened)

for sentence in sentences:
    print(repr(sentence))


# Preprocessing

In [None]:
import spacy
# Load the spaCy pipeline once; Sentence.tokenize() uses this global `nlp`.
nlp = spacy.load("en_core_web_trf")

In [None]:
class Sentence:
    """One extracted sentence plus part-of-speech based validity flags.

    Attributes:
        file: Identifier of the file the sentence was taken from.
        sentence: The cleaned sentence text.
        tokens: List of ``(text, pos)`` tuples, or ``None`` before tokenizing.
        inventory: Dict mapping each POS tag to its count, or ``None``.
        contains_noun: Whether a ``NOUN`` token is present (``None`` until
            ``check_validity`` has run).
        contains_verb: Whether a ``VERB`` token is present (``None`` until
            ``check_validity`` has run).
        contains_cid: Whether the text contains a ``(cid:N)`` marker
            (``None`` until ``check_validity`` has run).
        valid: ``True`` when the sentence has both a noun and a verb.
    """

    # Digits glued to the end of a word. The lookbehind keeps the preceding
    # letter, so only the digits themselves are removed.
    _FOOTNOTE_DIGITS = r'(?<=[A-Za-z])\d+\b'
    # "(cid:N)" marker -- presumably an artifact of PDF text extraction.
    _CID_MARKER = r'\(cid:\d{1,4}\)'

    def __init__(self, sentence, file):
        """Store the sentence and clean up its text.

        Args:
            sentence: Raw sentence text.
            file: Identifier of the file the sentence comes from.
        """
        self.file = file
        self.sentence = sentence
        self.tokens = None
        self.inventory = None
        self.contains_noun = None
        self.contains_verb = None
        self.contains_cid = None
        self.valid = None

        # Crop the sentence so it starts at its first alphabetic character;
        # text without any alphabetic character is left unchanged.
        for i, char in enumerate(self.sentence):
            if char.isalpha():
                self.sentence = self.sentence[i:]
                break

        # Remove trailing digits on words; those digits are usually footnotes.
        self.sentence = re.sub(self._FOOTNOTE_DIGITS, '', self.sentence)

    def tokenize(self):
        """Run the global ``nlp`` pipeline and store ``(text, pos)`` tuples."""
        self.tokens = [(word.text, word.pos_) for word in nlp(self.sentence)]

    def count_tokens(self):
        """Count tokens per POS tag into ``self.inventory``.

        Tokenizes first if that has not happened yet.
        """
        if self.tokens is None:
            self.tokenize()

        inventory = {}
        for _, pos in self.tokens:
            inventory[pos] = inventory.get(pos, 0) + 1

        self.inventory = inventory

    def check_validity(self):
        """Set the ``contains_*`` flags and ``valid``.

        Counts tokens first if that has not happened yet.
        """
        if self.inventory is None:
            self.count_tokens()

        self.contains_noun = 'NOUN' in self.inventory
        self.contains_verb = 'VERB' in self.inventory

        # The marker can appear anywhere in the sentence, and the sentence
        # always starts at a letter after cropping, so search rather than
        # match at the start.
        self.contains_cid = re.search(self._CID_MARKER, self.sentence) is not None

        # Validity depends on noun and verb only; contains_cid is informational.
        self.valid = self.contains_noun and self.contains_verb

    def summarize(self, show_token_details=False):
        """Print origin file, sentence text and POS inventory.

        Args:
            show_token_details: Also print the ``(text, pos)`` token list.
        """
        print(f'The origin file is: {self.file}')
        print(f'The sentence is:\n{self.sentence}')
        print(f'The inventory holds:\n{self.inventory}')
        if show_token_details:
            print(f'The token details are:\n{self.tokens}')
         
        

In [None]:
# `sentences` was extracted from file_paths[0], so record that as the origin
# file of every sentence (the previously used name `file` was never defined).
source_file = file_paths[0]

valid_sentences = []
for sen in sentences:
    candidate = Sentence(sen, source_file)
    candidate.check_validity()
    # Keep only sentences that contain both a noun and a verb.
    if candidate.valid:
        valid_sentences.append(candidate)

In [None]:
# Show the cleaned text of every sentence that passed validation.
for valid_sentence in valid_sentences:
    print(valid_sentence.sentence)

In [None]:
# Inspect one validated sentence including its token details. The index is
# hard-coded: this raises IndexError if fewer than 158 sentences are valid.
valid_sentences[157].summarize(True)


## Tokenization

## Lemmatization

## Stemming

# Bag of Words

# Topic Modeling with LDA

# Topic Clustering