## feature extraction 

In [2]:
import spacy
import numpy as np
import syllapy
from re import search
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

In [3]:
with open('../data/Gutenberg/txt/Oscar Wilde___The Picture of Dorian Gray.txt', 'r') as file:
    data = file.read().replace('\n', ' ')

In [8]:
test = "hello i like the the the the large boulder jumps."

In [170]:
%time
nlp = spacy.load("en_core_web_sm")
doc = nlp(test)

cleaned_book = []
number_sentences = len(list(doc.sents))
for sent in doc.sents:
    out = [word.lemma_ for word in sent if word.pos_ in ('VERB', 'NOUN', "VERB", 'ADJ', 'ADV')]
    
for chunk in doc.noun_chunks:
    print(chunk)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.72 µs
i
the large boulder


In [171]:
out

['like', 'large', 'boulder', 'jump']

[like, large, boulder, jump]

In [165]:

# %time a, b = new(data)

In [128]:
 b

6387

In [127]:
%time c, d = stop_content_lemma_sents(data)

CPU times: user 29.5 s, sys: 2.48 s, total: 32 s
Wall time: 32.3 s


In [5]:
def remove_stopwords(raw_book):
    stopwords_dict = Counter(STOP_WORDS)
    return ' '.join([word for word in raw_book.split() if word not in stopwords_dict])

In [10]:
def split_into_senteces(raw_book):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(raw_book)
    number_sentences = len(list(doc.sents))
    return doc, number_sentences

In [166]:
def lematization(doc):
    cleaned_book = []
    for sent in doc.sents:
        out = [word.lemma_ for word in sent if word.pos_ in ('VERB', 'NOUN', "VERB", 'ADJ', 'ADV')]     
        if len(out) >= 1:
            cleaned_book.append(out)
        else:
            pass
    return cleaned_book, number_sentences

In [97]:
def stop_content_lemma_sents(raw_book):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(raw_book)

    cleaned_book = []
    number_sentences = len(list(doc.sents))
    for sent in doc.sents:
        cleaned_sentence = []
        
        sent = [word.lemma_ for word in sent if word.pos_ in ('VERB', 'NOUN', "VERB", 'ADJ', 'ADV')]
        for token in sent:
            if token.is_stop:
                pass
            else:
                if token.pos_ == 'VERB' or token.pos_ == 'NOUN' or token.pos_ == 'VERB' or token.pos_ == 'ADJ' or token.pos_ == 'ADV':
                    cleaned_sentence.append(token.lemma_)
                    
        if len(cleaned_sentence) >= 1:
            cleaned_book.append(cleaned_sentence)
        else:
            pass
    return cleaned_book, number_sentences

In [11]:
cleaned_data, num_sentences = stop_content_lemma_sents("the quick brown fox fox fox fox fox jumps over the lazy dog. lots of small chickens were playing in the field.")

In [13]:
def average_word_length(book):
    return np.average([len(word) for word in book])

In [14]:
average_word_len = np.average([average_word_length(sent) for sent in cleaned_data])

In [15]:
def log_ttr(book):
    '''
    log type token ration accounts for differences in sequence lengths
    a value of 1 means all words are unique 
    a value of 0 means there are no unique words 
    
    expects: an list of sentences
    returns: log(ttr) --> log(token)/log(type)
    '''
    words = {}
    total = 0
    for sent in book:
        for word in sent:
            if word in words:
                words[word] += 1
                total +=1
            else:
                words[word] = 1
                total +=1
    return np.log(len(words))/np.log(total)   

In [30]:
def syllables(cleaned_data):
    '''
    Counts the sylablys per word 
    expects: a list of sentences
    
    returns: a list of sentences where each word has been replaced by its sylabble count 
    '''
    book_syll = []
    for sent in cleaned_data:
        sent_syll = []
        for word in sent:
            sent_syll.append(syllapy.count(word))
        book_syll.append(sent_syll)
    return book_syll

In [31]:
syllables(cleaned_data)

[[1, 1, 1, 1, 1, 1, 1, 1, 2, 1], [1, 1, 2, 1, 1]]

In [108]:
log_ttr(cleaned_data)

0.8854692840710255

In [78]:
phenome_scores = {"a": 10, "e": 9, "o": 9,
                  "i": 8, "u": 8,"j": 8,"w": 8,
                  "r": 7,
                  "l": 6,
                  "m": 5,"n": 5,"ng": 5,
                  "z": 4,"v": 4,
                  "f": 3,"th": 3,"s": 3,
                  "b": 2,"d": 2,"g": 2,
                    "p": 1,"t": 1,"q": 1,"x": 1,"z": 1,"y": 1,"c":1,"k":1,"h":1}

In [80]:
# sonority
def phenomes(cleaned_data):
    '''
    Assigns an score to each word in a sentence based on english phenome heirarchy.
    Expects: list of sentences 
    returns: list of sentences where each word has been replaced by its phenome score 
    '''
    book_score = []
    for sent in cleaned_data:
        sent_score = []
        for word in sent:
            word_score = 0
            skip = []
            
            if search('th', word):
                word_score += phenome_scores['th']
                skip.append(search('th', word).span()[0])
                skip.append(search('th', word).span()[1]-1)
                
            if search('ng', word):
                word_score += phenome_scores['ng']
                skip.append(search('ng', word).span()[0])
                skip.append(search('ng', word).span()[1]-1)
            
            for i,l in enumerate(word):
                if i in skip:
                    pass
                if l in phenome_scores:
                    word_score += phenome_scores[l]
                else:
                    pass
                    
            sent_score.append(word_score)
            
        book_score.append(sent_score)
    return book_score

In [81]:
phenomes(cleaned_data)

[[19, 31, 13, 13, 13, 13, 13, 22, 18, 13], [16, 30, 26, 18, 28]]

In [130]:
# cleaned_book, sent_counts = stop_content_lemma_sents(data)
average_word_len = np.average([average_word_length(sent) for sent in a])
average_syllables = np.average([np.average(sent) for sent in syllables(a)])
average_phenomes = np.average([np.average(sent) for sent in phenomes(a)])
ttr = log_ttr(a)

In [131]:
sent_counts, average_word_len, average_syllables, average_phenomes, ttr

(6387,
 5.505289932296925,
 1.6776957154641305,
 30.81638851033225,
 0.8594171074659054)

In [132]:
# cleaned_book, sent_counts = stop_content_lemma_sents(data)
average_word_len = np.average([average_word_length(sent) for sent in c])
average_syllables = np.average([np.average(sent) for sent in syllables(c)])
average_phenomes = np.average([np.average(sent) for sent in phenomes(c)])
ttr = log_ttr(c)

In [133]:
sent_counts, average_word_len, average_syllables, average_phenomes, ttr

(6387,
 5.945797565856851,
 1.7866634875609775,
 33.062045340593585,
 0.8742969948730204)

In [99]:
cleaned_book, sent_counts = stop_content_lemma_sents(data)

In [102]:
average_word_len = np.average([average_word_length(sent) for sent in cleaned_book])

In [103]:
average_word_len

5.945797565856851