# Preparation

In [63]:
# needs console command: python -m spacy download en
import spacy
import nltk
import numpy as np
# Load the English pipeline once at module level so every call to parse()
# shares it. The 'ner', 'textcat' and 'tagger' components are disabled; the
# parser is not in the disable list because parse() reads each token's
# dependency label.
nlp = spacy.load('en', disable=['ner', 'textcat', 'tagger'])

In [64]:
def parse(textlist):
    """Return the dependency labels of every text in *textlist*.

    textlist is a LIST of strings, one string per passage/text. The result
    is a list with one entry per input text: a tuple holding the `dep_`
    label of each token, or None when the document reports that it was not
    parsed.
    """
    def dep_labels(document):
        # Unparsed documents carry no dependency labels to collect.
        if not document.is_parsed:
            return None
        return tuple(token.dep_ for token in document)

    # NOTE(review): `n_threads` looks deprecated in later spaCy releases;
    # confirm against the installed version before upgrading.
    parsed_docs = nlp.pipe(textlist, batch_size=50, n_threads=3)
    return [dep_labels(document) for document in parsed_docs]

In [65]:
def counts(textlist):
    """Build one dependency-label feature vector per text.

    Each text is parsed with parse(); for every feature group below, the
    occurrences of its dependency labels are summed and divided by the
    number of 'ROOT' labels in that text.

    Args:
        textlist: list of strings, one string per text.

    Returns:
        A list of float numpy arrays (one per text, ten values each, in the
        order of the feature groups). A text whose parse is unavailable
        (parse() returned None) or that contains no 'ROOT' label gets a
        vector of NaN, because a per-ROOT rate is undefined for it.
    """
    # One tuple of dependency labels per feature; order defines the vector.
    feature_groups = (
        ('acl', 'advcl', 'relcl'),             # 01: Clauses / Subordination
        ('ccomp', 'xcomp'),                    # 02: Complements
        ('cc',),                               # 03: Coordination
        ('appos',),                            # 04: Apposition
        ('nsubjpass', 'csubjpass'),            # 05: Passive Verbs
        ('parataxis',),                        # 06: Parataxis
        ('aux', 'auxpass'),                    # 07: Auxiliary Verbs
        ('neg',),                              # 08: Negation
        ('prep',),                             # 09: Prepositional Phrases
        ('advmod', 'amod', 'nummod', 'nmod'),  # 10: Modifiers
    )

    features = []
    for tags in parse(textlist):
        # parse() yields None for unparsed documents; treat that like a text
        # without any ROOT instead of failing on `None.count`.
        n_roots = tags.count('ROOT') if tags is not None else 0
        if n_roots == 0:
            # Avoid dividing by zero: mark the whole vector as undefined.
            features.append(np.full(len(feature_groups), np.nan))
            continue

        group_totals = [
            sum(tags.count(label) for label in group)
            for group in feature_groups
        ]
        features.append(np.array(group_totals) / n_roots)

    return features

# Import LSAT texts to test

# Read the whole file inside a context manager so the handle is closed again.
with open('data/LSATtexts.txt', 'r') as lsat_file:
    # Texts are separated by a blank line.
    lsat = lsat_file.read().split('\n\n')

# Drop chunks that start with '#' and empty chunks (e.g. from a trailing blank
# line). A new list is built rather than removing items while looping over the
# same list, which skips the element following each removal.
lsat = [text for text in lsat if text and not text.startswith('#')]

vec = counts(lsat)

# Sample passages (the source URL is given in the comment after the literal).
# The string is later passed to counts() as ONE text.
# NOTE(review): the 'â€”' sequence in the autumn paragraph looks like a
# mis-decoded dash; confirm against the source before editing, because this
# literal is parsed at runtime and any change alters the resulting features.
ex = '''There are many big and small libraries everywhere in our country. They have millions of books in different languages. You can find there the oldest and the newest books. 
Every school has a library. Pupils come to the library to take books on different subjects. 
The school library where Oleg studies is good. It is a large clean room. There are four big windows in it. The walls are light blue. There are a lot of shelves full of books. You can find books on literature, physics, history, chemistry, geography, biology and other subjects. There are books in English, too. 
On the walls you can see pictures of some great writers and poets. 
On the table near the window you can always see beautiful spring and autumn flowers. 
Oleg likes to go to the library. He can always find there something new, something he needs. 

Summer is over and it is autumn again, beautiful as ever. Even if you are no artist at all you can see its beauty. It is a season when the trees are simply fantastic â€” yellow, red, green and brown, not just one brown, but browns of all possible shades: light brown, dark brown, yellowish brown and all of a richness that only an artist can see and describe. 
Victor is back in Vorontsovo. He has just come but his thoughts are still in Kiev where the autumn is so beautiful. 
This is not his first visit there. He has already been to Kiev and he has learnt its streets, roads, parks, theatres, cinemas and old and new beautiful buildings. He easily recognizes the streets, buildings, buses, parks and the noise. Noise is everywhere. 
Now he is with his classmates and the usual talk begins. 
"Hallo, Victor!" 
"Hallo, Pete." 
"I am very glad to see you again. How is everything?" 
"Thank you, fine." 
"Now tell me, where have you been all the time? I haven't seen you for ages and you haven't written a word. Did you go anywhere?" 
"Certainly, I did. I have just come back from Kiev." 
"How did you like it? Is it a good place to go to?" 
"Splendid! You must go there some day, too." 
"I certainly shall. And I shall write letters to you as I know you like to get letters." 

This is our classroom. It is light, clean and large. The room is nice. Its ceiling and walls are white, its floor is brown. There is one door and three windows in it. When it is warm, they are open. When it is cold, they are shut. The door is always shut when we have our lessons. 
There is a blackboard on the wall. We write on it. On the blackboard there are some words. They are English words. We read them: "We want to know English." 
We sit on chairs in front of desks. The desks are nice and green. 
The teacher's desk in near the blackboard. There are not many pupils in our class. There are only seventeen in it. Today fifteen pupils are present, two are absent. 
We learn many subjects at school. They are: Russian, English, history, literature, mathematics, physics, chemistry, biology, geography and physical training (or PT). 

The big clock on the tower of the Palace of Westminster in London is often called Big Ben. But Big Ben is really the bell of the clock. It is the biggest clock bell in Britain. It weighs 13.5 tons. 
The clock tower is 318 feet high. You have to go up 374 steps to reach the top. So the clock looks small from the pavement below the tower. 
But its face is 23 feet wide. It would only just fit into some classrooms. 
The minute-hand is 14 feet long.'''

# https://lingualeo.com/pt/jungle/51-easy-reading-texts-for-beginners-elementary-level-418387#/page/1

# Flatten the sample into one line: every run of whitespace (line breaks
# included) becomes a single space. Deleting '\n' outright would fuse the
# words on either side of a line break that is not preceded by a space, and a
# fixed number of double-space replacements cannot collapse longer runs.
ex = ' '.join(ex.split())
vec_simple = counts([ex])

# Bare expressions: shown as cell output when run in a notebook.
np.mean(vec, axis=0)

vec_simple