***Comspacity***

**General**

Imports

In [7]:
import spacy
import json

Set Language and Text

In [8]:
# Notebook inputs: the language key used to pick the spaCy model and weights,
# and the text whose complexity is scored.
language = "english"
text = (
    "Hello. This is a sample text. "
    "You can write in here whatever you want."
)

Load language model and set weights

In [9]:
# Per-language settings: the spaCy pipeline to load and the weights that the
# complexity formulas further down multiply the text statistics with.
LANGUAGE_SETTINGS = {
    "english": {
        "model": "en_core_web_sm",
        "sentence_length_weight": 2,
        "average_verbes_weight": 7,
        "word_length_weight": 12,
        "average_hard_words_weight": 10,
    },
    "german": {
        "model": "de_core_news_sm",
        "sentence_length_weight": 2,
        "average_verbes_weight": 7,
        "word_length_weight": 11,
        "average_hard_words_weight": 0,
    },
}

if language not in LANGUAGE_SETTINGS:
    # Fail here with a clear message; otherwise `nlp` and the weights would be
    # undefined and the notebook would stop later with a NameError.
    raise ValueError(
        f"Unsupported language {language!r}; expected one of {sorted(LANGUAGE_SETTINGS)}."
    )

settings = LANGUAGE_SETTINGS[language]
nlp = spacy.load(settings["model"])

# Keep the weights as plain module-level names: the scoring cells read them directly.
sentence_length_weight = settings["sentence_length_weight"]
average_verbes_weight = settings["average_verbes_weight"]
word_length_weight = settings["word_length_weight"]
average_hard_words_weight = settings["average_hard_words_weight"]

# Run the pipeline once; every later cell works on this parsed document.
doc = nlp(text)
print(f"Loaded spacy model for {language}.")
print(doc)

Loaded spacy model for english.
Hello. This is a sample text. You can write in here whatever you want.


**Complexity of text**

Set Variables and open sample.json for word frequency

In [10]:
# Running totals for the whole text; they are incremented by the counting loop.
words = 0          # tokens counted as words
words_length = 0   # summed character length of those words
verbes = 0         # tokens tagged as VERB
hard_words = 0     # words not found in the word-frequency data
sentences = 0      # number of sentences

# Word-frequency data: a word whose lower-cased form is not contained in it is
# counted as a hard word. The encoding is given explicitly so that non-ASCII
# entries are decoded the same way on every platform.
with open('./backend/sample.json', 'r', encoding='utf-8') as openfile:
    json_object = json.load(openfile)

Counting variables over whole text

In [11]:
# Accumulate the word, verb, hard-word and sentence totals over the whole text.
for line in doc.sents:
    # Every span produced by spaCy's sentence segmentation is one sentence.
    # Counting closing punctuation marks instead would miss a sentence without
    # a final mark and would count "!!!" as three sentences.
    sentences += 1

    for token in line:
        # Skip punctuation, whitespace tokens (spaces, tabs, newlines) and
        # anything that looks like a URL; none of them count as words.
        if token.pos_ == "PUNCT" or token.is_space or "http" in token.lemma_:
            continue

        words += 1
        words_length += len(token)

        if token.pos_ == "VERB":
            verbes += 1

        # A word missing from the word-frequency data counts as a hard word.
        if token.lower_ not in json_object:
            hard_words += 1

print(f"Number of words: {words}")
print(f"Words length of all words: {words_length}")
print(f"Number of verbes: {verbes}")
print(f"Number of hard_words: {hard_words}")
print(f"Number of sentences: {sentences}")

Number of words: 14
Words length of all words: 54
Number of verbes: 2
Number of hard_words: 0
Number of sentences: 3


Calculate the averages (per word and per sentence) that feed into the text-complexity score.

In [12]:
# Per-word averages. A text without any words yields 0 instead of dividing by
# zero; any other error (for example a counter that was never defined) is no
# longer silently turned into 0.
if words:
    average_word_length = words_length / words
    average_hard_words = hard_words / words
else:
    average_word_length = 0
    average_hard_words = 0

# Per-sentence averages, with the same guard for a text without sentences.
if sentences:
    verbes_per_sentence = verbes / sentences
    words_per_sentence = words / sentences
else:
    verbes_per_sentence = 0
    words_per_sentence = 0

print(f"average word length: {average_word_length}")
print(f"average hard words: {average_hard_words}")
print(f"average verbes per sentence: {verbes_per_sentence}")
print(f"average words per sentence: {words_per_sentence}")

average word length: 3.857142857142857
average hard words: 0.0
average verbes per sentence: 0.6666666666666666
average words per sentence: 4.666666666666667


Get text-complexity.

In [13]:
# The score is a weighted sum of four statistics; each statistic is multiplied
# by the weight chosen for the current language.
sentence_length_part = words_per_sentence * sentence_length_weight
verb_part = verbes_per_sentence * average_verbes_weight
word_length_part = average_word_length * word_length_weight
hard_word_part = average_hard_words * average_hard_words_weight

complexity_of_text = sentence_length_part + verb_part + word_length_part + hard_word_part
print(f"This text has a complexity score of {complexity_of_text}")

This text has a complexity score of 60.285714285714285


**Complexity per sentence**

Set Variables and open sample.json for word frequency

In [14]:
# Counters for the sentence currently being processed; the counting loop
# resets them after each sentence.
words = 0          # tokens counted as words
words_length = 0   # summed character length of those words
verbes = 0         # tokens tagged as VERB
hard_words = 0     # words not found in the word-frequency data

# One entry per sentence is appended here by the counting loop.
sentences_list = []

# Word-frequency data: a word whose lower-cased form is not contained in it is
# counted as a hard word. The encoding is given explicitly so that non-ASCII
# entries are decoded the same way on every platform.
with open('./backend/sample.json', 'r', encoding='utf-8') as openfile:
    json_object = json.load(openfile)

Counting Variables per sentence

In [15]:
# Collect the word statistics of every sentence separately.
for line in doc.sents:
    sentence = line.text

    for token in line:
        # Skip punctuation, whitespace tokens (spaces, tabs, newlines) and
        # anything that looks like a URL; none of them count as words.
        if token.pos_ == "PUNCT" or token.is_space or "http" in token.lemma_:
            continue

        words += 1
        words_length += len(token)

        if token.pos_ == "VERB":
            verbes += 1

        # A word missing from the word-frequency data counts as a hard word.
        if token.lower_ not in json_object:
            hard_words += 1

    # Record exactly one entry per sentence once all of its tokens are seen.
    # Recording on a closing punctuation mark instead would lose a sentence
    # without a final mark (its counts would leak into the next sentence) and
    # would add an extra empty entry for endings such as "?!".
    print(f"The sentence: '{sentence}' has {words} words, {verbes} verbes and {hard_words} hardwords. The combined word length is {words_length}.")
    # Entry layout: [text, word count, combined word length, verb count, hard-word count].
    sentences_list.append([sentence, words, words_length, verbes, hard_words])

    # Resetting variables for the next sentence
    words = 0
    words_length = 0
    verbes = 0
    hard_words = 0

The sentence: 'Hello.' has 1 words, 0 verbes and 0 hardwords. The combined word length is 5.
The sentence: 'This is a sample text.' has 5 words, 0 verbes and 0 hardwords. The combined word length is 17.
The sentence: 'You can write in here whatever you want.' has 8 words, 2 verbes and 0 hardwords. The combined word length is 32.


Calculate the average word length and the share of hard words for each sentence.

In [16]:
# Turn the per-sentence totals into per-word averages.
for sentence in sentences_list:
    word_count = sentence[1]

    # A sentence without words yields 0 instead of dividing by zero; any other
    # error is no longer silently turned into 0.
    if word_count:
        average_word_length = sentence[2] / word_count
        average_hard_words = sentence[4] / word_count
    else:
        average_word_length = 0
        average_hard_words = 0

    # The totals in slots 2 and 4 are overwritten with the averages because the
    # scoring loop reads them from there.
    # NOTE: this makes the cell non-idempotent; re-run the counting cell first
    # before running this one again, otherwise the averages are divided twice.
    sentence[2] = average_word_length
    sentence[4] = average_hard_words
    print(f"The sentence: '{sentence[0]}' has an average word-length of {sentence[2]} and an average of {sentence[4]} hard words per sentence")


The sentence: 'Hello.' has an average word-length of 5.0 and an average of 0.0 hard words per sentence
The sentence: 'This is a sample text.' has an average word-length of 3.4 and an average of 0.0 hard words per sentence
The sentence: 'You can write in here whatever you want.' has an average word-length of 4.0 and an average of 0.0 hard words per sentence


Get the text-complexity score of each sentence.

In [17]:
# Score every sentence with the same weighted sum that is used for the whole text.
for sentence in sentences_list:
    # Entry layout at this point:
    # [text, word count, average word length, verb count, share of hard words].
    length_part = sentence[1] * sentence_length_weight
    verb_part = sentence[3] * average_verbes_weight
    word_length_part = sentence[2] * word_length_weight
    hard_word_part = sentence[4] * average_hard_words_weight

    complexity_of_text = length_part + verb_part + word_length_part + hard_word_part
    print(f"The sentence: '{sentence[0]}' has a text-complexity score of {complexity_of_text}")

The sentence: 'Hello.' has a text-complexity score of 62.0
The sentence: 'This is a sample text.' has a text-complexity score of 50.8
The sentence: 'You can write in here whatever you want.' has a text-complexity score of 78.0
