## Database creation to generate MCQ exercises

In [1]:
import stanza
import spacy
import pandas as pd
import spacy_stanza
import os
import warnings
# import mlconjug3 
import mlconjug3
warnings.filterwarnings("ignore")

In [2]:
#import excel corpus
df = pd.read_excel("7000 sentences Corpus with IDs.xlsx")

In [3]:
#get path 
os.getcwd()

'C:\\Users\\Administrator\\stanza_models'

In [4]:
#stanza.download(lang='en', model_dir ='./stanza_models')
nlp_en = spacy_stanza.load_pipeline("en", dir = 'C:\\Users\\Administrator\\stanza_models')

2021-12-10 00:01:30 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-12-10 00:01:30 INFO: Use device: cpu
2021-12-10 00:01:30 INFO: Loading: tokenize
2021-12-10 00:01:30 INFO: Loading: pos
2021-12-10 00:01:32 INFO: Loading: lemma
2021-12-10 00:01:33 INFO: Loading: depparse
2021-12-10 00:01:35 INFO: Loading: sentiment
2021-12-10 00:01:37 INFO: Loading: ner
2021-12-10 00:01:41 INFO: Done loading processors!


In [5]:
df_eng = df["English"].dropna()

In [13]:
list_df_noun = []
list_df_verb = []

for phrase in df_eng:

    list_noun = []
    list_verb = []
    doc = nlp_en(phrase)
    for token in doc:
        # Get noun and number
        if token.pos_ == 'NOUN' and len(token.morph.get("Number")) > 0:
            list_noun.append({token.text: token.morph.get("Number")[0]})
        
        # Get verb
        if token.pos_ == 'VERB' and len(token.morph.get("Tense")) > 0:
            prefix = ''
            if len(token.morph.get("Voice")) > 0 and token.morph.get("Voice")[0] == "Pass":
                temp_token = token
                while temp_token.nbor(-1).pos_ == "AUX":
                    prefix = temp_token.nbor(-1).text + ' ' + prefix
                    temp_token = temp_token.nbor(-1)
                    
            list_verb.append({prefix + token.text: token.morph.get("Tense")[0]})
    
    # Add noun
    for noun in list_noun:
        for key, value in noun.items():
            list_df_noun.append([phrase, key, value])
            
    # Add verb
    for verb in list_verb:
        for key, value in verb.items():
            list_df_verb.append([phrase, key, value])

In [27]:
df_noun = pd.DataFrame(list_df_noun, columns=["Phrase", "Noun", "Number"])
df_noun

Unnamed: 0,Phrase,Noun,Number
0,The beauty of the landscape struck the travell...,beauty,Sing
1,The beauty of the landscape struck the travell...,landscape,Sing
2,The beauty of the landscape struck the travell...,travellers,Plur
3,Nobody knows the truth about this affair.,truth,Sing
4,Nobody knows the truth about this affair.,affair,Sing
...,...,...,...
5734,Computer scientists find a job quickly enough.,Computer,Sing
5735,Computer scientists find a job quickly enough.,scientists,Plur
5736,Computer scientists find a job quickly enough.,job,Sing
5737,Shoemakers rapair shoes.,Shoemakers,Plur


In [28]:
#since getting verbs take time, we convert it to csv
df_noun.to_csv("df_noun.csv",index=False)

In [8]:
df_noun = pd.read_csv("df_noun.csv")

In [22]:

df_verb = pd.DataFrame(list_df_verb, columns=["Phrase", "Verb", "Tense"])
df_verb

Unnamed: 0,Phrase,Verb,Tense
0,The beauty of the landscape struck the travell...,struck,Past
1,Nobody knows the truth about this affair.,knows,Pres
2,"In a dictatorship, freedom of expression is li...",is limited,Past
3,His wickedness had no limits.,had,Past
4,His elegance impressed the assembly.,impressed,Past
...,...,...,...
2215,Teachers teach in primary schools.,teach,Pres
2216,The plumber is going to come this afternoon.,going,Pres
2217,He quit his job because his salary was too low.,quit,Past
2218,Computer scientists find a job quickly enough.,find,Pres


In [31]:
# getting the tense and verb takes time, so convert it to csv
df_verb.to_csv('df_verb.csv',index=False)

In [10]:
df_verb = pd.read_csv('df_verb.csv')

In [8]:
#create function to lemmatize the verb 
def get_lemma(x):
    doc=nlp_en(x)
    for token in doc:
        return token.lemma_
    


In [9]:
#test
get_lemma("struck")

'strike'

In [11]:
#apply function to verb column to obtain lemma
df_verb["Lemma"]=df_verb["Verb"].apply(lambda x:get_lemma(x))

In [12]:
#lemma function application takes time as well, so we conver it as csv a well
df_verb.to_csv('df_verb.csv',index=False)

In [11]:
df_verb = pd.read_csv('df_verb.csv')

In [12]:
df_verb

Unnamed: 0,Phrase,Verb,Tense,Lemma
0,The beauty of the landscape struck the travell...,struck,Past,strike
1,Nobody knows the truth about this affair.,knows,Pres,know
2,"In a dictatorship, freedom of expression is li...",is limited,Past,be
3,His wickedness had no limits.,had,Past,have
4,His elegance impressed the assembly.,impressed,Past,impressed
...,...,...,...,...
2215,Teachers teach in primary schools.,teach,Pres,teach
2216,The plumber is going to come this afternoon.,going,Pres,go
2217,He quit his job because his salary was too low.,quit,Past,quit
2218,Computer scientists find a job quickly enough.,find,Pres,find


In [14]:
# choose language to set as default conjugator
default_conjugator = mlconjug3.Conjugator(language='en')


In [15]:
#test instantiated function on verb "know"
test_verb = default_conjugator.conjugate("know")
all_conjugated_forms = test_verb.iterate()
#results gives a list of tuples
print(all_conjugated_forms)

[('indicative', 'indicative present', '1s', 'know'), ('indicative', 'indicative present', '2s', 'know'), ('indicative', 'indicative present', '3s', 'knows'), ('indicative', 'indicative present', '1p', 'know'), ('indicative', 'indicative present', '2p', 'know'), ('indicative', 'indicative present', '3p', 'know'), ('indicative', 'indicative past tense', '1s', 'knew'), ('indicative', 'indicative past tense', '2s', 'knew'), ('indicative', 'indicative past tense', '3s', 'knew'), ('indicative', 'indicative past tense', '1p', 'knew'), ('indicative', 'indicative past tense', '2p', 'knew'), ('indicative', 'indicative past tense', '3p', 'knew'), ('indicative', 'indicative present continuous', '1s', 'knowing'), ('indicative', 'indicative present continuous', '2s', 'knowing'), ('indicative', 'indicative present continuous', '3s', 'knowing'), ('indicative', 'indicative present continuous', '1p', 'knowing'), ('indicative', 'indicative present continuous', '2p', 'knowing'), ('indicative', 'indicative

In [18]:
#code to loop over a list of tuples and access individual elements of each tuple in the list
for index, tuple in enumerate(all_conjugated_forms[:2]):
    #most tuples are of length 4 except 1 so we verify the conditions to not have index out of range
    if len(tuple)==4:
        print(tuple[0])
        print(tuple[1])
        print(tuple[2])
        print(tuple[3])
        print("-------")

indicative
indicative present
1s
know
-------
indicative
indicative present
2s
know
-------


We want to conjugate the lemmatized words in a single function, so to do so we will need to return multiple values
> A clean way to do it is using pandas' and apply function, but the difference , we will need to output more than one column.

> Below is an example 

In [92]:
#https://stackoverflow.com/questions/23586510/return-multiple-columns-from-pandas-apply

In [16]:
#define the language of the conjugator
default_conjugator = mlconjug3.Conjugator(language='en')
"""
The function takes parameter x, in normal pandas'- apply operation where only one output is returned, the input is a variable
But in this case, we don't use the lambda function , so the input is a series
To get individual values instead of series ,we extract the values from the series i.e x = x[0]
We then use conjugate our lemmatize verb and we iterate over the different possibilities using mlconjug3's function iterate()
After that, we iterate over the list of tuples and access the elements of each tuple to correspond to our needs

For example, a tuple has the format ('indicative', 'indicative present', '1s', 'know')

The length is 4:

2nd element(1st index) : verb form
3rd element (2nd index): mode:   1s = 1st person singular
                                 2s = 2nd person singular
                                 1p = 1st person plural
                                 2p = 2nd person plural
                                 
4th element : conjugated verb

Note :  we can return more types of verbs according to the needs of the project

"""
def congugate_lemma_verbs(x):
    x=x[0]
    test_verb = default_conjugator.conjugate(x)
    all_conjugated_forms = test_verb.iterate()
    
    for index, tuple in enumerate(all_conjugated_forms):
        if len(tuple)==4:
            if tuple[1] == "indicative present" and tuple[2] =="1s":
                conjugated_verb_1 = tuple[3]
                
            if tuple[1] == "indicative present" and tuple[2] =="1p":
                conjugated_verb_11 = tuple[3]
            
            if tuple[1] =="indicative past tense" and tuple[2]=="1s":
                conjugated_verb_2 = tuple[3]
        
            if tuple[1] =="indicative past tense" and tuple[2]=="1p":
                conjugated_verb_22 = tuple[3]
                
            if tuple[1] =="indicative present continuous" and tuple[2] =="1s":
                conjugated_verb_3 = tuple[3]
                
            if tuple[1] =="indicative present perfect" and tuple[2]=="1s":
                conjugated_verb_4 = tuple[3]
 
            if tuple[1] =="imperative present" and tuple[2] =="2s":
                conjugated_verb_5 = tuple[3]
        
    return conjugated_verb_1,conjugated_verb_11,conjugated_verb_2,conjugated_verb_22,conjugated_verb_3,conjugated_verb_4,conjugated_verb_5

In [23]:
#apply function by creating 6 new columns
df_verb[["FP singular indicative present","FP plural indicative present","FP singular indicative past",\
         "FP plural indicative past","FP indicative present continuous",\
         "FP indicative present perfect","SP imperative present"]]=\
df_verb[["Lemma"]].apply(congugate_lemma_verbs,axis=1, result_type="expand")

In [19]:
df_verb.head()

Unnamed: 0,Phrase,Verb,Tense,Lemma,FP singular indicative present,FP plural indicative present,FP singular indicative past,FP plural indicative past,FP indicative present continuous,FP indicative present perfect,SP imperative present
0,The beauty of the landscape struck the travell...,struck,Past,strike,strike,strike,struck,struck,striking,struck/stricken,strike
1,Nobody knows the truth about this affair.,knows,Pres,know,know,know,knew,knew,knowing,known,know
2,"In a dictatorship, freedom of expression is li...",is limited,Past,be,am,are,was,were,being,been,be
3,His wickedness had no limits.,had,Past,have,have,have,had,had,having,had,have
4,His elegance impressed the assembly.,impressed,Past,impressed,impressee,impressee,impresseed,impresseed,impresseing,impresseed,impressee
