# virtueller Requirements Engineer Experiment
https://parts-of-speech.info

In [1]:
import os

In [2]:
data_dir = "/Users/vs21/hicss23/Data/"
data_files = []

for file_name in os.listdir(data_dir):
    if ".txt" in file_name:
        data_files.append(data_dir + file_name)

data_files


['/Users/vs21/hicss23/Data/akita_3.txt',
 '/Users/vs21/hicss23/Data/akita_2.txt',
 '/Users/vs21/hicss23/Data/akita_1.txt',
 '/Users/vs21/hicss23/Data/akita_4.txt']

In [3]:
data_content = []
for data_file in data_files:
    with open (data_file, 'r') as file:
        data_content.append(file.read().replace('\n', '').replace('.', ' ').replace(',', ' '))

In [4]:
data_content[2]

'The Akita (秋田  Akita  Japanese pronunciation: [akʲita]) is a Japanese dog breed of large size  Originating from the mountains of northern Japan  the Akita has a short double coat similar to that of many other northern spitz breeds  Historically  they were used by samurai for guarding  fighting and the hunting of bears The Akita is a powerful  independent  and dominant breed  commonly aloof with strangers  but affectionate and deeply loyal with its family  As a breed  Akitas are generally hardy  The two separate varieties of Akita are a pure Japanese strain  called Akita Inu or Akita-ken  and a larger mixed strain  commonly referred to as the "American Akita" [2] However  it is subject to debate as to whether the Akita strains are distinct  or if they constitute one  single breed [3][4][5]Debate exists among fanciers whether these are two separate breeds of Akitas  As of 2020  the American Kennel Club [6] now considers American and Japanese Akitas to be two separate breeds  no longer a

### stanford core nlp
method receives a sentence and returns : Normal Text, lemmatized text, POS.

In [5]:
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse') # This sets up a default neural pipeline in English


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-05-27 13:28:45 INFO: Downloading default packages for language: en (English) ...
2023-05-27 13:28:46 INFO: File exists: /Users/vs21/stanza_resources/en/default.zip
2023-05-27 13:28:49 INFO: Finished downloading models and saved to /Users/vs21/stanza_resources.
2023-05-27 13:28:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-05-27 13:28:50 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2023-05-27 13:28:50 INFO: Use device: cpu
2023-05-27 13:28:50 INFO: Loading: tokenize
2023-05-27 13:28:50 INFO: Loading: pos
2023-05-27 13:28:50 INFO: Loading: lemma
2023-05-27 13:28:50 INFO: Loading: depparse
2023-05-27 13:28:50 INFO: Done loading processors!


In [6]:
test_sentence = "Documents contained the Name of the consultant."
def core_nlp(sentence):
    # method receives a sentence and returns : Normal Text, lemmatized text, POS.
    result_list = []
    doc = nlp(sentence)
    for sentence in doc.sentences:
        for word in sentence.words:
            result_list.append([word.text, word.lemma, word.pos, word.deprel, word.head])
            # deprel doku : https://stanfordnlp.github.io/stanza/depparse.html
    return result_list


In [7]:
nlp_results = []
for content in data_content:
    nlp_results.append(core_nlp(content))

len(nlp_results)

4

### get all nouns (potential candidates for classes)
extracted from the original text

In [8]:
def get_all_nouns(nlp_results):
    nouns = []
    for word in nlp_results:
        normal_word = word[0]
        lemma_word = word[1]
        pos_word = word[2]
        deprel_word = word[3]
        if pos_word == "NOUN":
            nouns.append(lemma_word)
    return list(set(nouns))

In [9]:
data_nouns = []
for result in nlp_results:
    data_nouns.append(get_all_nouns(result))

len(data_nouns)

4

In [10]:
len(data_nouns[3])

114

In [12]:
data_nouns[3]

['damage',
 'osteosarcoma',
 'exercise',
 'blood',
 'ceremony',
 'hip',
 'statue',
 'today',
 'stranger',
 'entropion',
 'anemia',
 'popularity',
 'year',
 'CHD',
 'hypoplasia',
 'game',
 'train',
 'chance',
 'water',
 'leash',
 'span',
 'pet',
 'praminor',
 'akc',
 'pemphigus',
 'loyalty',
 'hand',
 'healthmajor',
 'type',
 'coat',
 'epilepsy',
 'undercoat',
 'test',
 'torsion',
 'pattern',
 'master',
 'thyroidLife',
 'drinker',
 'size',
 'bone',
 'concern',
 'household',
 'weather',
 'mask',
 'guard',
 'area',
 'monument',
 'dysplasia',
 'century',
 'akita',
 'eye',
 'training',
 'snow',
 'member',
 'gait',
 'pinto',
 'effort',
 'tomb',
 'job',
 'cell',
 'result',
 'police',
 'day',
 'combination',
 'week',
 'hypothyroidismoccasionally',
 'patellar',
 'companion',
 'house',
 'ligament',
 'pra',
 'breeder',
 'homage',
 'protector',
 'temperament',
 'substance',
 'vkh',
 'station',
 'elbow',
 'work',
 'cruciate',
 'hair',
 'ancestor',
 'syndrome',
 'rupture',
 'admirer',
 'hiking',
 'r

# One hot encoding

In [13]:
all_nouns = [elem for sublist in data_nouns for elem in sublist]
vocabulary = list(set([elem for sublist in data_nouns for elem in sublist]))
len(vocabulary)

540

In [14]:
# mutual nouns:

len(all_nouns) - len(list(set(vocabulary)))


189

In [15]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def integer_encoding(vocab):
    # function that takes a list of strings + the entire vocabulary and outputs a one hot encoding
    values = array(vocab)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    return {values[i]:integer_encoded[i] for i, val in enumerate(vocab)}



In [16]:
encoding = integer_encoding(vocabulary)

In [17]:
len(encoding)

540

In [18]:
def create_one_hot_vector(input_data, encoding):
    # create empty vector
    collect = [0 for i in range(len(encoding))]
    for word in input_data:
        collect[encoding[word]] = 1
    return collect

In [19]:
input_data = ["toy", "toy"]
# input_data = data_nouns[3]

test_function = create_one_hot_vector(input_data, encoding)
test_function.count(1)
# test_function.count(0)

1

# Distance between two one hot vectors

In [20]:
import numpy as np
from numpy.linalg import norm

X = np.array(create_one_hot_vector(["evening", "mask"], encoding))
Y = np.array(create_one_hot_vector(["toy", "homage"], encoding))

cosine = np.dot(X,Y)/(norm(X)*norm(Y))
cosine

0.0

# Requirement Engineer

In [37]:
class r_engineer:
    def __init__(self, nouns):
        self.nouns = nouns
    
    def get_nouns(self):
        return self.nouns

    def get_mutual_nouns(self, customer_nouns):
        return list(set(customer_nouns).intersection(self.nouns))


In [40]:
r_eng = r_engineer(data_nouns[2])

# r_eng.get_nouns()
mutual_nouns = r_eng.get_mutual_nouns(data_nouns[1])
len(mutual_nouns)

41

# Customer

In [100]:
class customer:
    def __init__(self, nouns):
        self.nouns = nouns
    
    def get_nouns(self):
        return self.nouns
    
    def create_final_nouns(self, mutual_nouns):
        final_nouns = self.nouns + mutual_nouns
        self.final_nouns = list(set(final_nouns))

        return self.final_nouns
    
    def append_to_final_nouns(self, r_eng_nouns, cooperation_factor):
        self.final_nouns = self.final_nouns + r_eng_nouns[:int(len(r_eng_nouns) * cooperation_factor)]
        return self.final_nouns


In [102]:
customer_1 = customer(data_nouns[0])
customer_2 = customer(data_nouns[1])
customer_3 = customer(data_nouns[3])

print(customer_1.nouns)
print(customer_2.nouns)
print(customer_3.nouns)

len(customer_1.nouns)

final_nouns = customer_1.create_final_nouns(["bmw"])
print(len(final_nouns))

final_nouns = customer_1.append_to_final_nouns(["test" + str(i) for i in range(10)], 1)
print(len(final_nouns))

['puppy', 'owner', 'visit', 'chow', 'Exercise', 'classification', 'statue', 'today', 'stranger', 'appearance', 'section', 'drool', 'popularity', 'year', 'patch', 'game', 'requirement', 'averageenergy', 'government', 'pound', 'color', 'mediumcharacteristics', 'weight', 'akc', 'show', 'loyalty', 'muzzle', 'triangle', 'coat', 'workingukc', 'undercoat', 'course', 'aggression', 'characteristic', 'brindle', 'heart', 'moderatebred', 'size', 'person', 'energy', 'breeding', 'mask', 'instinct', 'country', 'guard', 'purpose', 'personality', 'lowtendency', 'monument', 'ear', 'moderateclub', 'crossing', 'history', 'century', 'room', 'cat', 'name', 'bear', 'eye', 'training', 'hunting', 'happiness', 'parent', 'Recognition', 'prevalence', 'strength', 'kilogram', 'member', 'pinto', 'respect', 'bark', 'child', 'leg', 'home', 'job', 'chest', 'courage', 'police', 'Range', 'need', 'background', 'pair', 'protection', 'companion', 'status', 'symbol', 'averagelongevity', 'life', 'base', 'hair', 'top', 'neck',

139

# Interaction

### pseudocode:

    def interaction(r_engineer (obj), customer (obj), cooperation_factor):
        customer_nouns = customer.get_nouns()
        mutual_nouns = r_engineer.get_mutual_Nouns(customer_nouns)
        final_nouns = customer.create_final_nouns(mutual_nouns)
        r_engineer_nouns = r_engineer.get_nouns()
        final_nouns = customer.append_to_final_nouns(r_engineer_nouns, cooperation_factor)

        return final_nouns # list with nouns

# Experiment

# analyze results