In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

import pandas as pd


In [18]:
asag_df = pd.read_csv('../Data/merged.csv')

In [4]:
asag_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2442 entries, 0 to 2441
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Question_ID   2442 non-null   float64
 1   Question      2442 non-null   object 
 2   Model_Answer  2442 non-null   object 
 3   Answer        2442 non-null   object 
 4   Score         2442 non-null   float64
dtypes: float64(2), object(3)
memory usage: 95.5+ KB


In [5]:
asag_df['Model_Answer'][0]

'To simulate the behaviour of portions of the desired software product. \n'

In [8]:
# average num of words in the Model_Answer 
total = 0
for i in range(len(asag_df)):
    list = asag_df['Model_Answer'][i].split()
    lenght = len(list)
    total += lenght

avg = total/len(asag_df)
print(avg)


14.617936117936118


In [9]:
asag_df['Answer'] = asag_df['Answer'].apply(lambda x: x.lower())
asag_df['Model Answer'] = asag_df['Model_Answer'].apply(lambda x: x.lower())

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

from sentence_transformers import SentenceTransformer

In [12]:
def process_bert_similarity(base_answer, sample_answer):
    model = SentenceTransformer('bert-base-nli-mean-tokens')

    tokenize_base_answer = sent_tokenize(base_answer)
    base_answer_embedding = model.encode(tokenize_base_answer)
    base_answer_embedding_mean = np.mean(np.array(base_answer_embedding), axis=0)

    tokenize_sample_answer = sent_tokenize(sample_answer)
    sample_answer_embedding = model.encode(tokenize_sample_answer)
    sample_answer_embedding_mean = np.mean(np.array(sample_answer_embedding), axis=0)

    cosine_similarity_score = cosine_similarity([base_answer_embedding_mean], [sample_answer_embedding_mean]).flatten()

    print(cosine_similarity_score)

In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [41]:
# Spacy for entity recognition in the answers
import spacy

# recognize entities in the asag_df
nlp = spacy.load('en_core_web_sm')


def process_spacy_entities(answer):
    print(answer)
    doc = nlp(answer)
    entities = [(X.text, X.label_) for X in doc.ents]
    return entities

process_spacy_entities("There are three types of software engineering: Agile, Waterfall and Spiral")


There are three types of software engineering: Agile, Waterfall and Spiral


[('three', 'CARDINAL'), ('Spiral', 'PRODUCT')]

In [13]:
modelAnswer = asag_df['Model_Answer'][0]
sampleAnswer = asag_df['Answer'][0]

process_bert_similarity(modelAnswer, sampleAnswer)

[0.76450104]


In [19]:
print(asag_df['Question'][0])
print(asag_df['Model_Answer'][0])
print(asag_df['Answer'][0])
print(asag_df['Score'][0])

What is the role of a prototype program in problem solving?

To simulate the behaviour of portions of the desired software product. 

High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.<br><br>

3.5


In [81]:
# define column Answer_Embeddings - type numpyArray
asag_df['Answer_Embeddings'] = [[] for i in range(len(asag_df))]


In [43]:
dataFrame = pd.DataFrame({'Cosine_Similarity'})

for i in range(100):
    # add a row to the dataframe
    cosine_simi = 9.44
    dataFrame = dataFrame.append({'Cosine_Similarity': cosine_simi}, ignore_index=True)


10.0


In [67]:
# Regression Model to predict the score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(asag_df['Answer'], asag_df['Score'], test_size=0.2, random_state=42)


# tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)


# convert the text to sequence of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


# pad the sequence to the same length
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# convert the score to one-hot encoding
y_train_onehot = to_categorical(y_train)
y_test_onehot = to_categorical(y_test)

print(y_train_onehot.shape)


# create the model
model = Sequential()
model.add(Embedding(input_dim=100, output_dim=2, input_length=100))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(y_train_onehot.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())


# train the model
model.fit(X_train_pad, y_train_onehot, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


# evaluate the model
y_pred = model.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)


# calculate the accuracy
accuracy = r2_score(y_test, y_pred)
print(accuracy)


# calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(mse)

(1953, 6)
Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_21 (Embedding)    (None, 100, 2)            200       
                                                                 
 bidirectional_18 (Bidirecti  (None, 128)              34304     
 onal)                                                           
                                                                 
 dropout_18 (Dropout)        (None, 128)               0         
                                                                 
 dense_19 (Dense)            (None, 6)                 774       
                                                                 
Total params: 35,278
Trainable params: 35,278
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
E

In [72]:
print(y_train_onehot)

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


In [73]:
#predict for new Answer
new_answer = "The model is a good model"
new_answer_seq = tokenizer.texts_to_sequences([new_answer])
new_answer_pad = pad_sequences(new_answer_seq, maxlen=100)


# predict the score
score = model.predict(new_answer_pad)
print(score)


score = np.argmax(score, axis=0)

[[0.01337584 0.02832213 0.08284079 0.1348912  0.13908458 0.6014855 ]]
