In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
import pandas as pd
# TODO: Data lecture
# =========================================
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/3CS/relations.csv')

data.head(10)

Mounted at /content/drive


Unnamed: 0,Id,stakholder,information element,relation type
0,0,customer,payment information,obligatory
1,1,customer,personal information,production
2,2,customer,cookies,optional
3,3,customer,accessing personal information,undecided
4,4,customer,additional data,production
5,5,company,log information,production
6,6,company,personal data,optional
7,7,company,additional data,optional
8,8,company,why collecting account information,production
9,9,privacy team,why collecting account information,production


In [4]:
# Encode categorical variables
label_encoders = {}
for column in ["stakholder", "information element", "relation type"]:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [5]:
data.head(10)

Unnamed: 0,Id,stakholder,information element,relation type
0,0,29,215,0
1,1,29,217,2
2,2,29,197,1
3,3,29,185,4
4,4,29,186,2
5,5,26,212,2
6,6,26,216,1
7,7,26,186,1
8,8,26,238,2
9,9,35,238,2


In [6]:
X_text = data["stakholder"].astype(str) + " " + data["information element"].astype(str)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text)
X = tokenizer.texts_to_sequences(X_text)
X = pad_sequences(X)

In [7]:
print(X_text)

0      29 215
1      29 217
2      29 197
3      29 185
4      29 186
        ...  
296    26 209
297    26 219
298    26 228
299    29 193
300    26 193
Length: 301, dtype: object


In [8]:
# Step 3: Prepare Training Data
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, data["relation type"], test_size=0.2, random_state=42)


In [9]:
print(X_train)
print("-------------")
print(X_val)
print("-------------")
print(y_train)
print("-------------")
print(y_val)

[[  1   9]
 [  5  34]
 [  1 182]
 [ 17 109]
 [  2  66]
 [ 26  32]
 [  1 119]
 [  4  93]
 [ 19 166]
 [ 15  45]
 [ 26  22]
 [  1  89]
 [ 18  53]
 [  2 219]
 [  2 128]
 [  1 136]
 [  6   5]
 [  5 183]
 [ 36  84]
 [  2  86]
 [  6   2]
 [  7  60]
 [ 33  56]
 [ 19 189]
 [  5  35]
 [ 13  67]
 [  4  41]
 [  1   1]
 [ 15 113]
 [  2 129]
 [  1 173]
 [  3 106]
 [ 12  44]
 [  3  23]
 [  9 184]
 [  4 239]
 [ 33  57]
 [  1 211]
 [  4  28]
 [ 14  44]
 [  9  69]
 [  7  89]
 [ 70  38]
 [ 24 103]
 [  1 210]
 [  3  57]
 [  4  45]
 [  8 167]
 [  1  73]
 [  6 233]
 [ 11 143]
 [ 14  96]
 [  1  14]
 [  2  63]
 [  1 218]
 [  2 148]
 [  2  16]
 [  1  24]
 [  8 147]
 [  4  40]
 [  5   6]
 [  1  68]
 [  1 140]
 [ 25 194]
 [ 24  53]
 [  8 146]
 [  1 174]
 [ 13 224]
 [ 19  28]
 [  9  31]
 [  6 216]
 [  1 195]
 [  2 127]
 [  6  25]
 [  2 132]
 [ 36  85]
 [ 15  82]
 [ 25 155]
 [ 11 169]
 [  1 171]
 [  3  20]
 [  1 130]
 [  2  15]
 [  6 170]
 [ 66  67]
 [  1 215]
 [ 74  10]
 [  1  70]
 [ 17 108]
 [ 16  20]
 [  1  60]

In [10]:
# Step 4: Model Training
# Define the model architecture
embedding_dim = 100
max_sequence_length = X.shape[1]
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(data["relation type"].unique())

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(num_classes, activation='softmax'))

In [11]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [13]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78088c3d0220>

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 100)            24000     
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 5)                 325       
                                                                 
Total params: 66565 (260.02 KB)
Trainable params: 66565 (260.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
import timeit
from sklearn.metrics import classification_report

# Measure time taken for prediction
start_time = timeit.default_timer()
Y_pred = model.predict(X_val)
prediction_time = timeit.default_timer() - start_time

# Convert predicted labels to original classes
Y_pred_labels = np.argmax(Y_pred, axis=1)
Y_pred_labels = label_encoders["relation type"].inverse_transform(Y_pred_labels)

# Convert true labels to original classes
y_val_labels = label_encoders["relation type"].inverse_transform(y_val)

# Print prediction time
print("Prediction Time:", prediction_time)

# Print classification report
print(classification_report(y_val_labels, Y_pred_labels, zero_division=0))


Prediction Time: 0.5199043960000154
              precision    recall  f1-score   support

  obligatory       0.38      0.50      0.43        10
    optional       0.50      0.36      0.42        25
  production       0.48      0.64      0.55        22
   undecided       1.00      0.25      0.40         4

    accuracy                           0.48        61
   macro avg       0.59      0.44      0.45        61
weighted avg       0.51      0.48      0.47        61



In [20]:
# Define a function to preprocess new stakeholder and information element pairs
def preprocess_new_data(stakeholder, information_element):
    text = str(stakeholder) + " " + str(information_element)
    X_new = tokenizer.texts_to_sequences([text])
    X_new = pad_sequences(X_new, maxlen=max_sequence_length)
    return X_new

# Example of using the model to predict relation for a new pair
new_stakeholder = "user"
new_information_element = "cookies"
X_new = preprocess_new_data(new_stakeholder, new_information_element)
predicted_probabilities = model.predict(X_new)
predicted_relation_index = np.argmax(predicted_probabilities)
predicted_relation = label_encoders["relation type"].inverse_transform([predicted_relation_index])[0]

print("Predicted Relation:", predicted_relation)

Predicted Relation: production


In [17]:
# Define a function to preprocess new stakeholder and information element pairs
def preprocess_new_data(stakeholder, information_element):
    text = str(stakeholder) + " " + str(information_element)
    X_new = tokenizer.texts_to_sequences([text])
    X_new = pad_sequences(X_new, maxlen=max_sequence_length)
    return X_new

# Define a function to predict relation types for a list of pairs
def predict_relation_for_pairs(stakeholders, information_elements):
    predicted_relations = []
    for stakeholder, information_element in zip(stakeholders, information_elements):
        X_new = preprocess_new_data(stakeholder, information_element)
        predicted_probabilities = model.predict(X_new)
        predicted_relation_index = np.argmax(predicted_probabilities)
        predicted_relation = label_encoders["relation type"].inverse_transform([predicted_relation_index])[0]
        predicted_relations.append(predicted_relation)
    return predicted_relations

# Example of using the model to predict relation for a list of pairs
stakeholders = ["company", "user", "admin"]
information_elements = ["log information", "personal data", "access control"]
predicted_relations = predict_relation_for_pairs(stakeholders, information_elements)

for stakeholder, information_element, predicted_relation in zip(stakeholders, information_elements, predicted_relations):
    print(f"Stakeholder: {stakeholder}, Information Element: {information_element}, Predicted Relation: {predicted_relation}")


Stakeholder: company, Information Element: log information, Predicted Relation: production
Stakeholder: user, Information Element: personal data, Predicted Relation: production
Stakeholder: admin, Information Element: access control, Predicted Relation: production
