In [1]:
# Uninstall the current version of TensorFlow
#!pip uninstall tensorflow -y

# Install TensorFlow 2.9.0
#!pip install tensorflow==2.9.0
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


2.9.0


In [2]:
from google.colab import drive

# Expose Google Drive inside the Colab runtime so the dataset path below resolves.
drive.mount('/content/drive')

# Load the stakeholder / information-element relation table and preview its first rows.
relations_csv_path = "/content/drive/MyDrive/3CS/relations.csv"
data = pd.read_csv(relations_csv_path)
data.head(10)

Mounted at /content/drive


Unnamed: 0,Id,stakholder,information element,relation type
0,0,customer,payment information,obligatory
1,1,customer,personal information,production
2,2,customer,cookies,optional
3,3,customer,accessing personal information,undecided
4,4,customer,additional data,production
5,5,company,log information,production
6,6,company,personal data,optional
7,7,company,additional data,optional
8,8,company,why collecting account information,production
9,9,privacy team,why collecting account information,production


In [3]:
# Label-encode the three categorical columns, keeping each fitted encoder so the
# integer codes can be mapped back to the original strings later.
# NOTE: the columns are overwritten in place, so re-running this cell on the same
# `data` frame would refit the encoders on the integer codes, not the original strings.
label_encoders = {}
for column in ("stakholder", "information element", "relation type"):
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Build one "<stakeholder code> <information element code>" string per row.
# Both columns already hold integer codes at this point, so the tokenizer vocabulary
# fitted below consists of those codes rather than the original words.
stakeholder_codes = data["stakholder"].astype(str)
info_element_codes = data["information element"].astype(str)
X_text = stakeholder_codes + " " + info_element_codes

# Turn each string into a sequence of token ids, then pad to a common length.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text)
X = pad_sequences(tokenizer.texts_to_sequences(X_text))

In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    data["relation type"],
    test_size=0.2,
    random_state=42,
)


In [5]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and the other random operations used during training are
# repeatable across "Restart & Run All" (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

# Define the model architecture
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1  # +1 because token ids start at 1; 0 is the padding value
max_length = X.shape[1]
num_classes = len(data["relation type"].unique())

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True keeps the per-step outputs that the second LSTM consumes.
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One softmax probability per encoded relation type.
    Dense(num_classes, activation='softmax'),
])

# Compile the model. The sparse loss is used because the targets are integer class ids.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [6]:
# Checkpoint callback: write the model to best_model.h5 only when the monitored
# validation loss improves.
model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Show the layer-by-layer output shapes and parameter counts.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 100)            24000     
                                                                 
 bidirectional (Bidirectiona  (None, 2, 256)           234496    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 5)                 3

In [7]:
# Fit for 20 epochs in mini-batches of 32, evaluating on the validation split and
# passing the checkpoint callback defined earlier.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[model_checkpoint],
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
import timeit
from sklearn.metrics import classification_report

# Time one batched prediction pass over the validation set.
start_time = timeit.default_timer()
Y_pred = model.predict(X_val)
prediction_time = timeit.default_timer() - start_time

# Map class ids back to the original relation-type strings, for both the
# predictions (argmax over the class axis) and the ground truth.
relation_encoder = label_encoders["relation type"]
Y_pred_labels = relation_encoder.inverse_transform(np.argmax(Y_pred, axis=1))
y_val_labels = relation_encoder.inverse_transform(y_val)

# Report the elapsed prediction time in seconds.
print("Prediction Time:", prediction_time)

# Per-class precision / recall / F1; zero_division=0 reports 0 for classes that
# were never predicted.
print(classification_report(y_val_labels, Y_pred_labels, zero_division=0))

Prediction Time: 1.889587307999932
              precision    recall  f1-score   support

  obligatory       0.56      0.50      0.53        10
    optional       0.43      0.24      0.31        25
  production       0.45      0.77      0.57        22
   undecided       0.00      0.00      0.00         4

    accuracy                           0.46        61
   macro avg       0.36      0.38      0.35        61
weighted avg       0.43      0.46      0.42        61



In [9]:
# Persist the in-memory model (its state after the last training epoch) to Drive as HDF5.
trained_model_path = '/content/drive/MyDrive/3CS/relation_model/relation_extraction_model.h5'
model.save(trained_model_path)

In [10]:
# Persist the preprocessing state next to the model: the tokenizer vocabulary and
# the fitted LabelEncoders. Both are Python dicts, which np.save stores as pickled
# objects, so they must be loaded back with allow_pickle=True.
for artifact_path, artifact in (
    ('/content/drive/MyDrive/3CS/relation_model/tokenizer_word_index.npy', tokenizer.word_index),
    ('/content/drive/MyDrive/3CS/relation_model/label_encoders.npy', label_encoders),
):
    np.save(artifact_path, artifact)

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer

# Load the trained model
model = load_model('/content/drive/MyDrive/3CS/relation_model/relation_extraction_model.h5')

# Load the tokenizer vocabulary and the fitted LabelEncoders.
# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects, which can
# execute code. Only load .npy files that you produced yourself.
tokenizer_word_index = np.load('/content/drive/MyDrive/3CS/relation_model/tokenizer_word_index.npy', allow_pickle=True).item()
label_encoders = np.load('/content/drive/MyDrive/3CS/relation_model/label_encoders.npy', allow_pickle=True).item()

# Rebuild a Tokenizer from the saved vocabulary. The prediction helpers in this
# notebook read a global `tokenizer`; without this step they only work when that
# variable is still alive from the training cells (hidden kernel state).
tokenizer = Tokenizer()
tokenizer.word_index = tokenizer_word_index
tokenizer.index_word = {index: word for word, index in tokenizer_word_index.items()}

# Function to preprocess new pairs of stakeholder and information elements
def preprocess_new_pairs(stakeholders, info_elements):
    """Build the model's text input for new (stakeholder, information element) pairs.

    During training both columns were label-encoded *before* tokenization, so the
    tokenizer vocabulary consists of integer codes ("<code> <code>"), not words.
    Raw strings therefore have to be mapped through the same fitted encoders here;
    otherwise every token is out-of-vocabulary and the model only ever sees padding.

    Parameters
    ----------
    stakeholders : pandas.Series
        Raw stakeholder names.
    info_elements : pandas.Series
        Raw information-element names, aligned with ``stakeholders``.

    Returns
    -------
    pandas.Series
        One "<stakeholder code> <information element code>" string per pair. A value
        that was not seen during training is replaced by an empty token, which the
        tokenizer drops (it ends up as padding), and a warning is emitted.
    """
    import warnings

    def to_codes(values, column):
        # A LabelEncoder code is the label's position in `classes_`; a dict lookup
        # yields the same code but lets unseen labels fall through instead of raising.
        lookup = {
            str(label): str(code)
            for code, label in enumerate(label_encoders[column].classes_)
        }
        labels = values.astype(str)
        unseen = sorted(set(labels) - set(lookup))
        if unseen:
            warnings.warn(
                f"{column} values not seen during training are treated as padding: {unseen}"
            )
        return labels.map(lambda label: lookup.get(label, ""))

    X_text = to_codes(stakeholders, "stakholder") + " " + to_codes(info_elements, "information element")
    return X_text

# Function to tokenize and pad sequences
def tokenize_and_pad_sequences(text_data, tokenizer, max_length):
    """Convert texts to token-id sequences and pad them to ``max_length``.

    Parameters
    ----------
    text_data : iterable of str
        Texts to convert.
    tokenizer : object
        Tokenizer exposing ``texts_to_sequences``.
    max_length : int
        Target sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The padded sequences as returned by ``pad_sequences``.
    """
    token_sequences = tokenizer.texts_to_sequences(text_data)
    return pad_sequences(token_sequences, maxlen=max_length)

# Function to predict relation type between new pairs
def predict_relation_type(stakeholders, info_elements, max_length):
    """Predict the relation type for each (stakeholder, information element) pair.

    Parameters
    ----------
    stakeholders, info_elements : pandas.Series
        Aligned stakeholder and information-element values.
    max_length : int
        Sequence length the inputs are padded to before prediction.

    Returns
    -------
    Array of relation-type labels, one per pair, decoded with the
    "relation type" label encoder.
    """
    # Text -> padded token ids, using the notebook-level tokenizer.
    pair_texts = preprocess_new_pairs(stakeholders, info_elements)
    model_input = tokenize_and_pad_sequences(pair_texts, tokenizer, max_length)

    # One row of class probabilities per pair; argmax over axis 1 picks the class id.
    class_probabilities = model.predict(model_input)
    predicted_class_ids = np.argmax(class_probabilities, axis=1)

    # Class ids -> original relation-type strings.
    return label_encoders["relation type"].inverse_transform(predicted_class_ids)


# Example usage:
new_stakeholders = pd.Series(["customer", "company"])
# "payment information" is spelled as in the training data; the previous
# misspelling could never match a label seen during training.
new_info_elements = pd.Series(["payment information", "personal information"])
# Take the sequence length from the loaded model instead of hard-coding it, so the
# example keeps working if the model is retrained with a different input length.
max_length = model.input_shape[1]
predicted_relations = predict_relation_type(new_stakeholders, new_info_elements, max_length)
print("Predicted Relations:", predicted_relations)


Predicted Relations: ['production' 'production']


In [12]:
# Define a function to preprocess new stakeholder and information element pairs
def preprocess_new_data(stakeholder, information_element):
    """Encode one (stakeholder, information element) pair as a padded token-id row.

    The tokenizer was fitted on label-encoded integer codes rather than on the raw
    words, so each value is first mapped to its LabelEncoder code. Feeding the raw
    words instead makes every token out-of-vocabulary and the model input all zeros.

    Parameters
    ----------
    stakeholder : str
        Raw stakeholder name.
    information_element : str
        Raw information-element name.

    Returns
    -------
    Padded sequence array with a single row, of length ``max_length`` (notebook global).
    A value that was not seen during training is dropped (it becomes padding) and a
    warning is emitted.
    """
    import warnings

    def to_code(value, column):
        # A LabelEncoder code is the label's position in `classes_`.
        known_labels = [str(label) for label in label_encoders[column].classes_]
        label = str(value)
        if label in known_labels:
            return str(known_labels.index(label))
        warnings.warn(f"{column} {label!r} was not seen during training; it is treated as padding.")
        return ""

    text = to_code(stakeholder, "stakholder") + " " + to_code(information_element, "information element")
    X_new = tokenizer.texts_to_sequences([text])
    X_new = pad_sequences(X_new, maxlen=max_length)
    return X_new

# Define a function to predict relation types for a list of pairs
def predict_relation_for_pairs(stakeholders, information_elements):
    """Predict the relation type for each (stakeholder, information element) pair.

    All pairs are preprocessed first and sent to the model in a single batched
    ``predict`` call rather than one call per pair, which avoids paying the
    per-call overhead of ``model.predict`` for every item.

    Parameters
    ----------
    stakeholders : iterable of str
        Stakeholder names.
    information_elements : iterable of str
        Information-element names; pairs are formed with ``zip``, so the longer
        input is truncated to the length of the shorter one.

    Returns
    -------
    list
        Predicted relation-type labels, one per pair (empty list for empty input).
    """
    encoded_pairs = [
        preprocess_new_data(stakeholder, information_element)
        for stakeholder, information_element in zip(stakeholders, information_elements)
    ]
    if not encoded_pairs:
        # np.vstack rejects an empty list; nothing to predict anyway.
        return []

    # Each preprocessed pair is a single-row array; stack them into one batch.
    X_batch = np.vstack(encoded_pairs)
    predicted_probabilities = model.predict(X_batch)
    predicted_relation_indices = np.argmax(predicted_probabilities, axis=1)
    predicted_relations = label_encoders["relation type"].inverse_transform(predicted_relation_indices)
    return list(predicted_relations)

# Example: predict the relation type for several (stakeholder, information element) pairs.
stakeholders = [
    "user",
    "user",
    "admin",
    "customer",
    "developer",
    "manager",
    "user",
]
information_elements = [
    "log information",
    "personal data",
    "access control",
    "financial records",
    "product specifications",
    "security protocols",
    "user feedback",
]

predicted_relations = predict_relation_for_pairs(stakeholders, information_elements)

# Print one line per pair with its predicted relation.
for stakeholder, information_element, predicted_relation in zip(
    stakeholders, information_elements, predicted_relations
):
    print(f"Stakeholder: {stakeholder}, Information Element: {information_element}, Predicted Relation: {predicted_relation}")

Stakeholder: user, Information Element: log information, Predicted Relation: production
Stakeholder: user, Information Element: personal data, Predicted Relation: production
Stakeholder: admin, Information Element: access control, Predicted Relation: production
Stakeholder: customer, Information Element: financial records, Predicted Relation: production
Stakeholder: developer, Information Element: product specifications, Predicted Relation: production
Stakeholder: manager, Information Element: security protocols, Predicted Relation: production
Stakeholder: user, Information Element: user feedback, Predicted Relation: production


In [13]:
# Define a function to preprocess new stakeholder and information element pairs
# NOTE(review): this redefines `preprocess_new_data`, which is already defined in an
# earlier cell of this notebook; consider keeping a single definition.
def preprocess_new_data(stakeholder, information_element):
    """Encode one (stakeholder, information element) pair as a padded token-id row.

    The tokenizer was fitted on label-encoded integer codes rather than on the raw
    words, so each value is first mapped to its LabelEncoder code. Feeding the raw
    words instead makes every token out-of-vocabulary and the model input all zeros.

    Parameters
    ----------
    stakeholder : str
        Raw stakeholder name.
    information_element : str
        Raw information-element name.

    Returns
    -------
    Padded sequence array with a single row, of length ``max_length`` (notebook global).
    A value that was not seen during training is dropped (it becomes padding) and a
    warning is emitted.
    """
    import warnings

    def to_code(value, column):
        # A LabelEncoder code is the label's position in `classes_`.
        known_labels = [str(label) for label in label_encoders[column].classes_]
        label = str(value)
        if label in known_labels:
            return str(known_labels.index(label))
        warnings.warn(f"{column} {label!r} was not seen during training; it is treated as padding.")
        return ""

    text = to_code(stakeholder, "stakholder") + " " + to_code(information_element, "information element")
    X_new = tokenizer.texts_to_sequences([text])
    X_new = pad_sequences(X_new, maxlen=max_length)
    return X_new

# Example: predict the relation type for a single new pair.
new_stakeholder = "company"
new_information_element = "log information"

# Encode the pair and run the model on the resulting single-row input.
X_new = preprocess_new_data(new_stakeholder, new_information_element)
predicted_probabilities = model.predict(X_new)

# The input has one row, so the flat argmax is the predicted class id; decode it
# back to the original relation-type string.
predicted_relation_index = np.argmax(predicted_probabilities)
relation_encoder = label_encoders["relation type"]
predicted_relation = relation_encoder.inverse_transform([predicted_relation_index])[0]
print("Predicted Relation:", predicted_relation)

Predicted Relation: production
