In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
# Load the dataset
df = pd.read_csv('relations.csv')

# Display the first few rows of the dataframe
print(df.head())

   Id stakholder             information element  relation existence
0   0   customer             payment information                   1
1   1   customer            personal information                   1
2   2   customer                         cookies                   1
3   3   customer  accessing personal information                   0
4   4   customer                 additional data                   1


In [3]:
# Separate features and target
X = df[['information element', 'stakholder']]
y = df['relation existence']

In [4]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Define a logistic regression model
clf = LogisticRegression(max_iter=1000)

# Create a pipeline
pipeline = make_pipeline(vectorizer, clf)

# Balance the classes using RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [5]:
# Fit the model
pipeline.fit(X_resampled['information element'] + ' ' + X_resampled['stakholder'], y_resampled)

In [6]:
# Predict on the test set
y_pred = pipeline.predict(X_test['information element'] + ' ' + X_test['stakholder'])

In [7]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test['information element'] + ' ' + X_test['stakholder'])[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.6698
Precision: 0.9365
Recall: 0.6556
F1 Score: 0.7712
ROC AUC Score: 0.8111


In [8]:
import joblib

# Save the pipeline
from google.colab import drive
drive.mount('/content/drive')
joblib.dump(pipeline, '/content/drive/My Drive/3CS/relation/relation_model.joblib')

Mounted at /content/drive


['/content/drive/My Drive/3CS/relation/relation_model.joblib']

In [10]:
# Load the saved pipeline
pipeline_loaded = joblib.load('/content/drive/My Drive/3CS/relation/relation_model.joblib')

# New pairs for prediction
new_pairs = pd.DataFrame({
    'information element': [ 'personal information'],
    'stakholder': [ 'customer']
})

# Make predictions on new pairs
predictions = pipeline_loaded.predict(new_pairs['information element'] + ' ' + new_pairs['stakholder'])

# Display predictions
print(predictions)

[1]
