## **1. Init**

In [None]:
import json

# TODO: Specify the path to the JSON file
json_file_path = "sd_pairs.json"

# Read the list of pairs. Later cells read index 1 and index 4 of every entry
# as image paths, index 3 as the label and index 0 as the set identifier.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    loaded_pairs = json.load(json_file)

## **2. Compute Contextual Deviation**

### **Setup environment**

In [None]:
!pip install transformers torch

In [None]:
import torch

# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### **Use CLIP to encode images**

In [None]:
from transformers import CLIPModel, CLIPProcessor

# The image preprocessor and the model weights are loaded from the same
# checkpoint; only the model is moved to the selected device.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
clip_model = clip_model.to(device)

### **Calculate deviation**

In [None]:
def cosine_similarity(image1, image2):
    """Return the cosine similarity between the CLIP embeddings of two images.

    Both images are preprocessed with the notebook-level ``clip_processor``,
    embedded with ``clip_model.get_image_features`` on ``device``, and the two
    embeddings are compared along their last dimension.

    Args:
        image1: First image, in a form accepted by ``clip_processor``
            (the notebook passes PIL images).
        image2: Second image, in the same form as ``image1``.

    Returns:
        float: The cosine similarity of the two embeddings as a Python float.
    """
    # Inference only: without no_grad every call records an autograd graph,
    # which wastes memory and leaves the features attached to that graph.
    with torch.no_grad():
        image_tensor1 = clip_processor(images=image1, return_tensors="pt").to(device)
        image_tensor2 = clip_processor(images=image2, return_tensors="pt").to(device)

        image_feature1 = clip_model.get_image_features(image_tensor1.pixel_values)
        image_feature2 = clip_model.get_image_features(image_tensor2.pixel_values)

        similarity = torch.nn.functional.cosine_similarity(image_feature1, image_feature2, dim=-1)

    # A single image per call yields a one-element tensor, so .item() is valid.
    return similarity.item()

In [None]:
from PIL import Image

# Score every pair: index 1 is opened as the original image and index 4 as the
# generated image. The deviation (1 - cosine similarity) is appended to the pair.
# NOTE: re-running this cell appends another value to every pair.
for pair in loaded_pairs:
    # Context managers close both image files as soon as the pair is scored,
    # instead of leaving one open handle per image for the whole loop.
    with Image.open(pair[1]) as original_image, Image.open(pair[4]) as generated_image:
        similarity_score = cosine_similarity(original_image, generated_image)

    deviation_value = 1 - similarity_score

    pair.append(deviation_value)

In [None]:
# TODO: Specify the path where you want to save the JSON file
json_file_path = "sd_deviation.json"

# Persist the pairs, each now carrying its appended deviation value.
with open(json_file_path, mode='w') as json_file:
    json.dump(obj=loaded_pairs, fp=json_file)

print(f"Deviation saved to {json_file_path}")

## **3. Context-Matching Model**

### **Setup**

In [None]:
import json

# TODO: Specify the path to the JSON file
json_file_path = "sd_deviation.json"

# Reload the pairs together with the deviation values written by section 2.
with open(json_file_path, mode='r') as json_file:
    pairs = json.load(fp=json_file)

In [None]:
X = []
y = []

# Collect one (deviation, label) sample per usable pair.
for pair in pairs:
    deviation = pair[5]
    label = pair[3]

    # Keep a pair only when index 5 holds a float and the label is one the
    # later label_mapping can encode. Any other label would raise a KeyError
    # there; section 3.2 applies the same OOC/NOOC filter.
    if type(deviation) is float and label in ('OOC', 'NOOC'):
        X.append(deviation)
        y.append(label)

In [None]:
import pandas as pd

# Tabulate labels next to their deviation values and preview the first rows.
df = pd.DataFrame(data={'Label': y, 'Deviation Value': X})
df.head(n=5)

### **3.1 Train on the whole dataset**

#### **Split into training and validation sets**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Encode the string labels as integers: NOOC -> 0, OOC -> 1.
label_mapping = dict(NOOC=0, OOC=1)
y = np.array(list(map(label_mapping.__getitem__, y)))

# The single feature is reshaped into one column (n_samples, 1).
X = np.array(X).reshape(-1, 1)

# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### **Test algorithms**

##### **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 80/20 split with a fixed seed, so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Support vector classifier with the library's default settings.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Score the classifier on the held-out samples.
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

##### **ANN**

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat from run to run. (Some GPU kernels
# may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# 80/20 split with a fixed seed, so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The encoder is fitted on the training labels only and then applied to the
# held-out labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Small fully connected network: one input feature -> 64 -> 32 -> one sigmoid
# output trained with binary cross-entropy.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The held-out split is passed as validation data, so its loss and accuracy
# are reported after every epoch; it is the same data evaluated below.
model.fit(X_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test, y_test_encoded))

test_loss, test_accuracy = model.evaluate(X_test, y_test_encoded)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

### **3.2 Train on per-set mean deviation**

#### **Calculate mean**

In [None]:
from collections import defaultdict

# Running totals per set identifier (index 0 of each entry); a fresh record is
# created the first time an identifier is looked up.
sets_data = defaultdict(lambda: {'sum': 0, 'count': 0, 'label': ''})

for sublist in pairs:
    identifier = sublist[0]
    label = sublist[3]

    # Entries labelled anything other than 'OOC' or 'NOOC' are skipped.
    if label not in ['OOC', 'NOOC']:
        continue

    value = sublist[-1]  # last element of the entry
    entry = sets_data[identifier]
    entry['sum'] += value
    entry['count'] += 1
    entry['label'] = label  # the label of the last entry seen for a set wins

# Mean value and label per identifier; a zero count yields a mean of 0.
means = {}
for identifier, data in sets_data.items():
    if data['count'] > 0:
        mean = data['sum'] / data['count']
    else:
        mean = 0
    means[identifier] = {'mean': mean, 'label': data['label']}

In [None]:
import numpy as np

# One sample per set, in the insertion order of sets_data: the mean of the
# accumulated values (0 when the count is zero) and the stored label.
X_mean = [
    stats['sum'] / stats['count'] if stats['count'] > 0 else 0
    for stats in sets_data.values()
]

y = np.array([stats['label'] for stats in sets_data.values()])

In [None]:
import pandas as pd

# Tabulate the per-set labels next to their mean values and preview the first rows.
df = pd.DataFrame(data={'Label': y, 'Deviation Value': X_mean})
df.head(n=5)

#### **Split into training and validation sets**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Encode the string labels as integers: NOOC -> 0, OOC -> 1.
label_mapping = dict(NOOC=0, OOC=1)
y = np.array(list(map(label_mapping.__getitem__, y)))

# The per-set means are reshaped into one column (n_samples, 1).
X = np.array(X_mean).reshape(-1, 1)

# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### **Test algorithms**

##### **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 80/20 split with a fixed seed, so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Support vector classifier with the library's default settings. This rebinds
# svm_model, the name that the "Save model" cell writes to disk.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Score the classifier on the held-out samples.
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

##### **ANN**

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat from run to run. (Some GPU kernels
# may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# 80/20 split with a fixed seed, so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The encoder is fitted on the training labels only and then applied to the
# held-out labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Small fully connected network: one input feature -> 64 -> 32 -> one sigmoid
# output trained with binary cross-entropy.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile, fit and evaluate the network defined just above. The earlier code
# called compile/fit on an undefined name (nn_model), which raised a NameError
# and left `model` uncompiled for the evaluate call.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The held-out split is passed as validation data, so its loss and accuracy
# are reported after every epoch; it is the same data evaluated below.
model.fit(X_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test, y_test_encoded))

test_loss, test_accuracy = model.evaluate(X_test, y_test_encoded)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

### **Save model**

In [None]:
import joblib

# Serialise the SVC currently bound to svm_model (the most recently fitted one)
# to disk so it can be reloaded later with joblib.load.
joblib.dump(value=svm_model, filename='context_matching_model.pkl')