# Necessary Information

## Dataset

### install dataset

In [None]:
# Install the `datasets` package, from which load_dataset is imported below
! pip install datasets

## Library

### install gensim library

In [None]:
# Install gensim library to load pretrained word embedding model 
! pip install gensim

In [None]:
import os
import json

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib
import matplotlib.pyplot as plt

from datasets import load_dataset

import seaborn as sns

## Load dataset

In [None]:
# Load the "few_rel" dataset in its "default" configuration;
# the 'train_wiki' split of this object is used further below
dataset = load_dataset("few_rel", "default")

## Constants used by functions

In [None]:
kernel = 'rbf'  # SVC kernel, one of ['linear', 'poly', 'rbf', 'sigmoid']
dimension = 300  # For glove model, dimension range is [50, 100, 200, 300]; for word2vec model, the dimension is 300
model_name = 'fasttext'

MAX_LEN = 40  # NOTE(review): not referenced by any cell visible in this notebook
pid2id = {}  # relation id (e.g. 'P931') -> discrete class index, filled by re_id

# Word embedding Model file path
file_path = 'wiki.en.bin'

# Trained model file path.
# Strictly keep this file name to store the best performed model, the fasttext model.
# os.path.join is used instead of a hard-coded '.\...' literal: the backslash formed an
# invalid escape sequence ('\s') and was only a path separator on Windows.
model_path = os.path.join('.', 'svm_model.pkl')

pid2id_path = os.path.join('.', 'svm_pid2id.json')

# For training other model, such as glove, using this path instead and change the kernel, dimension and model_name.
# The model storing path will automatically generated.
# model_path = './{}_{}_{}.pkl'.format(model_name, dimension, kernel)



In [None]:
# Download the model used for calculating word vectors
if not os.path.exists('wiki.en.zip'):
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
    !unzip wiki.en.zip

## Function

### restructure relation types

In [None]:
# Map relation types onto discrete values in range [0,63], numbered by the order
# in which they first appear in the dataset
## Ex. P931 → 0
def re_id(x):
    """Return the class index of relation ``x``, registering it in ``pid2id`` if it is new."""
    # setdefault only stores len(pid2id) (the next free index) when x is not yet a key
    return pid2id.setdefault(x, len(pid2id))

### restructure dataset

In [None]:
# Restructure dataset
## Only keep the useful information used in the following process
def restruct_dataset(x):
    """
    Flatten one raw dataset row into the fields used by the rest of the notebook.

    Args:
        x: a row with 'tokens' (list of strings), 'relation', and 'head' / 'tail'
           entries that each carry a 'text' field

    Returns:
        pd.Series with the keys:
            tokens: the sentence tokens joined by single spaces
            relation: the discrete class index given by re_id
            head: text of the head entity
            tail: text of the tail entity
    """
    return pd.Series({
        "tokens": ' '.join(x['tokens']),
        "relation": re_id(x['relation']),
        "head": x['head']['text'],
        "tail": x['tail']['text'],
    })

### word vector generation function

In [None]:
# Calculate word vectors for one text column, df['head'] or df['tail']
# use the model's get_word_vector function to get the vector of each word in each entity
# their average value is the word vector of this part
## each vector is 1*dimension
def getVector(df, label, result_label, my_model, dim=None):
    """
    Get the averaged word vector of each entity in the dataframe.

    Args:
        df: dataframe; it is modified in place
        label: name of the column holding the entity text
        result_label: name of the column that receives the vectors
                      (created if missing, replaced if present)
        my_model: the word embedding model; must provide get_word_vector(word)
        dim: length of the word vectors; defaults to the module-level `dimension`

    Returns:
        df: the same dataframe, with one list of `dim` floats per row in
            df[result_label]. Text without any word yields an all-zero vector.
    """
    size = dimension if dim is None else dim
    vectors = []
    # Iterate over the values themselves so the result does not depend on the index labels
    for text in df[label]:
        words = text.split()
        total = np.zeros(size)
        for word in words:
            try:
                total += my_model.get_word_vector(word)
            except KeyError:
                # Word unknown to the model: it adds nothing to the sum but still
                # counts in the average. Other errors (wrong model API, wrong vector
                # size) are no longer swallowed, as they would silently give zeros.
                pass
        if words:
            # Guard against dividing by zero for empty text
            total /= len(words)
        vectors.append(total.tolist())
    # Assign the whole column at once; aligning on df.index keeps rows in place
    df[result_label] = pd.Series(vectors, index=df.index)
    return df

# Dataset Processing

## Load chosen dataset and restructure it

In [None]:
# Take the 'train_wiki' split of the few_rel dataset as the source of both trainset and testset,
# keeping only the needed columns in this order
df = pd.DataFrame(dataset['train_wiki']).loc[:, ["tokens", "relation", 'head', 'tail']]

In [None]:
# Restructure the dataset row by row with restruct_dataset (defined above);
# this also fills pid2id through re_id
df1 = df.apply(restruct_dataset, axis=1)

In [None]:
# Persist the relation-id -> class-index mapping as JSON
with open(pid2id_path, 'w', encoding='utf-8') as out_file:
    json.dump(pid2id, out_file, ensure_ascii=False)

## Set two new columns to store the word vectors computed later

In [None]:
# Placeholder column x2: its initial content ([head, tail] per row) is arbitrary and is
# overwritten later with the word vectors of the entities in df1['tail']
# (presumably created up front so getVector has an existing column to write into - confirm)
df1['x2'] = df1.apply(lambda r: [r[col] for col in ('head', 'tail')], axis=1)

In [None]:
# Placeholder column x1: its initial content ("head tail" per row) is arbitrary and is
# overwritten later with the word vectors of the entities in df1['head']
# (presumably created up front so getVector has an existing column to write into - confirm)
df1['x1'] = df1.apply(lambda r: "{} {}".format(r['head'], r['tail']), axis=1)

# Generate Word Vectors

### load fasttext model

In [None]:
# Load the pretrained fastText model from file_path; it is passed to getVector below
import fasttext
mymodel=fasttext.load_model(file_path)

### generate word vectors and sentence vectors

In [None]:
# x1 <- averaged word vector of the head entity, x2 <- averaged word vector of the tail entity
df1 = getVector(df1, 'head', 'x1',mymodel)
df1 = getVector(df1, 'tail', 'x2', mymodel)

In [None]:
# Sentence vector: element-wise mean of the head vector (x1) and the tail vector (x2),
# stored in a new column, X
df1['X'] = df1.apply(
    lambda r: [(h + t) / 2 for h, t in zip(r['x1'], r['x2'])],
    axis=1,
)

# Prepare Trainset and Testset

## X

In [None]:
# Feature matrix as a plain list with one sentence vector per row
X = df1['X'].tolist()

## y

In [None]:
# Target labels: the discrete relation index assigned by re_id
y = df1['relation']

## Split trainset and testset

In [None]:
# Hold out 20% of the rows as testset; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# SVM Model

## Create model

In [None]:
# Create the (untrained) SVM classifier with the kernel chosen in the constants cell;
# probability estimates and verbose training output are switched on
model = SVC(kernel=kernel, probability=True, verbose=True)

## Train model

In [82]:
# Train model
# NOTE(review): per the scikit-learn docs fit() returns the fitted estimator itself,
# so `history` is presumably the same object as `model`, not a training history - confirm
history = model.fit(X_train, y_train)

## Save model

In [None]:
# Save the trained model to the local path set in model_path
joblib.dump(model, model_path)

## Load model

In [None]:
# Load the trained model back from model_path for prediction
# (this rebinds `model` to the object read from disk)
model = joblib.load(model_path)

## Prediction

In [None]:
# Predict the relation index for every testset vector
y_pred = model.predict(X_test)

## Evaluation

In [None]:
# Accuracy, precision, recall, f1-score
print("Accuracy:", accuracy_score(y_test, y_pred))
# The report is a multi-line table; start it on its own line so that its header row
# stays aligned with the columns below it
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true labels (y_test) against predicted labels (y_pred)
confusion_mat = confusion_matrix(y_test, y_pred) #from sklearn.metrics

In [None]:
# Display the raw confusion matrix as the cell output
confusion_mat

In [None]:
# Draw the confusion matrix as an annotated heatmap with integer cell labels
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues', ax=ax)
plt.show()

In [None]:
# Precision matrix: every column of the confusion matrix divided by its column sum.
# The sums are taken over axis 0, and the diagonal then holds the per-class precision.
sum_column = np.sum(confusion_mat, axis=0)
# A class that was never predicted has a column sum of 0; leave its column at 0
# instead of dividing by zero, which would fill it with NaN
precision_mat = np.divide(
    confusion_mat,
    sum_column,
    out=np.zeros(confusion_mat.shape, dtype=float),
    where=sum_column != 0,
)

In [None]:
# Precision matrix in heatmap form
# (plots precision_mat; the confusion matrix itself is already drawn in the earlier heatmap cell)
plt.figure(figsize=(50, 50))
sns.heatmap(precision_mat, annot=True, cmap='Blues')
plt.show()