<a href="https://colab.research.google.com/github/Pavun-KumarCH/Research-Notebooks/blob/main/Anomaly_Detection_VDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Anomaly Detection

In [None]:
#@title requirements
%pip install --q pinecone sentence-transformers

In [None]:
import os
import time
import torch
from torch import nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, InputExample, models, util, losses

import warnings
warnings.filterwarnings('ignore')

# Environment: read the API keys through Colab's `userdata` helper instead of hard-coding them
from google.colab import userdata
PINECONE_API_KEY, OPENAI_API_KEY = (
    userdata.get("PINECONE_API_KEY"),
    userdata.get("OPENAI_API_KEY"),
)


In [None]:
# Set up the Pinecone client
pinecone = Pinecone(api_key = PINECONE_API_KEY)
INDEX_NAME = "ad-ai"

# Delete the index if it already exists so every run starts from an empty index.
# list_indexes() yields index descriptions rather than bare strings, so the name
# has to be compared against .names(). The previous test
# `INDEX_NAME in pinecone.list_indexes()` never matched, which left a stale index
# in place and made the create_index call below conflict on re-runs.
if INDEX_NAME in pinecone.list_indexes().names():
  pinecone.delete_index(INDEX_NAME)

# Create a serverless index; `dimension` has to match the size of the vectors
# that will be upserted (the embedding model in this notebook emits 256 dims).
pinecone.create_index(
    name = INDEX_NAME,
    dimension = 256,
    spec = ServerlessSpec(cloud = 'aws', region = 'us-east-1'),
)

# Handle used for subsequent operations against the index
index = pinecone.Index(INDEX_NAME)

In [None]:
# Load the Dataset
!wget -q --show-progress -O training.tar.zip "https://www.dropbox.com/scl/fi/rihfngx4ju5pzjzjj7u9z/lesson6.tar.zip?rlkey=rct9a9bo8euqgshrk8wiq2orh&dl=1"

!tar -xzvf training.tar.zip

!tar -xzvf lesson6.tar

In [None]:
!head -5 sample.log

In [None]:
!head -5 training.txt

# Check CUDA and Set Up the Model
We build a sentence-transformers model on top of `bert-base-uncased` that maps sentences to a 256-dimensional dense vector space.

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Token-level encoder. bert-base-uncased has only 512 position embeddings, so the
# previous max_seq_length of 786 allowed inputs of 513-786 tokens to reach the
# model untruncated, where they would fail in the forward pass. 512 is the
# largest value the checkpoint supports.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length = 512)

# Pool the token embeddings into a single fixed-size sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Project the pooled vector down to 256 dimensions with a Tanh activation
dense_model = models.Dense(in_features = pooling_model.get_sentence_embedding_dimension(), out_features = 256,
                           activation_function = nn.Tanh())

# Chain the three stages into one sentence-embedding model on the chosen device
model = SentenceTransformer(modules = [word_embedding_model, pooling_model, dense_model], device = device)

# Display which device was selected
device

# Train the Model

In [None]:
# Build the training pairs. Every non-blank line of training.txt is split on '^'
# into exactly three fields: first text, second text, and a numeric label.
train_examples = []
with open('./training.txt', 'r') as training_file:
  for raw_line in training_file:
    record = raw_line.strip()
    if not record:
      continue  # ignore blank lines
    first_text, second_text, score = record.split('^')
    train_examples.append(InputExample(texts = [first_text, second_text], label = float(score)))

# Training setup: warm-up step count, shuffled mini-batches of 16 pairs, and a
# cosine-similarity loss bound to the model defined earlier in the notebook
warmups_steps = 100
train_dataloader = DataLoader(train_examples, batch_size = 16, shuffle = True)
train_loss = losses.CosineSimilarityLoss(model)



> **Note** (`load_pretrained_model = True`): We've saved a trained model and load it here so that you can see the results faster. Once you've done an initial run, you may set `load_pretrained_model` to `False` to train the model yourself. Training can take some time to finish, depending on the value you set for `epochs`.



In [None]:
# NOTE(review): the next import looks like an accidental editor auto-import; `L` is
# not used in the visible cells. Remove it once confirmed nothing later relies on it.
from re import L
import pickle

# True  -> load the pre-trained checkpoint from ./data/pretrained_model
# False -> fine-tune `model` in this cell (slow)
load_pretrained_model = True

if load_pretrained_model:
  # SECURITY: pickle.load can execute arbitrary code embedded in the file;
  # only unpickle files obtained from a source you trust.
  # The `with` block closes the handle even when unpickling raises.
  with open("./data/pretrained_model", "rb") as trained_model_file:
    db = pickle.load(trained_model_file)
  # Previously the unpickled object was stored in `db` and never used, so `model`
  # stayed untrained. Adopt it only when it really is a SentenceTransformer;
  # the pickle's contents are not visible here, so any other payload is left in `db`.
  if isinstance(db, SentenceTransformer):
    model = db
else:
  # Reuse the warm-up setting defined with the dataloader instead of repeating the literal
  model.fit(train_objectives = [(train_dataloader, train_loss)], epochs = 16, warmup_steps = warmups_steps)

# Collect every non-blank line of sample.log, whitespace-trimmed, in file order
samples = []
with open('sample.log', 'r') as log_file:
  samples.extend(entry for entry in map(str.strip, log_file) if entry)