In [27]:
import os
import os.path
import random
import itertools
from tqdm import tqdm
from typing import List
from string import punctuation
import seaborn as sns
from nltk.tokenize import WordPunctTokenizer
from scipy.spatial import distance
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
from matplotlib import pyplot as plt

In [28]:
BATCH_SIZE = 64
DEVICE = 'cpu'

In [29]:
class Preprocessor:
    def __init__(self, folders, train=True):
        self.train = train
        self._folders = folders
        self._embeddings_files = torch.tensor([])
        self._embeddings_plagiat1 = torch.tensor([])
        self._embeddings_plagiat2 = torch.tensor([])
        self.tk = WordPunctTokenizer()
        self._all_data = []
        self.files = []
        self.plagiat1 = []
        self.plagiat2 = []
        self.test_sample = torch.tensor([])
      
    def _clean_punctuation(self, punct_lst:str = punctuation):
        for letter in punct_lst:
            self.text = [line.replace(letter,  '')  for line in self.text]

    def _get_vec(self, txt: List[str]): 
        vec = torch.tensor([[0] * 300])
        for word in txt:
            vec += self._model[word]
        return vec/len(txt) if len(txt)!=0 else vec



     
    def _preprocess(self):
        if self.train:
          self.tmp_folder = self.files
          for idx,folder in enumerate(self._folders):
              for j,filename in enumerate(os.listdir(folder)):
                  full_filename = os.path.join(folder, filename)
                  self.text = open(full_filename, 'r+').readlines()
                  self.text = [line[:-2].lower() for line in self.text]
                  self._clean_punctuation()
                  self.text = [self.tk.tokenize(row) for row in self.text]
                  self._all_data += self.text
                  self.tmp_folder.append(self.text)
              if idx == 0:
                  self.tmp_folder = self.plagiat1
              elif idx == 1:
                  self.tmp_folder = self.plagiat2

          self.plagiat1 = self.plagiat1[1:]
          self.plagiat2 = self.plagiat2[1:]
          self.lst_folders = [self.files, self.plagiat1, self.plagiat2]
          self.lst_tensors = [self._embeddings_files, self._embeddings_plagiat1, self._embeddings_plagiat2]


          self._model = Word2Vec(
            sentences = self._all_data,
            min_count=1,
            window=5,
            size=300,
            negative=10,
            alpha=0.03,
            min_alpha=0.0007,
            sample=6e-5,
            sg=1,
            workers=4
            )
          
          self._model.save('Word2Vec.model')

          for idx, folder in enumerate(self.lst_folders):
            self.tmp_folder = torch.tensor([])
            for j,text in enumerate(folder):
                chain_text = list(itertools.chain(*text))
                self._emb = self._get_vec(chain_text)
                self.tmp_folder = torch.cat((self.tmp_folder,self._emb))
            self.lst_tensors[idx] = self.tmp_folder
        
        else:
            folder = self._folders[-1]
            self._model = Word2Vec.load('Word2Vec.model')
            for filename in os.listdir(folder):
                  full_filename = os.path.join(folder, filename)
                  self.text = open(full_filename, 'r+').readlines()
                  self.text = [line[:-2].lower() for line in self.text]
                  self._clean_punctuation()
                  self.text = [self.tk.tokenize(row) for row in self.text]
                  self.vec = self._get_vec(self.text)
                  self.test_sample = torch.cat((self.test_sample, self.vec))

    def preprocessing(self):
        self._preprocess()
        return self.lst_tensors





           

       



In [30]:
class TripletDataset(Dataset):
    def __init__(self, files:torch.Tensor, plagiat1:torch.Tensor, plagiat2:torch.Tensor):
        self.files = files
        self.plagiat = [plagiat1, plagiat2]
        self.length = len(files)
        self.anchor_box = torch.tensor([])
        self.positive_box = torch.tensor([])
        self.negative_box = torch.tensor([])
        self.make_samples()

    def make_samples(self):
        for idx, vec in enumerate(self.files):


            negative_files = torch.cat((self.files[:idx],self.files[idx+1:]))
            negative_plagiat1 = torch.cat((self.plagiat[0][:idx],self.plagiat[0][idx+1:]))
            negative_plagiat2 = torch.cat((self.plagiat[1][:idx],self.plagiat[1][idx+1:]))
            all_negative = torch.cat((negative_files,negative_plagiat1,negative_plagiat2))


            for plag in range(2):
                anchor = vec.reshape(1,-1)
                positive = self.plagiat[plag][idx].reshape(1,-1)
                negative_idx = random.randint(0,len(all_negative)-1)
                negative = all_negative[negative_idx].reshape(1,-1)
                self.anchor_box = torch.cat((self.anchor_box,anchor))
                self.positive_box = torch.cat((self.positive_box,positive))
                self.negative_box = torch.cat((self.negative_box,negative))


    def __len__(self):
        return len(self.anchor_box)
     
    def __getitem__(self, idx):
        return torch.vstack((self.anchor_box[idx], self.positive_box[idx], self.negative_box[idx]))







    

In [40]:
class Net(nn.Module):
    def __init__(self, dataset=None):
        super().__init__()
        self.fc1 = nn.Linear(300,512)
        self.fc2 = nn.Linear(512,1024)
        self.fc3 = nn.Linear(1024,4096)
        
    def forward(self,x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        return x
    



In [10]:
lst = ['/content/files','/content/plagiat1','/content/plagiat2']

In [11]:
pp = Preprocessor(lst)

In [12]:
files, plagiat1, plagiat2 = pp.preprocessing()

  vec += self._model[word]


In [13]:
ds = TripletDataset(files, plagiat1, plagiat2)

In [None]:
net = Net(dataset=ds)
errors = []
dl = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
length_sample = len(dl)
criterion = nn.TripletMarginWithDistanceLoss(distance_function=(lambda x,y : 1.0 -F.cosine_similarity(x,y)), 
                                             margin = 1.0)
optim = torch.optim.Adam(net.parameters(),lr=1e-4)
for epoch in tqdm(range(300)):
    all_error = 0
    for ind,batch in enumerate(dl):
        optim.zero_grad()

        anchor = batch[:,0,:].to(torch.float).reshape(1,batch.shape[0],batch.shape[2])
        positive = batch[:,1,:].to(torch.float).reshape(1,batch.shape[0],batch.shape[2])
        negative = batch[:,2,:].to(torch.float).reshape(1,batch.shape[0],batch.shape[2])
        anchor = net(anchor)
        positive = net(positive)
        negative = net(negative)

        loss = criterion(anchor, positive, negative)
        all_error += loss

        loss.backward()
        optim.step()

    

    if epoch%10 == 0:
      print(all_error/length_sample)
    errors.append(all_error/length_sample)




    
    




  0%|          | 1/300 [00:02<11:01,  2.21s/it]

tensor(0.9992, grad_fn=<DivBackward0>)


  4%|▎         | 11/300 [00:20<09:07,  1.89s/it]

tensor(0.9608, grad_fn=<DivBackward0>)


  7%|▋         | 21/300 [00:38<08:06,  1.74s/it]

tensor(0.9142, grad_fn=<DivBackward0>)


 10%|█         | 31/300 [00:57<08:13,  1.84s/it]

tensor(0.8866, grad_fn=<DivBackward0>)


 14%|█▎        | 41/300 [01:15<07:59,  1.85s/it]

tensor(0.8119, grad_fn=<DivBackward0>)


 17%|█▋        | 51/300 [01:33<07:05,  1.71s/it]

tensor(0.8249, grad_fn=<DivBackward0>)


 20%|██        | 61/300 [01:50<06:36,  1.66s/it]

tensor(0.7252, grad_fn=<DivBackward0>)


 24%|██▎       | 71/300 [02:09<08:19,  2.18s/it]

tensor(0.8036, grad_fn=<DivBackward0>)


 27%|██▋       | 81/300 [02:31<07:46,  2.13s/it]

tensor(0.7619, grad_fn=<DivBackward0>)


 30%|███       | 91/300 [02:54<08:07,  2.33s/it]

tensor(0.7734, grad_fn=<DivBackward0>)


 34%|███▎      | 101/300 [03:18<08:20,  2.51s/it]

tensor(0.6635, grad_fn=<DivBackward0>)


 37%|███▋      | 111/300 [03:41<07:06,  2.26s/it]

tensor(0.8038, grad_fn=<DivBackward0>)


 40%|████      | 121/300 [04:05<07:00,  2.35s/it]

tensor(0.6581, grad_fn=<DivBackward0>)


 44%|████▎     | 131/300 [04:29<06:58,  2.48s/it]

tensor(0.5303, grad_fn=<DivBackward0>)


 47%|████▋     | 141/300 [04:54<06:51,  2.59s/it]

tensor(0.6715, grad_fn=<DivBackward0>)


 50%|█████     | 151/300 [05:19<06:13,  2.50s/it]

tensor(0.6335, grad_fn=<DivBackward0>)


 51%|█████▏    | 154/300 [05:26<05:58,  2.46s/it]