## Gerekliliklerin Projeye Dahil Edilmesi

In [None]:
# Install the third-party packages used by the paraphrasing cells. pip output
# is redirected to /dev/null, and "[OK]" is printed unconditionally: it does
# not confirm that the installations actually succeeded.
!pip install transformers &> /dev/null
!pip install sentencepiece &> /dev/null
!pip install sentence_splitter &> /dev/null
print("[OK]")

[OK]


In [None]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_splitter import SentenceSplitter, split_text_into_sentences
import copy

In [None]:
import torch
import torch.nn as nn
import torch.utils.data
import time
import torch.nn.functional as F
import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field, LabelField 
from torchtext.legacy.data import TabularDataset, BucketIterator

In [None]:
# Authenticate the Colab user and build the PyDrive client `drive` from the
# application-default credentials. Later cells use `drive.CreateFile(...)`
# to download files by their Drive id.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
from google.colab import files

## Veri Kümesinden Örnek Seçimi

In [None]:
class SampleSelection(object):
  """Draw a random subset of a dataset that keeps its label distribution.

  Every label receives ``share_of_label * min_sample_count + 1`` rows
  (truncated to an integer), so the subset always holds at least one row per
  label and is slightly larger than ``min_sample_count``.
  """

  def __init__(self, min_sample_count=1000):
    """Store the approximate number of rows the subset should contain."""
    self.min_sample_count = min_sample_count

  def generate_sub_set(self, ds):
    """Print a summary of the label distribution and return the sampled subset.

    Args:
      ds: object exposing ``dataframe`` (pandas DataFrame), ``label`` and
        ``text`` (column names) and ``name`` (used only in the report).

    Returns:
      A new DataFrame holding the sampled rows of every label.

    Raises:
      ValueError: propagated from ``DataFrame.sample`` when a label has fewer
        rows than its computed quota (sampling is without replacement).
    """
    # Plain column lookup instead of eval() on a generated attribute string:
    # it also works for column names that are not valid identifiers or that
    # collide with DataFrame attributes.
    label_count = ds.dataframe[ds.label].value_counts()
    keys = label_count.keys()
    counts = np.asarray(label_count) / np.sum(label_count) * self.min_sample_count + 1
    counts = np.asarray(counts, dtype=int)
    print(f"Dataset Name     : {ds.name}\n"+
          f"Total count      : {np.sum(label_count)}\n"+
          f"Distribution     : {np.asarray(label_count)}\n"+
          f"Current count    : {np.sum(counts)}\n"+
          f"New Distribution : {counts}\n"+
          f"Label Names      : {list(keys)}\n"+
          f"Num of Label     : {len(keys)}\n\n")
    return self._generate(ds, keys, counts)

  def _generate(self, ds, keys, counts):
    """Sample ``counts[i]`` rows (without replacement) for every ``keys[i]``."""
    leading = [ds.text, ds.label]
    samples = [
        ds.dataframe[ds.dataframe[ds.label] == key].sample(int(count), replace=False)
        for key, count in zip(keys, counts)
    ]
    if not samples:
      return pd.DataFrame(columns=leading)
    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0, and quadratic because each call copied the whole frame).
    merged = pd.concat(samples, ignore_index=True)
    # Keep the text/label columns first, as the append-based version did.
    others = [column for column in merged.columns if column not in leading]
    return merged.reindex(columns=leading + others)
    


## Veri Kümelerinin Normalizasyonu

In [None]:
class DatasetBase(object):
  """Description of one CSV dataset plus the frames derived from it.

  Attributes:
    name: CSV file name; also used as the local path for reading/downloading.
    text: name of the column holding the input text.
    label: name of the column holding the class label.
    file_id: Google Drive id used by ``download_dataframe``.
    dataframe: the currently loaded pandas DataFrame (or None).
    coding: text encoding passed to ``pd.read_csv``.
  """

  def __init__(self, name, text, label, file_id, dataframe=None, coding="utf-8"):
    self.name = name
    self.text = text
    self.label = label
    self.file_id = file_id
    self.dataframe = dataframe
    self.coding = coding

    # Slots filled in later by the set-preparation cell.
    self.train_set = None
    self.test_set = None
    self.paraphrased_t5_set = None
    self.paraphrased_pg_set = None
    self.norm_set = None

  def set_dataframe(self, dataframe):
    """Replace the loaded DataFrame."""
    self.dataframe = dataframe

  def read_dataframe(self):
    """Load ``self.name`` from the working directory using ``self.coding``."""
    self.dataframe = pd.read_csv(self.name, encoding=self.coding)

  def save_dataframe(self, name=None, prefix=None):
    """Write the DataFrame to ``<prefix>_<name>`` in the working directory.

    ``name`` defaults to ``self.name`` and ``prefix`` to ``"Melora_Hardin"``.
    """
    # Identity checks: `== None` would go through the argument's __eq__.
    if name is None:
      name = self.name
    if prefix is None:
      prefix = "Melora_Hardin"
    self.dataframe.to_csv(f"{prefix}_{name}")

  def download_dataframe(self, drive):
    """Fetch the file with id ``self.file_id`` via ``drive`` and load it.

    ``drive`` must provide ``CreateFile({'id': ...})`` returning an object
    with ``GetContentFile(path)`` (the PyDrive client used in this notebook).
    """
    downloaded = drive.CreateFile({'id': self.file_id})
    downloaded.GetContentFile(self.name)
    self.read_dataframe()

class DatasetNormalization(object):
  """Reduce a dataset to two columns with uniform names (text, label)."""

  def __init__(self, text="text", label="label"):
    self.text = text
    self.label = label

  def get_features(self):
    """Return the normalized ``(text, label)`` column names."""
    return self.text, self.label

  def get_normalized_ds(self, dataset):
    """Return a new frame holding only the text and label columns, renamed.

    ``dataset`` must expose ``dataframe``, ``text`` and ``label``; its own
    DataFrame is left untouched.
    """
    source_columns = [dataset.text, dataset.label]
    mapping = {dataset.text: self.text, dataset.label: self.label}
    renamed = dataset.dataframe.filter(items=source_columns).rename(columns=mapping)
    return renamed[[self.text, self.label]]


## Eğitim - Test Veri Kümelerinin Tanımlanması  

In [None]:
class GenerateTrainTestSet(object):
  """Build the train/test split and the paraphrase subsets of one dataset.

  The constructor reads four CSV files from the working directory, all named
  after ``ds.name``: ``ds_<name>`` (full dataset), ``norm_<name>`` (the sampled
  subset), ``t5_<name>`` and ``pg_<name>`` (paraphrase files).
  """

  def __init__(self,ds, norm_text="text", norm_label="label"):
    self.ds = ds
    self.ds_df = pd.read_csv(f"ds_{ds.name}", encoding=ds.coding)
    self.ds.dataframe = self.ds_df
    self.norm_df = pd.read_csv(f"norm_{ds.name}")
    self.t5_df = pd.read_csv(f"t5_{ds.name}")
    self.pg_df = pd.read_csv(f"pg_{ds.name}")
    self.norm_text = norm_text
    self.norm_label = norm_label

  def _drop_texts(self, frame, texts):
    """Return ``frame`` without the rows whose text occurs in ``texts``."""
    # NaN is removed from the lookup values because the row-by-row `==`
    # filter this replaces never matched NaN either.
    return frame[~frame[self.norm_text].isin(texts.dropna())]

  def get_normal_set(self, per=0.4):
    """Split the dataset into train and test frames.

    Steps: normalize the column names, remove the norm subset, sample a
    stratified test set of roughly ``per`` times the remaining rows, remove
    those rows as well and finally add the norm subset back to the train set.

    Side effects: ``self.ds.dataframe`` becomes the full normalized frame,
    ``self.ds.text`` / ``self.ds.label`` become the normalized column names
    and ``self.ds_df`` becomes the returned train frame.

    Returns:
      ``(train_df, test_df, norm_df)``.
    """
    normalization = DatasetNormalization(text=self.norm_text, label=self.norm_label)
    self.ds_df = normalization.get_normalized_ds(self.ds)
    self.ds.dataframe = self.ds_df
    self.ds.text = self.norm_text
    self.ds.label = self.norm_label

    # Remove the norm subset from the candidate rows.
    remaining = self._drop_texts(self.ds_df, self.norm_df[self.norm_text])
    print("Training Set")

    # Sample the test set from the rows that are left, not from the full
    # frame: previously rows of the norm subset could be drawn into the test
    # set and were then added back to the train set, leaking test rows into
    # training. A shallow copy keeps self.ds.dataframe untouched.
    pool = copy.copy(self.ds)
    pool.dataframe = remaining
    test_set = SampleSelection(min_sample_count=remaining.shape[0] * per).generate_sub_set(pool)

    # Remove the test rows from the train candidates, then add the norm subset.
    remaining = self._drop_texts(remaining, test_set[self.norm_text])
    self.ds_df = pd.concat(
        [remaining, self.norm_df[[self.norm_text, self.norm_label]]],
        ignore_index=True)
    print(f"Train Set : {self.ds_df.shape[0]}\n"+
          f"Test Set  : {test_set.shape[0]}\n"+
          f"Norm Set  : {self.norm_df.shape[0]}\n")
    return self.ds_df, test_set, self.norm_df

  def get_paraphrased_set(self, per=(0.2, 0.5, 0.7, 1, 2, 5), k=5, prefix="t5"):
    """Build one paraphrase subset per ratio in ``per``.

    ``<prefix>_<ds.name>`` is read as consecutive groups of ``k`` rows (one
    group per source sentence); an incomplete trailing group is ignored.

    * ratio ``p >= 1``: the first ``p`` rows of every group.
    * ratio ``p < 1``: a stratified sample, sized ``len(norm_df) * p``, of the
      first row of every group.

    Returns:
      A list of DataFrames: all ratios below one first (in the order given),
      followed by all ratios of one and above. Callers that pair the result
      with ``per`` by position must therefore pass ``per`` in ascending order.
    """
    columns = [self.norm_text, self.norm_label]
    paraphrased_dataset = pd.read_csv(f"{prefix}_{self.ds.name}")

    usable_rows = (paraphrased_dataset.shape[0] // k) * k
    grouped = paraphrased_dataset.iloc[:usable_rows][columns].reset_index(drop=True)
    # Position of every row inside its group of k paraphrases.
    rank = np.arange(usable_rows) % k

    less_per = [p for p in per if p < 1]
    greater_per = [p for p in per if p >= 1]

    # Ratios of one and above: the first p paraphrases of each sentence.
    greater_one = [grouped[rank < p].reset_index(drop=True) for p in greater_per]
    one = grouped[rank == 0].reset_index(drop=True)

    # Ratios below one: sample from the first paraphrase of each sentence.
    # Only attributes are rebound, so a shallow copy of the wrapper suffices.
    source = copy.copy(self.ds)
    source.dataframe = one
    source.text = self.norm_text
    source.label = self.norm_label

    less_one = []
    for p in less_per:
      print(p, source.dataframe.shape[0])
      less_one.append(
          SampleSelection(min_sample_count=self.norm_df.shape[0]*p).
          generate_sub_set(source)
      )

    output = less_one + greater_one
    print("Percentages and Numbers of Paraphrased Rows")
    # Label every subset with the ratio it was built for instead of indexing
    # `per` by position, which mislabels the rows when `per` is not ascending.
    for p, subset in zip(less_per + greater_per, output):
      print(f"\t{p} -> {subset.shape[0]}")
    return output

    

## T5 ve Pegasus Mimarilerinin Tanımlanması

In [None]:
class Paraphrase_T5(object):
  '''
    Paraphrase generator built on the "hetpandya/t5-small-tapaco" checkpoint.

    source: https://huggingface.co/hetpandya/t5-small-tapaco
  '''
  def __init__(self, device="cpu"):
    self.tokenizer = T5Tokenizer.from_pretrained("hetpandya/t5-small-tapaco")
    # The model has to live on the same device the inputs are moved to in
    # get(); otherwise any device other than "cpu" fails inside generate().
    self.model = T5ForConditionalGeneration.from_pretrained("hetpandya/t5-small-tapaco").to(device)
    self.device = device


  def get(self, sentence, prefix="paraphrase: ", n_predictions=5, 
                      top_k=120, max_length=256, ):
    '''
      Sample paraphrases of `sentence`.

      Returns a list of at most `n_predictions` distinct strings; candidates
      equal to the input (ignoring case) are left out, so the list may be
      shorter than requested or empty.
    '''
    text = prefix + sentence + " </s>"
    # padding="max_length" is the current spelling of the deprecated
    # pad_to_max_length=True.
    encoding = self.tokenizer.encode_plus(text, padding="max_length", return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)

    model_output = self.model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        do_sample=True,
        max_length=max_length,
        top_k=top_k,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=n_predictions,
    )
    return self._get(sentence, model_output, n_predictions)

  def _get(self, sentence, model_output, n):
    '''
      Decode `model_output` and keep the first `n` usable candidates.

      A candidate is usable when it differs from `sentence` (case-insensitive)
      and has not been collected yet (exact match).
    '''
    outputs = []
    # A single pass is enough: decoding the same sequences again can only
    # yield candidates that are already collected or already rejected.
    for output in model_output:
      generated_sent = self.tokenizer.decode(
          output, skip_special_tokens=True, clean_up_tokenization_spaces=True
      )
      if (
          generated_sent.lower() != sentence.lower()
          and generated_sent not in outputs
      ):
        outputs.append(generated_sent)
      if len(outputs) == n: return outputs
    return outputs

In [None]:
class Paraphrase_Pegasus(object):
  '''
    Sentence-by-sentence paraphraser built on 'tuner007/pegasus_paraphrase'.

    source: https://analyticsindiamag.com/how-to-paraphrase-text-using-pegasus-transformer/
  '''
  def __init__(self, device="cpu"):
    checkpoint = 'tuner007/pegasus_paraphrase'
    self.tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    self.model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(device)
    self.splitter = SentenceSplitter(language='en')
    self.device = device

  def get(self, sentence, n_predictions=5, num_beams=10, max_length=256, ):
    '''
      Paraphrase `sentence` and return `n_predictions` variants.

      The input is split into sentences; each one is paraphrased on its own
      and the i-th candidate of every sentence is appended to the i-th
      variant. Every appended piece is followed by one space, so each variant
      ends with a trailing space.
    '''
    variants = [[] for _ in range(n_predictions)]
    for piece in self.splitter.split(sentence):
      batch = self.tokenizer(
          [piece], truncation=True, padding='longest',
          max_length=max_length, return_tensors="pt",
      ).to(self.device)
      translated = self.model.generate(
          **batch, max_length=max_length, num_beams=num_beams,
          num_return_sequences=n_predictions, temperature=1.5,
      )
      candidates = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
      for slot, candidate in enumerate(candidates):
        variants[slot].append(candidate + " ")
    return ["".join(parts) for parts in variants]

  def _get(self):
    '''Placeholder; intentionally does nothing.'''
    pass

## Yapay Örnek Üretimi

In [None]:
class ParaphraseBase(object):
  """Run several paraphrasing architectures over every row of a dataset.

  Each architecture must provide ``get(sentence, n_predictions=k)`` returning
  a list of strings.
  """

  def __init__(self, architectures, k=5):
    self.k = k
    self.architectures = architectures

  def generate(self, ds):
    """Paraphrase every row of ``ds.dataframe`` with every architecture.

    A source row is kept only when all architectures succeed for it, so the
    returned frames always cover the same source sentences.

    Returns:
      A list with one DataFrame per architecture (same order), with columns
      ``[ds.text, ds.label]``; each paraphrase carries its source row's label.
    """
    columns = [ds.text, ds.label]
    collected = [[] for _ in self.architectures]
    total_row = ds.dataframe[ds.label].count() + 1
    for curr_row, row in ds.dataframe.iterrows():
      print(f"\t[{curr_row}/{total_row}]: %{curr_row/total_row*100}")
      per_arch = self._paraphrase_row(curr_row, row[ds.text])
      if per_arch is None:
        # Nothing was stored for this row yet, so there is nothing to roll
        # back. The former "append pseudo rows, then delete the last k rows"
        # rollback removed rows of earlier sentences whenever an architecture
        # returned fewer than k phrases.
        continue
      for rows, phrases in zip(collected, per_arch):
        rows.extend({ds.text: phr, ds.label: row[ds.label]} for phr in phrases)
    # Build every frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return [pd.DataFrame(rows, columns=columns) for rows in collected]

  def _paraphrase_row(self, curr_row, sentence):
    """Return one phrase list per architecture, or None if any of them fails."""
    results = []
    for arch in self.architectures:
      try:
        results.append(arch.get(sentence, n_predictions=self.k))
      except Exception:
        # `except Exception` instead of a bare except, so KeyboardInterrupt
        # can still stop a long generation run.
        print(f"[X] Paraphrase Error in {curr_row}")
        return None
    return results

  def _pseudo_phrase(self):
    """Return ``k`` placeholder phrases (no longer used by ``generate``)."""
    return ["Pseudo"] * self.k

  def _delete_last_sentences(self, df_list):
    """Drop the last ``k`` rows of every frame in place and return the list.

    No longer used by ``generate``.
    """
    for df in df_list:
      df.drop(df.tail(self.k).index, inplace=True)
    return df_list




## Eğitim ve Test Süreci

### Tokenizer

In [None]:
from nltk.tokenize import TreebankWordTokenizer
import re
import nltk

# Module-level NLP helpers shared by the tokenization functions of this cell.
# `tokenizer` is created here but is not referenced by the tokenization
# functions visible in this section (they split on whitespace).
tokenizer = TreebankWordTokenizer()

# English stop-word set; the corpus is downloaded on first use.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Porter stemmer used by text_tokenization / cnn_tokenization.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# WordNet lemmatizer used by only_lemmatization.
# NOTE(review): only the 'stopwords' corpus is downloaded in this cell; the
# lemmatizer presumably also needs the 'wordnet' corpus - confirm.
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

# Cleaning patterns shared by the three tokenizers below, compiled once.
# Only ASCII punctuation is listed: the pattern is applied after the
# ascii encode/decode step, so non-ASCII characters are already gone.
_PUNCTUATION_RE = re.compile(r"""[.,'"!?|()\[\]<>`]""")
# `A-Z`, not `A-z`: the latter also covers the characters between 'Z' and 'a'
# in ASCII, which let backslash, '^' and '_' pass as if they were letters.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]+")
_MULTI_SPACE_RE = re.compile(r" +")


def _clean_text(text):
    """Expand common contractions and reduce `text` to lower-case ASCII words."""
    text = text.replace("n't", ' not')
    text = text.replace("'ve", ' have')
    text = text.replace("'m", ' am')
    text = text.replace("’s", ' ')
    text = text.replace("'s", ' ')
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = _PUNCTUATION_RE.sub("", text)   # punctuation is removed, not spaced
    text = _NON_LETTER_RE.sub(" ", text)   # only English letters remain
    text = _MULTI_SPACE_RE.sub(" ", text)  # collapse runs of spaces
    return text.lower()


def _content_words(text):
    """Return the cleaned words of `text` that are not stop words."""
    return [word for word in _clean_text(text).split() if word not in stop_words]


def text_tokenization(text):
    """Return `text` as one string of stemmed, stop-word-free words."""
    return " ".join(stemmer.stem(word) for word in _content_words(text))


def cnn_tokenization(text):
    """Return `text` as a list of stemmed, stop-word-free words."""
    return [stemmer.stem(word) for word in _content_words(text)]


def only_lemmatization(text):
    """Return `text` as one string of verb-lemmatized, stop-word-free words."""
    return " ".join(lemma.lemmatize(word, pos='v') for word in _content_words(text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Model

In [None]:
class CNN(nn.Module):
    """Text classifier: embedding, parallel convolutions, max-pooling, linear.

    One Conv2d per entry of ``filter_sizes`` slides over the token axis with a
    kernel spanning the whole embedding width; every feature map is max-pooled
    over the sequence and the pooled vectors are concatenated for the final
    linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        branches = []
        for height in filter_sizes:
            branches.append(
                nn.Conv2d(in_channels=1,
                          out_channels=n_filters,
                          kernel_size=(height, embedding_dim)))
        self.convs = nn.ModuleList(branches)
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class scores for a batch of token ids.

        The first two axes are swapped after the embedding lookup, so ``text``
        is expected with the sequence axis first and the batch axis second.
        """
        # Swap the first two axes and add the single input channel for Conv2d.
        embedded = self.embedding(text).permute(1, 0, 2).unsqueeze(1)
        pooled = []
        for conv in self.convs:
            feature_map = F.relu(conv(embedded)).squeeze(3)
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))

# Genel Metotlar

def categorical_accuracy(preds, y):
    """Return the share of rows whose highest-scoring class equals the target.

    ``preds`` holds one row of class scores per sample and ``y`` the target
    class indices; the result is a zero-dimensional float tensor.
    """
    predicted = preds.argmax(dim=1, keepdim=True)
    hits = (predicted == y.view_as(predicted)).sum()
    return hits.float() / y.shape[0]

def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Each batch must expose ``text`` as a 2-tuple whose first item is fed to
    the model, and ``label`` as the targets.

    Returns:
      ``(mean_loss, mean_accuracy)``, both averaged over the batches.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()
        tokens, _ = batch.text
        logits = model(tokens)
        batch_loss = criterion(logits, batch.label)
        batch_acc = categorical_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def train_accuracy(model, iterator, optimizer, criterion):
    """Run one optimisation pass without switching the model to train mode.

    Identical to ``train`` except that ``model.train()`` is not called, so the
    model stays in whatever mode the caller left it in. Despite the name, the
    weights are still updated: every batch is back-propagated and the
    optimizer is stepped.

    Returns:
      ``(mean_loss, mean_accuracy)``, both averaged over the batches.
    """
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()
        tokens, _ = batch.text
        logits = model(tokens)
        batch_loss = criterion(logits, batch.label)
        batch_acc = categorical_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` in eval mode with gradients disabled.

    Returns:
      ``(mean_loss, mean_accuracy)``, both averaged over the batches.
    """
    model.eval()
    loss_sum = 0
    acc_sum = 0

    with torch.no_grad():
        for batch in iterator:
            tokens, _ = batch.text
            logits = model(tokens)
            loss_sum += criterion(logits, batch.label).item()
            acc_sum += categorical_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (seconds) into whole (min, sec)."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    return minutes, int(elapsed - 60 * minutes)

### Train ve Test

In [None]:
def perform_train(N_EPOCHS, model, train_iterator, optimizer, criterion, test_iterator):
  """Train for `N_EPOCHS` epochs and evaluate on the test set after each one.

  Prints the duration, loss and accuracy of every epoch.

  Returns:
    ``(train_acc_list, test_acc_list)``: one accuracy per epoch, in percent.
  """
  train_acc_list, test_acc_list = [], []

  for epoch in range(N_EPOCHS):
    # Only the training pass is timed; evaluation is excluded.
    start_time = time.time()
    train_loss, train_acc = train(model,
                                  train_iterator,
                                  optimizer,
                                  criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    test_loss, test_accuracy = evaluate(model, test_iterator, criterion)

    train_acc_list.append(train_acc*100)
    test_acc_list.append(test_accuracy*100)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Accuracy: {train_acc*100:.2f}%')
    print(f'\tTest Loss :{test_loss} | Test Accuracy : {test_accuracy}')
  return train_acc_list, test_acc_list


## Eğitim ve Test Süreci : Business Base

In [None]:
from sklearn.model_selection import train_test_split

class ModelTraining(object):
  """Train the CNN under several orderings of original and paraphrased data.

  All arguments and dataset names are CSV file names relative to the working
  directory. Constructing the object already trains a baseline on the norm
  subset.

  NOTE(review): every call to ``_get_dataset_to_be_used`` creates new TEXT and
  LABEL fields and rebinds ``self.TEXT`` / ``self.LABEL``. ``_train_test``
  therefore sizes the model from the vocabulary of the dataset loaded most
  recently, while the test set keeps the fields it was loaded with. Token and
  label indices presumably do not line up between train and test sets -
  confirm before relying on the reported test accuracy.
  """

  def __init__(self, train, test, norm):
    # Epochs per training stage: a schedule with N consecutive stages uses
    # epoch_N, so every schedule trains for 12 epochs in total.
    self.epoch_3 = 4
    self.epoch_2 = 6
    self.epoch_1 = 12

    self.test = self._get_dataset_to_be_used(test)

    # The name of the full train set is only printed; no model is trained on it.
    self._print(train)

    self._print(norm)
    self.norm = self._get_dataset_to_be_used(norm)
    self._train_test(self.norm, self.test, self.epoch_1, norm)

    self.norm_name = norm

  def start_analysis_for_a_percentage(self, name, p):
    """Run the four training schedules for the paraphrase file `name`.

    Schedules: norm then paraphrases ("ÖNCE"), paraphrases then norm
    ("SONRA"), both mixed in one set ("KARIŞIK"), and half of the paraphrases,
    then norm, then the other half ("ARADA"). All schedules except the mixed
    one run only when ``p >= 0``.
    """
    # Norm first, paraphrases second.
    self._print("Önce Norm : "+name)
    if p>=0:
      self._train_in_stages(
          [(self.norm_name, name + " ÖNCE 1"),
           (name, name + " ÖNCE 2")],
          self.epoch_2)

    # Paraphrases first, norm second.
    self._print("Sonra Norm : "+name)
    if p>=0:
      self._train_in_stages(
          [(name, name + " SONRA 1"),
           (self.norm_name, name+ " SONRA 2")],
          self.epoch_2)

    # Norm and paraphrases mixed into a single training set.
    self._print("Karışık : "+name)
    norm = pd.read_csv(self.norm_name)[["text","label"]]
    para = pd.read_csv(name)[["text","label"]]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    norm = pd.concat([norm, para], ignore_index=True)
    norm.to_csv("temp_karisik.csv")
    self._train_in_stages([("temp_karisik.csv", name  + " KARIŞIK")], self.epoch_1)

    # Half of the paraphrases, then norm, then the other half.
    self._print("Arada : "+name)
    if p>=0:
      para = pd.read_csv(name)[["text","label"]]
      para1, para2 = train_test_split(para, test_size=0.5)
      para1.to_csv("arada_ilk.csv")
      para2.to_csv("arada_son.csv")
      self._train_in_stages(
          [("arada_ilk.csv", name + " ARADA 1"),
           (self.norm_name, name + " ARADA 2"),
           ("arada_son.csv", name + " ARADA 3")],
          self.epoch_3)

  def _train_in_stages(self, stages, epochs):
    """Train on each ``(csv_name, run_label)`` stage in order.

    The model returned by one stage is handed to the next one as
    ``pretrained_model``. Returns the model of the last stage.
    """
    model = None
    for csv_name, run_label in stages:
      dataset = self._get_dataset_to_be_used(csv_name)
      model = self._train_test(dataset, self.test, epochs, run_label, pretrained_model=model)
    return model

  def _train_test(self, train, test, epoch, name, pretrained_model=None):
    """Build a CNN, train it for `epoch` epochs and print its accuracies.

    When `pretrained_model` is given, its convolution and linear layers are
    reused (the same module objects, not copies); the embedding layer is
    always created anew. Returns the trained model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_iterator, test_iterator = data.BucketIterator.splits(
      (train, test), sort = False, repeat = False,
      sort_within_batch =False, batch_size = 32, device = device)

    INPUT_DIM = len(self.TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [3,4,5]
    OUTPUT_DIM = len(self.LABEL.vocab)
    DROPOUT = 0.5
    PAD_IDX = self.TEXT.vocab.stoi[self.TEXT.pad_token]

    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
    if pretrained_model is not None:
      # Both models are built with the same FILTER_SIZES, so the convolution
      # branches map one-to-one.
      for index, conv in enumerate(pretrained_model.convs):
        model.convs[index] = conv
      model.fc = pretrained_model.fc

    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    criterion = criterion.to(device)

    train_acc, test_acc = perform_train(epoch, model, train_iterator, optimizer, criterion, test_iterator)
    print(f"\n# Generated for {name}")
    print("train_acc = ",train_acc)
    print("test_acc = ",test_acc)
    print("")

    return model

  def _get_dataset_to_be_used(self, name):
    """Load ``./<name>`` as a TabularDataset and build its vocabularies.

    Rebinds ``self.TEXT`` and ``self.LABEL`` to the newly created fields and
    prints the first example and the 30 most frequent tokens.
    """
    self.TEXT = Field(sequential=True, tokenize=cnn_tokenization, lower = True, include_lengths=True)
    self.LABEL = LabelField(dtype=torch.long)
    # The first CSV column (the index written by to_csv) is skipped.
    Raw_datafields = [("Unnamed:0", None), ("text", self.TEXT), ("label", self.LABEL)]
    MAX_VOCAB_SIZE = 2000
    dataset = data.TabularDataset(
        path=f"./{name}",
        format='csv',
        skip_header=True, 
        fields=Raw_datafields) 
    self.TEXT.build_vocab(dataset, max_size = MAX_VOCAB_SIZE)
    self.LABEL.build_vocab(dataset)
    print(f"DATASET: {name}")
    print(vars(dataset[0]))
    print(self.TEXT.vocab.freqs.most_common(30))
    return dataset

  def _print(self, text):
    """Print a banner line followed by `text`."""
    print("\n\n\n\n################################################################################################\n")
    # The backslash is doubled: "\_" is an invalid escape sequence that newer
    # Python versions warn about. The printed text is unchanged.
    print(f"[\\_(O.O)_/] {text}\n\n")


## Proje Prosedürünün Yürütülmesi - Yapay Örnek Üretimi

In [None]:
# Dataset definitions - run this cell when a dataset is normalized for the first time.
# Only the uncommented entry is active; the commented ones are alternatives.
datasets = [
            #DatasetBase("news_category.csv", "short_description", "category", "1-y9Xf_q0QSmEGMYlI8wUhusMjyktzJxZ")
            # X DatasetBase("Corona_NLP_train.csv", "OriginalTweet", "Sentiment", "1usKN2yV6Wm7RftLJv-xsNTMysdEht9gO", coding="latin-1")
            #DatasetBase("tweets_disaster.csv", "text", "target", "1iMQCrNtDOTlea6j3ZdYe-JGs6R3MsOj8")
            #DatasetBase("financial_news_sentiment.csv", "text", "label", "1ngYA4Z7JomUhUZ6iUgcn-isj_-44VNLa", coding='cp437')
            DatasetBase("tweet_emotions_category.csv", "content", "sentiment", "1FDgaHTeF3wu6-B6jCfI5lBZJ0bB14zKb")
]

# Download every dataset and normalize it to the shared (text, label) columns;
# the wrapper's column names are updated to match the normalized frame.
normalization = DatasetNormalization()
for ds in datasets:
  ds.download_dataframe(drive)
  ds.set_dataframe(normalization.get_normalized_ds(ds))
  ds.text, ds.label = normalization.get_features()

In [None]:
# Draw a stratified subset from every dataset (default size: about 1000 rows),
# replace the loaded frame with it and save it as "norm_<dataset name>".
sampling = SampleSelection()
for ds in datasets:
  ds.dataframe = sampling.generate_sub_set(ds)
  ds.save_dataframe(prefix="norm")

Dataset Name     : tweet_emotions_category.csv
Total count      : 40000
Distribution     : [8638 8459 5209 5165 3842 2187 1776 1526 1323  827  759  179  110]
Current count    : 1007
New Distribution : [216 212 131 130  97  55  45  39  34  21  19   5   3]
Label Names      : ['neutral', 'worry', 'happiness', 'sadness', 'love', 'surprise', 'fun', 'relief', 'hate', 'empty', 'enthusiasm', 'boredom', 'anger']
Num of Label     : 13




In [None]:
# Initialise the T5 and Pegasus architectures.
# Note: run this only once; every execution uses different memory blocks.
# After editing the structures below, run the garbage-collection cell that follows.
t5 = Paraphrase_T5()
pegasus = Paraphrase_Pegasus()

In [None]:
# Release both paraphrasing models. The generation cell below references `t5`
# and `pegasus`, so the initialisation cell has to be run again after this
# one before generating.
import gc
del t5
del pegasus
gc.collect()

In [None]:
# Generate synthetic samples and save the derived datasets.
# The order of `architectures` fixes the order of the returned frames:
# index 0 is T5, index 1 is Pegasus.
architectures = [t5, pegasus]
paraphrase = ParaphraseBase(architectures)
for ds in datasets:
  new_df = paraphrase.generate(ds)
  new_df[0].to_csv(f"t5_{ds.name}")
  new_df[1].to_csv(f"pg_{ds.name}")

In [None]:

# Copy the generated paraphrase files to Google Drive.
# `drive` in this notebook is the PyDrive client, which has no mount();
# mounting is provided by google.colab.drive. It is imported under an alias so
# that `drive` stays bound to the PyDrive client that later cells use.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')
# NOTE: `new_df` holds the frames of the dataset processed last by the
# generation cell, so with several datasets every file gets the same content.
for ds in datasets:
  new_df[0].to_csv(f"/content/drive/My Drive/t5_{ds.name}")
  new_df[1].to_csv(f"/content/drive/My Drive/pg_{ds.name}")

## Proje Prosedürünün Yürütülmesi - Veri Setlerin Hazırlanması

In [None]:
# 5 li parçalar halinde dataframe rowlarını ayır, her 5 lik parça bir sentencedir
# 1, 2 ve 5 inin alındığı kısımları oluştur
# 1 lik kısımların %20, %50 sini seçip yeni bir set oluştur 
# kullanılan datasetlerin test setlerini oluştur
# teker teker eğitmek için gerekli yapıyı kur, her epochtaki train test al

In [None]:
# This section covers splitting the datasets into train/test sets and training.
# Download the prepared archive from Drive and extract it into the working
# directory (`unrar e` extracts without recreating the archive's folders).
downloaded = drive.CreateFile({'id': '1teFlXpOe0DKJoZStw3ORXpVKSN0O-x3y'})
downloaded.GetContentFile("paraphrasing_dataset.rar")
!unrar e paraphrasing_dataset.rar

In [None]:
# Datasets used for set preparation and training; only the uncommented entry is active.
datasets = [
            #DatasetBase("news_category.csv", "short_description", "category", "1-y9Xf_q0QSmEGMYlI8wUhusMjyktzJxZ"),
            #DatasetBase("Corona_NLP_train.csv", "OriginalTweet", "Sentiment", "1usKN2yV6Wm7RftLJv-xsNTMysdEht9gO", coding="latin-1"),
            DatasetBase("tweets_disaster.csv", "text", "target", "1iMQCrNtDOTlea6j3ZdYe-JGs6R3MsOj8"),
            #DatasetBase("financial_news_sentiment.csv", "text", "label", "1ngYA4Z7JomUhUZ6iUgcn-isj_-44VNLa", coding='cp437'),
            #DatasetBase("tweet_emotions_category.csv", "content", "sentiment", "1FDgaHTeF3wu6-B6jCfI5lBZJ0bB14zKb"),
            #DatasetBase("google_appstore_reviews.csv", "content", "score", "1FDgaHTeF3wu6-B6jCfI5lBZJ0bB14zKb")
]

# Paraphrase ratios to build, passed as `per` to get_paraphrased_set. Keep
# them in ascending order: the saved subsets are paired with these values by
# position.
paraphrase_percent = [0.2, 0.5, 0.7, 1, 2, 5]
# Number of paraphrase rows per source sentence in the t5_/pg_ files.
k = 5
# Test-set ratio passed to get_normal_set.
per=0.3

def save_paraphrased_set(name, phr_set, percent, k, arch_name):
  """Write one CSV per ratio: ``phr_<arch_name>_<name>_k<k>_p<ratio>.csv``.

  ``phr_set[i]`` is saved under ``percent[i]``; the two sequences are paired
  by position.
  """
  for position, ratio in enumerate(percent):
    target = f"phr_{arch_name}_{name}_k{k}_p{ratio}.csv"
    phr_set[position].to_csv(target)

# For every dataset: build and save the T5 and Pegasus paraphrase subsets,
# then build and save the train/test split. The paraphrase subsets are built
# first because get_normal_set rewrites ds.dataframe / ds.text / ds.label.
for ds in datasets:
  set_generation = GenerateTrainTestSet(ds)

  paraphrased_t5_set = set_generation.get_paraphrased_set(per=paraphrase_percent, k=k, prefix="t5")
  save_paraphrased_set(ds.name, paraphrased_t5_set, paraphrase_percent, k, "t5")

  paraphrased_pg_set = set_generation.get_paraphrased_set(per=paraphrase_percent, k=k, prefix="pg")
  save_paraphrased_set(ds.name, paraphrased_pg_set, paraphrase_percent, k, "pg")

  train_set, test_set, norm_set = set_generation.get_normal_set(per=per)
  train_set.to_csv(f"train_per{per}_{ds.name}")
  test_set.to_csv(f"test_per{per}_{ds.name}")

  # Keep the generated frames on the dataset wrapper for later cells.
  ds.train_set = train_set
  ds.test_set = test_set
  ds.paraphrased_t5_set = paraphrased_t5_set
  ds.paraphrased_pg_set = paraphrased_pg_set
  ds.norm_set = norm_set
  
  


0.2 959
Dataset Name     : tweets_disaster.csv
Total count      : 959
Distribution     : [546 413]
Current count    : 201
New Distribution : [114  87]
Label Names      : [0, 1]
Num of Label     : 2


0.5 959
Dataset Name     : tweets_disaster.csv
Total count      : 959
Distribution     : [546 413]
Current count    : 501
New Distribution : [285 216]
Label Names      : [0, 1]
Num of Label     : 2


0.7 959
Dataset Name     : tweets_disaster.csv
Total count      : 959
Distribution     : [546 413]
Current count    : 701
New Distribution : [399 302]
Label Names      : [0, 1]
Num of Label     : 2


Percentages and Numbers of Paraphrased Rows
	0.2 -> 201
	0.5 -> 501
	0.7 -> 701
	1 -> 959
	2 -> 1918
	5 -> 4795
0.2 969
Dataset Name     : tweets_disaster.csv
Total count      : 969
Distribution     : [555 414]
Current count    : 201
New Distribution : [115  86]
Label Names      : [0, 1]
Num of Label     : 2


0.5 969
Dataset Name     : tweets_disaster.csv
Total count      : 969
Distribution     :

## Proje Prosedürünün Yürütülmesi - Eğitim ve Test

In [None]:
# For every dataset: constructing ModelTraining trains the baseline on the
# norm subset; after that all four schedules are run for every architecture
# and paraphrase ratio. The file names must match those written by the
# set-preparation cell.
for ds in datasets:
  print("#######################################################################\n"+
        "##\n"+
        "##\t\tENSEMBLE LEARNING\n"+
        "##\n"+
        "#######################################################################\n\n")
  train_ds_name = f"train_per{per}_{ds.name}"
  test_ds_name = f"test_per{per}_{ds.name}"
  norm_ds_name = f"norm_{ds.name}"
  train_start = ModelTraining(train_ds_name, test_ds_name, norm_ds_name)

  for arch in ["t5", "pg"]:
    for p in paraphrase_percent:
      phr_ds_name = f"phr_{arch}_{ds.name}_k{k}_p{p}.csv"
      train_start.start_analysis_for_a_percentage(phr_ds_name, p)
      



#######################################################################
##
##		ENSEMBLE LEARNING
##
#######################################################################


DATASET: test_per0.3_tweets_disaster.csv
{'text': ['way', 'cannot', 'eat', 'shit'], 'label': '0'}
[('fire', 99), ('like', 90), ('get', 79), ('via', 59), ('bomb', 59), ('year', 57), ('one', 57), ('news', 53), ('new', 51), ('flood', 47), ('kill', 45), ('peopl', 45), ('storm', 45), ('video', 44), ('day', 43), ('go', 41), ('attack', 41), ('time', 41), ('would', 40), ('build', 40), ('love', 39), ('crash', 39), ('look', 38), ('disast', 38), ('come', 38), ('still', 38), ('us', 37), ('malaysia', 37), ('burn', 36), ('evacu', 36)]




################################################################################################

[\_(O.O)_/] train_per0.3_tweets_disaster.csv






################################################################################################

[\_(O.O)_/] norm_tweets_disaster.csv


DATASET: 