In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U datasets
!pip install transformers sentence-transformers

In [1]:
# Print the installed `datasets` version so the run records which release was used.
import datasets
print(datasets.__version__)

2.17.1


In [2]:
import datetime
import random
import re
import time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup

In [3]:
# Pick the compute device once; later cells move the model and batches onto `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [4]:
# Download the English ("en") configuration of the stsb_multi_mt dataset and show its splits.
dataset = load_dataset("stsb_multi_mt", name="en")
print(dataset)

Downloading readme:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/470k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


In [5]:
# Show one raw training row (sentence pair plus its similarity score).
print("A sample from the STSB dataset's training split:", dataset['train'][3], sep="\n")

A sample from the STSB dataset's training split:
{'sentence1': 'Three men are playing chess.', 'sentence2': 'Two men are playing chess.', 'similarity_score': 2.5999999046325684}


In [6]:
# Tokenizer of the 'bert-base-uncased' checkpoint. It is read as a module-level
# name by STSBDataset.get_batch_texts and by predict_similarity.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
class STSBDataset(torch.utils.data.Dataset):
    """Sentence-pair dataset for semantic-textual-similarity fine-tuning.

    Every item is a tokenized sentence pair together with its similarity
    score divided by 5.0.

    Args:
        dataset: Iterable of rows; each row is a mapping with the keys
            'sentence1', 'sentence2' and 'similarity_score'.
    """

    def __init__(self, dataset):
        self.normalized_similarity_scores = []
        self.first_sentences = []
        self.second_sentences = []
        # Read every row exactly once: iterating the source three times triples
        # the loading cost and silently yields an empty dataset when `dataset`
        # is a one-shot iterable (e.g. a generator).
        for row in dataset:
            self.normalized_similarity_scores.append(row['similarity_score'] / 5.0)
            self.first_sentences.append(row['sentence1'])
            self.second_sentences.append(row['sentence2'])
        # Despite the name, each entry is a two-element list [sentence1, sentence2].
        self.concatenated_sentences = [
            [str(first), str(second)]
            for first, second in zip(self.first_sentences, self.second_sentences)
        ]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the normalized similarity score of pair `idx` as a tensor."""
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        """Tokenize both sentences of pair `idx`.

        Uses the module-level `tokenizer`. Both sentences are padded/truncated
        to 128 tokens and returned as PyTorch tensors.
        """
        return tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=128, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        """Return `(tokenized_pair, label)` for pair `idx`."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    """Split a batched encoding into one feature dict per batch element.

    Args:
        texts: Mapping with 'input_ids' and 'attention_mask' entries that can
            be iterated in parallel (one element per sample).

    Returns:
        list[dict]: One {'input_ids': ..., 'attention_mask': ...} dict per
        sample; any other keys of `texts` are dropped.
    """
    features = []
    for ids, mask in zip(texts['input_ids'], texts['attention_mask']):
        features.append({'input_ids': ids, 'attention_mask': mask})
    return features

In [8]:
class BertForSTS(torch.nn.Module):
    """'bert-base-uncased' followed by a pooling layer, producing sentence embeddings."""

    def __init__(self):
        super().__init__()
        # Attribute order is kept (bert, pooling_layer, sts_bert) because it
        # determines the order in which sub-modules are registered and printed.
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        embedding_dim = self.bert.get_word_embedding_dimension()
        self.pooling_layer = models.Pooling(embedding_dim)
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """Return the 'sentence_embedding' entry computed for `input_data`."""
        return self.sts_bert(input_data)['sentence_embedding']

In [9]:
# Instantiate the model and move it to the selected device. The trailing
# expression also makes the notebook display the module structure.
model = BertForSTS()
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  )
)

In [10]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of embedding pairs and target scores.

    Args:
        loss_fn: Module comparing predicted similarities with the labels
            (mean squared error by default).
        transform_fn: Module applied to the cosine similarities before the
            loss is computed (identity by default).

    Note:
        The default modules are created once, when the class is defined, and
        are shared by every instance that relies on the defaults.
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super().__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for a batch.

        Args:
            inputs: Sequence with one entry per sample; entry[0] and entry[1]
                are the embeddings of the first and second sentence.
            labels: Tensor holding one target similarity per sample.

        Returns:
            The value returned by `loss_fn`.
        """
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # reshape(-1) instead of squeeze(): squeeze() collapses a batch of one
        # into a 0-d tensor, which no longer matches the (1,)-shaped predictions
        # and makes MSELoss fall back to broadcasting with a warning.
        return self.loss_fn(outputs, labels.reshape(-1))

In [11]:
# Wrap the dataset's predefined 'train' and 'dev' splits; 'dev' serves as the validation set.
train_ds = STSBDataset(dataset['train'])
val_ds = STSBDataset(dataset['dev'])

train_size, val_size = len(train_ds), len(val_ds)

print(f'{train_size:>5,} training samples')
print(f'{val_size:>5,} validation samples')

5,749 training samples
1,500 validation samples


In [12]:
batch_size = 8

# Training batches are drawn in shuffled order.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

# Validation uses the same batch size and no shuffling.
validation_dataloader = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

In [13]:
epochs = 8
optimizer = AdamW(model.parameters(), lr=1e-6)

# The scheduler is stepped once per training batch, so the schedule length is
# [number of batches] x [number of epochs]; no warm-up steps are used.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [22]:
def train():
    """Fine-tune the module-level `model` and evaluate it after every epoch.

    Relies on these module-level names: `model`, `device`, `epochs`,
    `train_dataloader`, `validation_dataloader`, `optimizer`, `scheduler`,
    `collate_fn` and `CosineSimilarityLoss`.

    Returns:
        tuple: `(model, training_stats)` where `training_stats` is a list with
        one dict per epoch holding 'epoch', 'Training Loss' and 'Valid. Loss'
        (losses are averaged over the batches of the epoch).
    """
    seed_val = 42
    # .to(device) instead of .cuda(): the notebook falls back to the CPU when
    # no GPU is available, and .cuda() would raise in that case.
    criterion = CosineSimilarityLoss().to(device)
    random.seed(seed_val)
    torch.manual_seed(seed_val)

    def batch_loss(batch, labels):
        # Move the batch to the model's device, embed every sentence pair
        # separately, and compare the pair similarities with the labels.
        batch['input_ids'] = batch['input_ids'].to(device)
        batch['attention_mask'] = batch['attention_mask'].to(device)
        embeddings = [model(feature) for feature in collate_fn(batch)]
        return criterion(embeddings, labels.to(device))

    # Per-epoch training and validation losses.
    training_stats = []
    for epoch_i in range(epochs):
        total_train_loss = 0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            model.zero_grad()
            loss = batch_loss(train_data, train_label)
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # Average loss over all training batches of this epoch.
        avg_train_loss = total_train_loss / len(train_dataloader)

        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for val_data, val_label in tqdm(validation_dataloader):
                total_eval_loss += batch_loss(val_data, val_label).item()
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
            }
        )
    return model, training_stats

# Launch the training; `model` is rebound to the object returned by train(),
# and `training_stats` receives the per-epoch losses.
model, training_stats = train()

100%|██████████| 719/719 [03:44<00:00,  3.21it/s]
100%|██████████| 188/188 [00:18<00:00, 10.01it/s]
100%|██████████| 719/719 [03:44<00:00,  3.21it/s]
100%|██████████| 188/188 [00:18<00:00,  9.97it/s]
100%|██████████| 719/719 [03:43<00:00,  3.22it/s]
100%|██████████| 188/188 [00:18<00:00,  9.95it/s]
100%|██████████| 719/719 [03:43<00:00,  3.22it/s]
100%|██████████| 188/188 [00:19<00:00,  9.87it/s]
100%|██████████| 719/719 [03:45<00:00,  3.19it/s]
100%|██████████| 188/188 [00:18<00:00,  9.90it/s]
100%|██████████| 719/719 [03:44<00:00,  3.20it/s]
100%|██████████| 188/188 [00:18<00:00,  9.95it/s]
100%|██████████| 719/719 [03:44<00:00,  3.21it/s]
100%|██████████| 188/188 [00:18<00:00,  9.89it/s]
100%|██████████| 719/719 [03:44<00:00,  3.21it/s]
100%|██████████| 188/188 [00:19<00:00,  9.89it/s]


In [21]:
# Tabulate the per-epoch statistics, using the 'epoch' column as the row index.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.026849,0.037488


In [18]:
# Load the test split.
test_dataset = load_dataset("stsb_multi_mt", name="en", split="test")

# Collect both sentence columns, then pair them up as [sentence1, sentence2] strings.
first_sent = [row['sentence1'] for row in test_dataset]
second_sent = [row['sentence2'] for row in test_dataset]
full_text = [list(map(str, pair)) for pair in zip(first_sent, second_sent)]

# Put the model in evaluation mode before running predictions.
model.eval()

def predict_similarity(sentence_pair):
    """Return the cosine similarity the module-level `model` assigns to two sentences.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        sentence_pair: Sequence of two sentences; each is tokenized and
            padded/truncated to 128 tokens.

    Returns:
        float: Cosine similarity between the two sentence embeddings.
    """
    test_input = tokenizer(sentence_pair, padding='max_length', max_length=128,
                           truncation=True, return_tensors="pt").to(device)
    # Only input_ids and attention_mask are passed to the model, as in training
    # (collate_fn keeps just those two keys).
    test_input.pop('token_type_ids', None)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(test_input)
    sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()
    return sim

In [19]:
# Score one test pair with the fine-tuned model.
example_1 = full_text[100]
print(
    f"Sentence 1: {example_1[0]}",
    f"Sentence 2: {example_1[1]}",
    f"Predicted similarity score: {round(predict_similarity(example_1), 2)}",
    sep="\n",
)

Sentence 1: A cat is walking around a house.
Sentence 2: A woman is peeling potato.
Predicted similarity score: 0.03


In [None]:
# NOTE(review): this cell repeats the previous cell's prints for the same
# `example_1`; presumably a leftover that can be removed — confirm.
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

In [None]:
# Save only the model's state_dict (not the module object itself) to the
# Kaggle working directory.
PATH = '/kaggle/working/model_with_BERT'
torch.save(model.state_dict(), PATH)

In [None]:
!pip install pyPDF2

In [None]:
def extract_text(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages in page order, each followed by a newline.
    """
    # Imported here because the notebook installs PyPDF2 only right before this
    # cell and never imports it elsewhere; without the import the function
    # fails with a NameError.
    import PyPDF2

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join inside the `with` block so pages are read while the file is open.
        # `or ""` guards against a page for which no text is returned.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

In [None]:
def find_abstract_to_references(text):
    """Return the part of `text` from the abstract marker through the references marker.

    The abstract marker is the first case-insensitive occurrence of
    "Abstract" or "Özet". The references marker is the first case-insensitive
    occurrence of "References" or "Kaynakça" located after the abstract marker.

    Args:
        text: Full document text.

    Returns:
        str: The slice starting at the abstract marker and ending right after
        the references marker. If either marker is missing, the message
        "Abstract veya References bulunamadı." is returned instead.

    Note:
        NOTE(review): the not-found message is returned as an ordinary string,
        so callers cannot tell it apart from document text without comparing
        against it.
    """
    # Patterns marking the start of the abstract and the references heading.
    abstract_start_regex = r"(Abstract|Özet)"
    references_end_regex = r"(References|Kaynakça)"

    abstract_start_match = re.search(abstract_start_regex, text, re.IGNORECASE)
    if abstract_start_match is None:
        return "Abstract veya References bulunamadı."

    start_index = abstract_start_match.start()
    # Look for the references marker only after the abstract marker; a match
    # located before the abstract would produce an empty slice.
    references_pattern = re.compile(references_end_regex, re.IGNORECASE)
    references_end_match = references_pattern.search(text, abstract_start_match.end())
    if references_end_match is None:
        return "Abstract veya References bulunamadı."

    return text[start_index:references_end_match.end()]

In [None]:
# Paths of the PDF files to compare (edit these to point at your own inputs).
pdf_path = '/kaggle/input/artcle/Performance_analysis_of_TCP_incast_with_TCP_Lite_and_Abstract_TCP.pdf'
pdf_path2 = '/kaggle/input/artcle/TCP_Fairness_Among_Modern_TCP_Congestion_Control_Algorithms_Including_TCP_BBR.pdf'
pdf_path3 = '/kaggle/input/game-theo/A_Speed_Guide_Model_for_Collision_Avoidance_in_Non-Signalized_Intersections_Based_on_Reduplicate_Game_Theory.pdf'
pdf_path4 = '/kaggle/input/oyun-teo2/Device_to_Device_Communication_using_Stackelberg_Game_Theory_approach.pdf'
pdf_path5 = '/kaggle/input/haaaaa/Resource_Allocation_for_Device-to-Device_Communications_Underlaying_Heterogeneous_Cellular_Networks_Using_Coalitional_Games.pdf'

# Extract the raw text of each document, in the order listed above.
full_text_tcp, full_text_tcp2, full_text_GT, full_text_GT2, full_text_GT3 = [
    extract_text(path)
    for path in (pdf_path, pdf_path2, pdf_path3, pdf_path4, pdf_path5)
]

In [None]:
# Keep only the text between the abstract and the references of each document.
(
    abstract_to_references_tcp,
    abstract_to_references_tcp2,
    abstract_to_references_GT,
    abstract_to_references_GT2,
    abstract_to_references_GT3,
) = map(
    find_abstract_to_references,
    (full_text_tcp, full_text_tcp2, full_text_GT, full_text_GT2, full_text_GT3),
)

In [None]:
# Document pairs to compare; each list holds the two texts of one comparison.
sentences  = [abstract_to_references_tcp,  abstract_to_references_tcp2]
sentences2 = [abstract_to_references_tcp,  abstract_to_references_GT]
sentences3 = [abstract_to_references_tcp2, abstract_to_references_GT]
sentences4 = [abstract_to_references_tcp,  abstract_to_references_GT2]
sentences5 = [abstract_to_references_tcp2, abstract_to_references_GT2]
sentences6 = [abstract_to_references_GT2,  abstract_to_references_GT]
sentences7 = [abstract_to_references_GT2,  abstract_to_references_GT3]
sentences8 = [abstract_to_references_GT,   abstract_to_references_GT3]

# NOTE(review): this rebinds `model`, so the fine-tuned BertForSTS instance
# trained earlier is no longer reachable under that name and
# predict_similarity() will use this pretrained encoder instead — confirm
# this is intended.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def calculate_similarity(sentences, encoder=None):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        sentences: Sequence whose first two items are the texts to compare.
        encoder: Object with an `encode(sentences)` method returning one
            embedding per text. Defaults to the module-level `model`.

    Returns:
        numpy.float64: Cosine similarity of the first two embeddings; 0.0 if
        either embedding has zero norm.

    Note:
        NOTE(review): the encoder presumably truncates long inputs to its
        maximum sequence length, so for whole documents only the beginning
        may be compared — TODO confirm against the model's documentation.
    """
    if encoder is None:
        encoder = model
    embeddings = np.asarray(encoder.encode(sentences), dtype=np.float64)
    first, second = embeddings[0], embeddings[1]
    # Computed with NumPy because `cosine_similarity` is never imported in this
    # notebook, so the previous call raised a NameError.
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        return np.float64(0.0)
    return np.float64(np.dot(first, second) / norm_product)

In [None]:
# Score every document pair, in the order in which the pairs were defined.
(
    similarity,
    similarity2,
    similarity3,
    similarity4,
    similarity5,
    similarity6,
    similarity7,
    similarity8,
) = map(
    calculate_similarity,
    (sentences, sentences2, sentences3, sentences4,
     sentences5, sentences6, sentences7, sentences8),
)