In [None]:
!pip install scikit-learn pandas tensorflow transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.0 MB/s[0m eta [36m0:00:

In [8]:
# Download required NLTK resources and import libraries

import pandas as pd
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset, TensorDataset
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saurav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saurav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Saurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Logistic Regression

In [None]:
# Load the dataset

train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the text data
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    preprocessed_tokens = [token.lower() for token in tokens if token.isalpha()]
    return ' '.join(preprocessed_tokens)

train_df['preprocessed_sentence1'] = train_df['sentence1'].apply(preprocess_text)
train_df['preprocessed_sentence2'] = train_df['sentence2'].apply(preprocess_text)
test_df['preprocessed_sentence1'] = test_df['sentence1'].apply(preprocess_text)
test_df['preprocessed_sentence2'] = test_df['sentence2'].apply(preprocess_text)

# Create feature vectors using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['preprocessed_sentence1'] + ' ' + train_df['preprocessed_sentence2'])
X_test = vectorizer.transform(test_df['preprocessed_sentence1'] + ' ' + test_df['preprocessed_sentence2'])

# Scale the dense feature matrices (use with_mean=False to avoid centering sparse matrices)
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare the target variable
y_train = train_df['label']
y_test = test_df['label']

# Train a logistic regression model
lr_model = LogisticRegression(max_iter=5000)  # Increase the number of iterations
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
lr_accuracy = round(accuracy_score(y_test, y_pred),4)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.5119


### Siamese NN

In [18]:
# Configure GPU memory growth
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Load the dataset
train_df = pd.read_csv('train.tsv', sep='\t')
val_df = pd.read_csv('dev.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the text data
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Apply text preprocessing to the dataset
train_df['preprocessed_sentence1'] = train_df['sentence1'].apply(preprocess_text)
train_df['preprocessed_sentence2'] = train_df['sentence2'].apply(preprocess_text)
test_df['preprocessed_sentence1'] = test_df['sentence1'].apply(preprocess_text)
test_df['preprocessed_sentence2'] = test_df['sentence2'].apply(preprocess_text)

# Create vocabulary and tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['preprocessed_sentence1'] + train_df['preprocessed_sentence2'])

vocab_size = len(tokenizer.word_index) + 1

# Convert sentences to sequences
train_seq1 = tokenizer.texts_to_sequences(train_df['preprocessed_sentence1'])
train_seq2 = tokenizer.texts_to_sequences(train_df['preprocessed_sentence2'])
test_seq1 = tokenizer.texts_to_sequences(test_df['preprocessed_sentence1'])
test_seq2 = tokenizer.texts_to_sequences(test_df['preprocessed_sentence2'])

# Pad sequences
max_seq_length = 50
train_seq1 = pad_sequences(train_seq1, maxlen=max_seq_length, padding='post')
train_seq2 = pad_sequences(train_seq2, maxlen=max_seq_length, padding='post')
test_seq1 = pad_sequences(test_seq1, maxlen=max_seq_length, padding='post')
test_seq2 = pad_sequences(test_seq2, maxlen=max_seq_length, padding='post')

# Prepare the target variable
y_train = train_df['label']
y_test = test_df['label']

# Siamese neural network model
embedding_dim = 100
lstm_units = 64

input1 = Input(shape=(max_seq_length,))
input2 = Input(shape=(max_seq_length,))

# Embedding layer to convert words to dense vectors
embedding_layer = Embedding(vocab_size, embedding_dim)

# LSTM layer to process sequences
lstm_layer = LSTM(lstm_units)

# Process inputs through embedding and LSTM layers
encoded1 = lstm_layer(embedding_layer(input1))
encoded2 = lstm_layer(embedding_layer(input2))

# Calculate absolute difference between encoded vectors
merged = Lambda(lambda x: abs(x[0] - x[1]))([encoded1, encoded2])

# Predict the probability of paraphrase
preds = Dense(1, activation='sigmoid')(merged)

# Create the Siamese model
siamese_model = Model(inputs=[input1, input2], outputs=preds)
siamese_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Training
print("Training started...")
siamese_model.fit([train_seq1, train_seq2], y_train, epochs=50, batch_size=64, verbose=1)

# Testing
print("Testing started...")
y_pred = siamese_model.predict([test_seq1, test_seq2])

# Convert predicted probabilities to binary predictions
y_pred = [1 if pred > 0.5 else 0 for pred in y_pred]

# Evaluate the model
snn_accuracy = round(accuracy_score(y_test, y_pred),4)
print("Siamese NN Accuracy:", snn_accuracy)

KeyboardInterrupt: 

### DistilBERT

In [None]:
# Load the training and test data
train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the data
X_train = train_df[['sentence1', 'sentence2']]
y_train = train_df['label']
X_test = test_df[['sentence1', 'sentence2']]
y_test = test_df['label']

# Define a custom dataset for PyTorch
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, sentences1, sentences2, labels):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence1 = self.sentences1.iloc[idx]
        sentence2 = self.sentences2.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label
        }

# Initialize the tokenizer and model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
distilbert_model.to(device)

# Create DataLoader objects for the training and test datasets with increased batch size
train_dataset = ParaphraseDataset(distilbert_tokenizer, X_train["sentence1"], X_train["sentence2"], y_train)
test_dataset = ParaphraseDataset(distilbert_tokenizer, X_test["sentence1"], X_test["sentence2"], y_test)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Fine-tuning and Training the model
distilbert_model.train()
optimizer = AdamW(distilbert_model.parameters(), lr=2e-5)

num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {running_loss / len(train_dataloader)}")


# Save the fine-tuned model
model_save_path = "distilbert_model.pth"
# from google.colab import files
# files.download('distilbert_model.pth')
torch.save(distilbert_model.state_dict(), model_save_path)

# Load the fine-tuned model
distilbert_model.load_state_dict(torch.load(model_save_path))
distilbert_model.eval()

# Testing the model
y_true = []
y_pred = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(logits.argmax(1).cpu().numpy())

# Convert predictions from numerical to binary labels (0 or 1)
y_pred = [1 if pred == 1 else 0 for pred in y_pred]

# Compute performance metrics
db_accuracy = round(accuracy_score(y_true, y_pred),4)
print("DistilBERT Accuracy:", db_accuracy)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 15/3588 [00:02<11:10,  5.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:  29%|██▊       | 1027/3588 [03:28<08:26,  5.06it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:  50%|████▉     | 1787/3588 [05:55<06:01,  4.98it/s]Be aware, overflo

Epoch 1 Loss: 0.5498794106291057


Epoch 2:  16%|█▋        | 591/3588 [01:53<09:20,  5.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  20%|█▉        | 701/3588 [02:14<09:01,  5.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  23%|██▎       | 829/3588 [02:39<08:39,  5.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  70%|██████▉   | 2498/3588 [08:00<03:43,  4.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. s

In [None]:
from google.colab import files
files.download('distilbert_model.pth')

### RoBERTa

In [None]:
# # Load the training and test data
# train_df = pd.read_csv('/content/train.tsv', sep='\t')
# val_df = pd.read_csv('/content/validation.tsv', sep='\t')
# test_df = pd.read_csv('/content/test.tsv', sep='\t')

# # Combine the train and the validation
# train_df = pd.concat([train_df, val_df], ignore_index=True)

# # Load pre-trained RoBERTa tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# # Convert sentences to lists
# train_sentences1 = train_df['sentence1'].tolist()
# train_sentences2 = train_df['sentence2'].tolist()
# train_labels = train_df['label'].tolist()

# test_sentences1 = test_df['sentence1'].tolist()
# test_sentences2 = test_df['sentence2'].tolist()
# test_labels = test_df['label'].tolist()

# # Tokenize and encode the sentences
# train_encodings = tokenizer(train_sentences1, train_sentences2, truncation=True, padding=True, return_tensors='pt')
# test_encodings = tokenizer(test_sentences1, test_sentences2, truncation=True, padding=True, return_tensors='pt')

# # Convert labels to tensors
# train_labels_tensor = torch.tensor(train_labels).float().view(-1, 1)
# test_labels_tensor = torch.tensor(test_labels).float().view(-1, 1)

# # Create a TensorDataset
# train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels_tensor)
# test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels_tensor)

# # Define batch size for DataLoader
# batch_size = 16

# # Create DataLoaders for training and testing
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size)

# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

# # Define the optimizer and loss function
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# loss_fn = torch.nn.BCEWithLogitsLoss()

# # Fine-tune the RoBERTa model
# epochs = 2
# for epoch in range(epochs):
#     model.train()
#     for batch in tqdm(train_loader,desc="Training:"):
#         input_ids, attention_mask, labels = batch

#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask)[0]
#         loss = loss_fn(outputs, labels)
#         loss.backward()
#         optimizer.step()
# model_save_path = "roberta_model.pth"
# torch.save(model.state_dict(), model_save_path)

# # Evaluate the fine-tuned model on the test set
# model_load_path = "/content/roberta_model.pth"
# model.load_state_dict(torch.load(model_load_path))
# model.eval()
# test_predictions = []
# with torch.no_grad():
#     for batch in test_loader:
#         input_ids, attention_mask, _ = batch
#         outputs = model(input_ids, attention_mask=attention_mask)[0]
#         predictions = torch.sigmoid(outputs).cpu().numpy()
#         test_predictions.extend(predictions)

# # Convert predictions to binary labels (0 or 1) based on a threshold (0.5)
# threshold = 0.5
# test_predictions = [1 if p >= threshold else 0 for p in test_predictions]

# # Calculate accuracy
# test_labels = [int(label.item()) for _, _, label in test_dataset]
# accuracy = sum([1 for pred, true in zip(test_predictions, test_labels) if pred == true]) / len(test_labels)
# print(f"Test accuracy: {round(accuracy,4)}")

### RoBERTa With GPU

In [None]:
# Load the training and test data
train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

device = torch.device("cuda")

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Load pre-trained RoBERTa tokenizer and model
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# Convert sentences to lists
train_sentences1 = train_df['sentence1'].tolist()
train_sentences2 = train_df['sentence2'].tolist()
train_labels = train_df['label'].tolist()

test_sentences1 = test_df['sentence1'].tolist()
test_sentences2 = test_df['sentence2'].tolist()
test_labels = test_df['label'].tolist()

# Tokenize and encode the sentences
train_encodings = roberta_tokenizer(train_sentences1, train_sentences2, truncation=True, padding=True, return_tensors='pt')
test_encodings = roberta_tokenizer(test_sentences1, test_sentences2, truncation=True, padding=True, return_tensors='pt')

# Convert labels to tensors
train_labels_tensor = torch.tensor(train_labels).float().view(-1, 1)
test_labels_tensor = torch.tensor(test_labels).float().view(-1, 1)

train_labels_tensor = train_labels_tensor.to(device)
test_labels_tensor = test_labels_tensor.to(device)

# Create a TensorDataset
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels_tensor)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels_tensor)

# Define batch size for DataLoader
batch_size = 16

# Create DataLoaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

roberta_model = roberta_model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(roberta_model.parameters(), lr=1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Fine-tune the RoBERTa model
epochs = 2
for epoch in range(epochs):
    roberta_model.train()
    for batch in tqdm(train_loader, desc="Training:"):
      input_ids, attention_mask, labels = batch
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      labels = labels.to(device)

      optimizer.zero_grad()
      outputs = roberta_model(input_ids, attention_mask=attention_mask)[0]
      loss = loss_fn(outputs, labels)
      loss.backward()
      optimizer.step()

model_save_path = "roberta_model.pth"
torch.save(roberta_model.state_dict(), model_save_path)

# Evaluate the fine-tuned model on the test set
roberta_model.load_state_dict(torch.load(model_save_path))
roberta_model.eval()

test_predictions = []
with torch.no_grad():
    for batch in test_loader:
      input_ids, attention_mask, _ = batch
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      outputs = roberta_model(input_ids, attention_mask=attention_mask)[0]
      predictions = torch.sigmoid(outputs).cpu().numpy()
      test_predictions.extend(predictions)

# Convert predictions to binary labels (0 or 1) based on a threshold (0.5)
threshold = 0.5
test_predictions = [1 if p >= threshold else 0 for p in test_predictions]

# Calculate accuracy
test_labels = [int(label.item()) for _, _, label in test_dataset]
rb_accuracy = sum([1 for pred, true in zip(test_predictions, test_labels) if pred == true]) / len(test_labels)
print("RoBERTa Accuracy:", round(rb_accuracy,4))

In [None]:
#from google.colab import files
files.download('roberta_model.pth')

In [13]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load the tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load the model architecture
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# Load the saved model weights, ignoring missing keys
model_save_path = "roberta_model.pth"
state_dict = torch.load(model_save_path, map_location=torch.device('cpu'))
roberta_model.load_state_dict(state_dict, strict=False)

# If you're using a GPU, move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_model.to(device)

# Set the model to evaluation mode
roberta_model.eval()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should pr

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

### Compare Accuracies

In [None]:
accuracies = [lr_accuracy, snn_accuracy, db_accuracy, rb_accuracy]
labels = ['Logistic Regression', 'Siamese NN', 'DistilBERT', 'RoBERTa']

plt.bar(labels, accuracies, color=['#7FB3D5', '#9AC48A', '#F2AE72', '#D092E2'])
plt.xlabel('Accuracies')
plt.ylabel('Accuracy Values')
plt.title('Comparison of Accuracies')
plt.ylim(0, 1)
plt.show()

### Testing on unseen sentences

In [14]:
# List of unseen sentence pairs for paraphrase testing

test_sentences = [
    ("The dog is sitting on the windowsill.", "A dog is perched on the windowsill."), # Paraphrased
    ("She played the piano gracefully.", "Her piano playing was filled with grace."), # Paraphrased
    ("The conference has been postponed due to unforeseen circumstances.", "Due to unexpected events, the conference has been rescheduled."), # Paraphrased
    ("He's not feeling well, so he won't be coming to the party.", "Because he's under the weather, he won't make it to the party."), # Paraphrased
    ("The book was so fascinating that I couldn't put it down.", "The book was incredibly engaging, and I couldn't stop reading it."), # Paraphrased
    ("The sky is blue.", "The grass is green."),  # Non-paraphrased
    ("They went for a walk in the park.", "She took a stroll in the park."),  # Paraphrased
    ("He loves to swim.", "His favorite activity is swimming."),  # Paraphrased
    ("She's a talented artist.", "Her artistic skills are remarkable."),  # Paraphrased
    ("The sun rises in the east.", "The sun sets in the west.")  # Non-paraphrased
]

In [11]:
# test distilbert model on unseen sentences

distilbert_model.eval()
with torch.no_grad():
    for sentence_pair in test_sentences:
        sentence1, sentence2 = sentence_pair
        inputs = distilbert_tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        y_pred.extend(logits.argmax(1).cpu().numpy())

predictions = []
predictions = ["Paraphrased" if pred == 1 else "Not Paraphrased" for pred in y_pred]

sentence1_list = []
sentence2_list = []
for i, sentence_pair in enumerate(test_sentences):
    sentence1, sentence2 = sentence_pair
    sentence1_list.append(sentence1)
    sentence2_list.append(sentence2)

df = pd.DataFrame({
    'Sentence 1': sentence1_list,
    'Sentence 2': sentence2_list,
    'Prediction': predictions
})

print(df)

NameError: name 'distilbert_model' is not defined

In [2]:
roberta_model=fi

NameError: name 'roberta_model' is not defined

In [20]:
# test roberta model on unseen sentences

roberta_model.eval()
test_predictions = []
with torch.no_grad():
    for sentence_pair in test_sentences:
        sentence1, sentence2 = sentence_pair
        inputs = roberta_tokenizer(sentence1, sentence2, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        predictions = torch.sigmoid(outputs.logits).cpu().numpy()
        test_predictions.append(predictions)

# Assuming test_predictions is a list of prediction scores
threshold = 0.5  # You can adjust this threshold based on your needs

predictions = []
for pred in test_predictions:
    if pred >= threshold:
        predictions.append("Paraphrased")
    else:
        predictions.append("Not Paraphrased")

sentence1_list = []
sentence2_list = []
for i, sentence_pair in enumerate(test_sentences):
    sentence1, sentence2 = sentence_pair
    sentence1_list.append(sentence1)
    sentence2_list.append(sentence2)

df = pd.DataFrame({
    'Sentence 1': sentence1_list,
    'Sentence 2': sentence2_list,
    'Prediction': predictions
})

print(df)

                                          Sentence 1  \
0              The dog is sitting on the windowsill.   
1                   She played the piano gracefully.   
2  The conference has been postponed due to unfor...   
3  He's not feeling well, so he won't be coming t...   
4  The book was so fascinating that I couldn't pu...   
5                                   The sky is blue.   
6                  They went for a walk in the park.   
7                                  He loves to swim.   
8                           She's a talented artist.   
9                         The sun rises in the east.   

                                          Sentence 2       Prediction  
0                A dog is perched on the windowsill.      Paraphrased  
1           Her piano playing was filled with grace.      Paraphrased  
2  Due to unexpected events, the conference has b...      Paraphrased  
3  Because he's under the weather, he won't make ...      Paraphrased  
4  The book was incredi

In [28]:
def extract_keywords(text):
    # Simple keyword extraction using CountVectorizer
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    return set(keywords)

def calculate_keyword_similarity(resume, job_description):
    resume_keywords = extract_keywords(resume)
    job_keywords = extract_keywords(job_description)
    common_keywords = resume_keywords.intersection(job_keywords)
    total_keywords = resume_keywords.union(job_keywords)
    return len(common_keywords) / len(total_keywords) if total_keywords else 0

def compare_resume_to_job_description(resume, job_description):
    # Tokenize and encode the texts
    encodings = roberta_tokenizer(resume, job_description, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Model inference
    with torch.no_grad():
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state.mean(dim=1)  # Use the mean of the last hidden state
        similarity_score = cosine_similarity(logits.cpu().numpy())

    # Keyword similarity adjustment
    keyword_similarity = calculate_keyword_similarity(resume, job_description)
    
    # Combine the two similarity scores (adjust weights as needed)
    combined_similarity_score = 0.7 * similarity_score + 0.3 * keyword_similarity
    return combined_similarity_score[0][0]

# Example usage
resume = """Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and implemented Google Cloud Spanner, improving database query response time by 2 times
• Researched and evaluated AWS Sagemaker & Google Cloud Vertex AI for model training and deployment using
CICD improving training time by 20% and fixed underlying issues"""
job_description = """What You’ll Do:

As a key member of the Data team, the Senior Data Engineer will have the autonomy to design, build, and deploy data pipelines using our Databricks platform
Design, build, and maintain data pipelines to ingest batch and streaming data in production environments
Build QA and monitoring tooling on top of the pipelines to minimize bugs
This role offers the opportunity to tackle crucial business problems in a dynamic, fast-paced team, where your ability to deliver with minimal oversight will be highly valued


What You Bring:

5+ years of experience working as a Data Engineer
Prior experience in building ELT data pipelines in the Databricks platform
Experience with: SQL, pySpark, Python
Adhere to simple, maintainable code and cut complexity whenever possible


You’ll Stand Out With:

Prior experience working in a startup environment
Prior experience working in Business Intelligence or Data Analytics
Familiarity with AWS infrastructure and building CICD pipelines
Prior experience with backend API’s using Flask or Django"""

similarity_score = compare_resume_to_job_description(resume, job_description)
print(f"Similarity Score: {similarity_score:.4f}")

AttributeError: 'SequenceClassifierOutput' object has no attribute 'last_hidden_state'

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def extract_keywords(text):
    # Simple keyword extraction using CountVectorizer
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    return set(keywords)

def calculate_keyword_similarity(resume, job_description):
    resume_keywords = extract_keywords(resume)
    job_keywords = extract_keywords(job_description)
    common_keywords = resume_keywords.intersection(job_keywords)
    total_keywords = resume_keywords.union(job_keywords)
    return len(common_keywords) / len(total_keywords) if total_keywords else 0

def compare_resume_to_job_description(resume, job_description):
    # Tokenize and encode the texts
    encodings = roberta_tokenizer(resume, job_description, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Model inference
    with torch.no_grad():
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Since logits are not directly comparable for similarity, we'll use them to get embeddings
        # and then calculate similarity (for sequence classification this step might be different)
        # For simplicity, we'll use the mean of the logits for similarity calculation
        similarity_score = cosine_similarity(logits.cpu().numpy(), logits.cpu().numpy())

    # Keyword similarity adjustment
    keyword_similarity = calculate_keyword_similarity(resume, job_description)
    
    # Combine the two similarity scores (adjust weights as needed)
    combined_similarity_score = 0.7 * similarity_score[0][0] + 0.3 * keyword_similarity
    return combined_similarity_score

# Example usage
resume = """Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and implemented Google Cloud Spanner, improving database query response time by 2 times
• Researched and evaluated AWS Sagemaker & Google Cloud Vertex AI for model training and deployment using
CICD improving training time by 20% and fixed underlying issues"""
job_description = """What You’ll Do:

As a key member of the Data team, the Senior Data Engineer will have the autonomy to design, build, and deploy data pipelines using our Databricks platform
Design, build, and maintain data pipelines to ingest batch and streaming data in production environments
Build QA and monitoring tooling on top of the pipelines to minimize bugs
This role offers the opportunity to tackle crucial business problems in a dynamic, fast-paced team, where your ability to deliver with minimal oversight will be highly valued


What You Bring:

5+ years of experience working as a Data Engineer
Prior experience in building ELT data pipelines in the Databricks platform
Experience with: SQL, pySpark, Python
Adhere to simple, maintainable code and cut complexity whenever possible


You’ll Stand Out With:

Prior experience working in a startup environment
Prior experience working in Business Intelligence or Data Analytics
Familiarity with AWS infrastructure and building CICD pipelines
Prior experience with backend API’s using Flask or Django"""

similarity_score = compare_resume_to_job_description(resume, job_description)
print(f"Similarity Score: {similarity_score:.4f}")

Similarity Score: 0.7104


In [36]:
def extract_keywords(text):
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
    X = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    return set(keywords)

def replace_similar_phrases(resume, job_description):
    job_keywords = extract_keywords(job_description)
    resume_keywords = extract_keywords(resume)

    # Identify phrases in the job description that can replace those in the resume
    updated_resume = resume
    for keyword in job_keywords:
        if keyword in resume_keywords:
            # Find similar phrases in resume and replace them
            pattern = re.compile(r'\b{}\b'.format(re.escape(keyword)), re.IGNORECASE)
            updated_resume = pattern.sub(keyword, updated_resume)

    return updated_resume

def compare_resume_to_job_description(resume, job_description):
    encodings = roberta_tokenizer(resume, job_description, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        similarity_score = cosine_similarity(logits.cpu().numpy(), logits.cpu().numpy())

    keyword_similarity = calculate_keyword_similarity(resume, job_description)
    
    combined_similarity_score = 0.7 * similarity_score[0][0] + 0.3 * keyword_similarity
    return combined_similarity_score

def calculate_keyword_similarity(resume, job_description):
    resume_keywords = extract_keywords(resume)
    job_keywords = extract_keywords(job_description)
    common_keywords = resume_keywords.intersection(job_keywords)
    total_keywords = resume_keywords.union(job_keywords)
    return len(common_keywords) / len(total_keywords) if total_keywords else 0

# Example usage
resume = """Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and implemented Google Cloud Spanner, improving database query response time by 2 times
• Researched and evaluated AWS Sagemaker & Google Cloud Vertex AI for model training and deployment using
CICD improving training time by 20% and fixed underlying issues"""
job_description = """What You’ll Do:

As a key member of the Data team, the Senior Data Engineer will have the autonomy to design, build, and deploy data pipelines using our Databricks platform
Design, build, and maintain data pipelines to ingest batch and streaming data in production environments
Build QA and monitoring tooling on top of the pipelines to minimize bugs
This role offers the opportunity to tackle crucial business problems in a dynamic, fast-paced team, where your ability to deliver with minimal oversight will be highly valued


What You Bring:

5+ years of experience working as a Data Engineer
Prior experience in building ELT data pipelines in the Databricks platform
Experience with: SQL, pySpark, Python
Adhere to simple, maintainable code and cut complexity whenever possible


You’ll Stand Out With:

Prior experience working in a startup environment
Prior experience working in Business Intelligence or Data Analytics
Familiarity with AWS infrastructure and building CICD pipelines
Prior experience with backend API’s using Flask or Django"""

# Update resume
updated_resume = replace_similar_phrases(resume, job_description)

# Calculate similarity score with updated resume
similarity_score = compare_resume_to_job_description(updated_resume, job_description)
print(f"Updated Similarity Score: {similarity_score:.4f}")

Updated Similarity Score: 0.7046


In [37]:
updated_resume = update_resume_for_similarity(resume, job_description)

# Calculate similarity score with updated resume
similarity_score = compare_resume_to_job_description(updated_resume, job_description)
print(f"Updated Similarity Score: {similarity_score:.4f}")


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Keywords missing from resume: {'platform design', 'dynamic', 'intelligence', 'opportunity', 'll stand', 'crucial', 'offers', 'key', 'senior data', 'batch streaming', 'tooling', 'offers opportunity', 'using databricks', 'paced team', 'opportunity tackle', 'python', 'django', 'data analytics', 'environment prior', 'minimal oversight', 'engineer autonomy', 'sql', 'll', 'deploy', 'maintainable code', 'python adhere', 'build qa', 'ability deliver', 'working startup', 'tackle', 'flask django', 'bugs role', 'experience', 'aws infrastructure', 'prior experience', 'deliver', 'build deploy', 'tooling pipelines', 'oversight', 'ability', 'problems', 'senior', 'design build', 'sql pyspark', 'cicd pipelines', 'backend', 'simple', 'building', 'pipelines', 'production', 'production environments', 'experience building', 'design', 'engineer prior', 'team ability', 'monitoring', 'experience working', 'cut', 'experience backend', 'deliver minimal', 'qa', 'problems dynamic', 'role offers', 'environments bu

In [38]:
print(updated_resume)

Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and 

In [42]:
def extract_keywords(text):
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
    X = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    return set(keywords)

def replace_similar_phrases(resume, job_description):
    job_keywords = extract_keywords(job_description)
    resume_keywords = extract_keywords(resume)

    # Identify phrases in the job description that can replace those in the resume
    updated_resume = resume
    for keyword in job_keywords:
        if keyword in resume_keywords:
            # Find similar phrases in resume and replace them
            pattern = re.compile(r'\b{}\b'.format(re.escape(keyword)), re.IGNORECASE)
            updated_resume = pattern.sub(keyword, updated_resume)

    return updated_resume

def compare_resume_to_job_description(resume, job_description):
    encodings = roberta_tokenizer(resume, job_description, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        similarity_score = cosine_similarity(logits.cpu().numpy(), logits.cpu().numpy())

    keyword_similarity = calculate_keyword_similarity(resume, job_description)
    
    combined_similarity_score = 0.7 * similarity_score[0][0] + 0.3 * keyword_similarity
    return combined_similarity_score

def calculate_keyword_similarity(resume, job_description):
    resume_keywords = extract_keywords(resume)
    job_keywords = extract_keywords(job_description)
    common_keywords = resume_keywords.intersection(job_keywords)
    total_keywords = resume_keywords.union(job_keywords)
    return len(common_keywords) / len(total_keywords) if total_keywords else 0

# Example usage
resume = """Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and implemented Google Cloud Spanner, improving database query response time by 2 times
• Researched and evaluated AWS Sagemaker & Google Cloud Vertex AI for model training and deployment using
CICD improving training time by 20% and fixed underlying issues"""
job_description = """What You’ll Do:

As a key member of the Data team, the Senior Data Engineer will have the autonomy to design, build, and deploy data pipelines using our Databricks platform
Design, build, and maintain data pipelines to ingest batch and streaming data in production environments
Build QA and monitoring tooling on top of the pipelines to minimize bugs
This role offers the opportunity to tackle crucial business problems in a dynamic, fast-paced team, where your ability to deliver with minimal oversight will be highly valued


What You Bring:

5+ years of experience working as a Data Engineer
Prior experience in building ELT data pipelines in the Databricks platform
Experience with: SQL, pySpark, Python
Adhere to simple, maintainable code and cut complexity whenever possible


You’ll Stand Out With:

Prior experience working in a startup environment
Prior experience working in Business Intelligence or Data Analytics
Familiarity with AWS infrastructure and building CICD pipelines
Prior experience with backend API’s using Flask or Django"""

# Update resume
updated_resume = replace_similar_phrases(resume, job_description)

# Calculate similarity score with updated resume
similarity_score = compare_resume_to_job_description(updated_resume, job_description)
print(f"Updated Similarity Score: {similarity_score:.4f}")

Updated Similarity Score: 0.7046


In [40]:
print(updated_resume)

Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an api Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting api products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and 

In [41]:
print(resume)

Developed a custom end to end data ETL ingestion pipeline to transfer 50 million medical images from customer
servers onto a data lake residing in Google Cloud Data Lake
• Integrated the data lake using Google Cloud BigQuery,PostgreSQL and Elasticsearch with data visualization
tools like Tableau and Power BI to generate multiple interactive dashboards
• Created deep learning models using TensorFlow and Keras to detect hemmorages in a set of 1 million images
• Led a cross-functional team of 6 to implement a POC on APIGEE as an API Gateway option and later proposed
APIGEE to be an effective solution to the stakeholders for hosting API products and also to monetize APIs
• Designed a processing job using Hadoop,Spark,Kafka and GCP Dataflow to run through about a 100 million
images in the datalake to make certain changes in the pipeline and automated running the job.
• Analyzed Google Cloud Spanner, Aurora, and MongoDB as potential databases for scalability and performance;
recommended and 