In [None]:
pip install tf-keras

In [2]:
!pip install --upgrade transformers
import transformers

Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.2


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Recovering relevant documents from mixed data

### *Importing Libraries*

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, TFBertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

### *Data*
Loading data files and a list of publicly available english stopwords

In [5]:

healthcare_only = pd.read_csv('/content/drive/MyDrive/Text Classification/Data/healthcare_only.csv')
half_healthcare = pd.read_csv('/content/drive/MyDrive/Text Classification/Data/half_healthcare.csv')

with open('/content/drive/MyDrive/Text Classification/Data/stop_words_english.txt', 'r') as file:
    custom_stopwords = pd.read_csv(file,sep=',',header=None)[0].tolist()



### *Preprocessing*



In [6]:
## Function to remove stopwords
def remove_stopwords(text, custom_stopwords):
    # Split the text into words
    words = text.split()

    # Remove stopwords
    filtered_words = [word for word in words if word not in custom_stopwords]

    # Join the filtered words back into a string
    filtered_text = " ".join(filtered_words)

    return filtered_text

## Apply the remove_stopwords function to healthcare_only data and half_healthcare data

corpus = ' '.join(healthcare_only['text'])
new_corpus = remove_stopwords(corpus, custom_stopwords)
new_row = pd.DataFrame({'text': [new_corpus]})

healthcare = healthcare_only.copy()
healthcare['text'] = healthcare['text'].apply(lambda x: remove_stopwords(x, custom_stopwords))


half_healthcare_cleaned = half_healthcare.copy()  # Create a copy of the original DataFrame
half_healthcare_cleaned['text'] = half_healthcare_cleaned['text'].apply(lambda x: remove_stopwords(x, custom_stopwords))

mod_healthcare = pd.concat([half_healthcare_cleaned, new_row])

### *Healthcare and non-healthcare data detection*
Vectorized the sentences in each row.\
Calculated similarity score between row with healthcare terms and all the rows from half_healthcare.csv\
Set the meadian of similarity score as threshold to separate healthcare and non-healthcare rows

In [7]:
### Text Vectorization using TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_combined = vectorizer.fit_transform(mod_healthcare['text'])

## Finding Similarity score

similarity = tfidf_combined[-1] * tfidf_combined.T
similarity = similarity.A
similarity[0,:-1]

##finding the threshold value

healthcare_threshold = round(np.median(similarity[0,:-1]),3)

## Finding the non_healthcare related text from half_healthcare

non_similar_indices = (similarity[0,:-1] < healthcare_threshold).nonzero()[0]
non_healthcare_rows = half_healthcare.iloc[non_similar_indices]
non_healthcare = non_healthcare_rows.copy()


### *Training data preparation*
To balance the data, only used non_healthcare data (4200 samples idnetified in previous step) and the data from healthcaare_only.csv (4400 samples)

In [8]:
## Labeling and preparing data for training

non_healthcare['label'] = 0 # 0 for non_healthcare
healthcare['label'] = 1 # 1 for healthcare

training_data = pd.concat([healthcare, non_healthcare], ignore_index=True)
training_data

Unnamed: 0,text,doc_id,label
0,step process indispensable collection processi...,c5eb0d4d-cd4f-4a7b-bab6-3b249dbed039,1
1,pain living life 25 chronic pain caused sacroi...,26397052-5dd3-4176-b358-7a4469c6ee9b,1
2,attention details creates warm relaxed environ...,90c6a568-bb5c-4cc2-ac87-c7ee3d95040b,1
3,health wide web resources wahiawa nursing reha...,6b202053-674f-43bd-9090-1ac34f887315,1
4,delaware outpatient center surgeryis specially...,1588c4f1-00ba-473a-82d7-c09ab73564f9,1
...,...,...,...
8676,behind one of the largest animation studios i...,dc9aa2bb-ae1d-4e44-be5e-964515aa2bb2,0
8677,want to expand your expertise interested in th...,25ea7e18-1e06-4769-8b4f-8c0d445afe6e,0
8678,tolmar is fully integrated company focused on ...,e4474746-e8cc-4138-b62f-ba4ec6f43516,0
8679,tour our new coworking office expansion member...,bde19455-0080-4c9c-967b-72f6f536997b,0


In [9]:
## check GPU availability

device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
    print("Found GPU at: {}".format(device_name))
else:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))

Found GPU at: /device:GPU:0


### *Model finetuning*
Finetuned the pretrained 'DistilBERT' model by traning for 3 epochs

In [10]:
### training code here

MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
BATCH_SIZE = 16
N_EPOCHS = 3

# Split the data into training and validation sets
train_data,test_data = train_test_split(training_data, test_size=0.2, shuffle = True, random_state=42)

# Load the pretrained DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize and encode the text data
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True)


train_dataset =  tf.data.Dataset.from_tensor_slices((dict(train_encodings), list(train_data['label'])))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),list(test_data['label'])))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/Text Classification/classification_model', save_best_only = True)

with tf.device(device_name):

  model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
  #chose the optimizer
  optim = tf.keras.optimizers.Adam(learning_rate=5e-5)
  #define the loss function
  losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  #build the model
  model.compile(optimizer=optim,
                loss=losss,
                metrics=['accuracy'])
  # train the model
  model.fit(train_dataset.batch(BATCH_SIZE),
            epochs=N_EPOCHS,
            batch_size=BATCH_SIZE)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


### *Model Evaluation*

In [12]:
#model evaluation on toy data
model.evaluate(test_dataset.batch(BATCH_SIZE),return_dict=True,batch_size=BATCH_SIZE)

#save the model
model.save_pretrained('/content/drive/MyDrive/Text Classification/classification_model')



In [13]:
# Load the saved model
loaded_model = TFDistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Text Classification/classification_model')
loaded_model.compile(optimizer=optim, loss=losss, metrics=['accuracy'])
loaded_model.evaluate(test_dataset.batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)

Some layers from the model checkpoint at /content/drive/MyDrive/Text Classification/classification_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Text Classification/classification_model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predict



{'loss': 0.004745154175907373, 'accuracy': 0.9994242787361145}

### *Function for inference/prediction*

In [14]:
def predictions(df):

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Text Classification/classification_model')
    batch_size = 16

    test_encodings = tokenizer(list(df['text']), truncation=True, padding=True)
    test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), list(df['Y']))).batch(batch_size)

    # Make predictions using the trained model
    predicted_logits = model.predict(test_dataset).logits

    predicted_labels = (tf.nn.softmax(predicted_logits, axis=-1)[:, 1] > 0.5).numpy()

    # Create a new dataframe with original text and predicted labels
    result_df = pd.DataFrame({'text': df['text'],'labels': predicted_labels.astype(int)})

    return result_df

In [None]:
!pip freeze > requirements.txt