<a href="https://colab.research.google.com/github/Pratik-Nikam/textsummarizer/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from ast import literal_eval
import pandas as pd
import numpy as np

In [7]:
arxiv_data = pd.read_csv(
    "https://github.com/soumik12345/multi-label-text-classification/releases/download/v0.2/arxiv_data.csv"
)
arxiv_data.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [8]:
arxiv_data_filtered = arxiv_data.groupby("terms").filter(lambda x: len(x) > 1)
arxiv_data_filtered.shape

(49985, 3)

In [9]:
arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply(
    lambda x: literal_eval(x)
)
arxiv_data_filtered["terms"].values[:5]

array([list(['cs.CV', 'cs.LG']), list(['cs.CV', 'cs.AI', 'cs.LG']),
       list(['cs.CV', 'cs.AI']), list(['cs.CV']),
       list(['cs.CV', 'cs.LG'])], dtype=object)

In [None]:

test_split = 0.1

# Initial train and test split.
train_df, test_df = train_test_split(
    arxiv_data_filtered,
    test_size=test_split,
    stratify=arxiv_data_filtered["terms"].values,
)

# Splitting the test set further into validation
# and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 44986
Number of rows in validation set: 2500
Number of rows in test set: 2499


In [None]:
terms = tf.ragged.constant(train_df["terms"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)


print("Vocabulary:\n")
print(vocab)

Vocabulary:

['[UNK]', 'cs.CV', 'cs.LG', 'stat.ML', 'cs.AI', 'eess.IV', 'cs.RO', 'cs.CL', 'cs.NE', 'cs.CR', 'cs.SI', 'math.OC', 'eess.SP', 'cs.GR', 'cs.MM', 'cs.IR', 'cs.SY', 'cs.MA', 'cs.HC', 'eess.SY', 'stat.AP', 'math.IT', 'cs.IT', 'cs.DC', 'cs.CY', 'q-bio.QM', 'eess.AS', 'stat.ME', 'stat.TH', 'math.ST', 'cs.SD', 'q-bio.NC', 'cs.DS', 'math.NA', 'cs.CG', 'I.2.6', 'physics.chem-ph', 'cs.NA', 'cs.SE', 'cs.NI', 'cs.GT', 'stat.CO', 'q-bio.BM', '68T45', 'cs.DB', 'physics.comp-ph', 'cs.LO', 'math.PR', 'cs.CE', 'cond-mat.dis-nn', 'cs.PL', 'q-fin.ST', 'physics.data-an', 'cond-mat.stat-mech', 'I.2.10', 'cs.AR', 'I.4.6', '68T05', 'math.DS', 'cs.DM', 'quant-ph', 'cs.PF', '68T07', 'I.2', 'q-bio.GN', 'physics.med-ph', 'physics.geo-ph', 'q-fin.TR', 'physics.soc-ph', 'cond-mat.mtrl-sci', 'math.AT', 'q-bio.TO', 'econ.EM', 'cs.CC', 'I.5.4', 'I.4', 'physics.optics', 'astro-ph.IM', '68T01', 'physics.ao-ph', 'q-fin.CP', 'q-bio.MN', '68U10', 'I.4.8', 'hep-ex', 'cs.SC', '68T10', '62H30', 'q-fin.PM', 'math

In [None]:
sample_label = train_df["terms"].iloc[0]
print(f"Original label: {sample_label}")

label_binarized = lookup([sample_label])
print(f"Label-binarized representation: {label_binarized}")

Original label: ['cs.CV']
Label-binarized representation: [[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
train_df["summaries"].apply(lambda x: len(x.split(" "))).describe()

Unnamed: 0,summaries
count,44986.0
mean,157.776797
std,41.280666
min,5.0
25%,129.0
50%,156.0
75%,184.0
max,462.0


In [None]:
max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE


def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["terms"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summaries"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

In [None]:
train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)


In [None]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label[0])}")
    print(" ")

Abstract: b'We introduce a novel learning-based, visibility-aware, surface reconstruction\nmethod for large-scale, defect-laden point clouds. Our approach can cope with\nthe scale and variety of point cloud defects encountered in real-life\nMulti-View Stereo (MVS) acquisitions. Our method relies on a 3D Delaunay\ntetrahedralization whose cells are classified as inside or outside the surface\nby a graph neural network and an energy model solvable with a graph cut. Our\nmodel, making use of both local geometric attributes and line-of-sight\nvisibility information, is able to learn a visibility model from a small amount\nof synthetic training data and generalizes to real-life acquisitions. Combining\nthe efficiency of deep learning methods and the scalability of energy based\nmodels, our approach outperforms both learning and non learning-based\nreconstruction algorithms on two publicly available reconstruction benchmarks.'
Label(s): ['cs.CV' 'cs.CG']
 
Abstract: b'We present a new model 

In [None]:
vocabulary = set()
train_df["summaries"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

157986


In [None]:
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)

# `TextVectorization` layer needs to be adapted as per the vocabulary from our
# training set.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
validation_dataset = validation_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
test_dataset = test_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)

In [None]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model

In [None]:
epochs = 1

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"]
)

history = shallow_mlp_model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs
)


[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m880s[0m 2s/step - binary_accuracy: 0.9730 - loss: 0.0594 - val_binary_accuracy: 0.9986 - val_loss: 0.0051


In [None]:
_, binary_acc = shallow_mlp_model.evaluate(test_dataset)
print(f"Categorical accuracy on the test set: {round(binary_acc * 100, 2)}%.")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - binary_accuracy: 0.9985 - loss: 0.0051
Categorical accuracy on the test set: 99.85%.


In [None]:
# Create a model for inference.
model_for_inference = keras.Sequential([text_vectorizer, shallow_mlp_model])

# Create a small dataset just for demoing inference.
inference_dataset = make_dataset(test_df.sample(100), is_train=False)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = model_for_inference.predict(text_batch)

# Perform inference.
for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label[0])}")
    predicted_proba = [proba for proba in predicted_probabilities[i]]
    top_3_labels = [
        x
        for _, x in sorted(
            zip(predicted_probabilities[i], lookup.get_vocabulary()),
            key=lambda pair: pair[0],
            reverse=True,
        )
    ][:3]
    print(f"Predicted Label(s): ({', '.join([label for label in top_3_labels])})")
    print(" ")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step
Abstract: b"LIME is a popular approach for explaining a black-box prediction through an\ninterpretable model that is trained on instances in the vicinity of the\npredicted instance. To generate these instances, LIME randomly selects a subset\nof the non-zero features of the predicted instance. After that, the perturbed\ninstances are fed into the black-box model to obtain labels for these, which\nare then used for training the interpretable model. In this study, we present a\nsystematic evaluation of the interpretable models that are output by LIME on\nthe two use-cases that were considered in the original paper introducing the\napproach; text classification and object detection. The investigation shows\nthat the perturbation and labeling phases result in both data and label shift.\nIn addition, we study the correlation between the shift and the fidelity of the\ninterpretable model and show that in certain cases t

In [None]:
# prompt: saving the entire model and reuse on text for lable prediction,  also text vectorization , lables etc

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Save the entire model
model_for_inference.save("arxiv_classification_model")

# Load the saved model
loaded_model = keras.models.load_model("arxiv_classification_model")

# Example usage for prediction
example_text = ["This is an example text about machine learning."]  # Replace with your text

# Preprocess the text (same as done during training)
predicted_probabilities = loaded_model.predict(example_text)


# Assuming you have the vocabulary from the original training as `vocab`
vocab = lookup.get_vocabulary()  # Make sure to load the vocabulary as well!


# Decode predictions to labels
def invert_multi_hot(encoded_labels, vocab):
    hot_indices = np.argwhere(encoded_labels >= 0.5)[..., 0]  # Adjust the threshold as needed
    return np.take(vocab, hot_indices)

predicted_labels = invert_multi_hot(predicted_probabilities[0], vocab)  # Get label predictions for the first example
print("Predicted Labels:", predicted_labels)



In [None]:
# prompt: save entire model, also save text vectorize and vocab ect

import tensorflow as tf
import numpy as np

# ... (Your existing code) ...

# Save the entire model
model_for_inference.save("arxiv_classification_model.keras")

# Save the text vectorizer
text_vectorizer.save("text_vectorizer")

# Save the vocabulary
np.save("vocab.npy", np.array(vocab))

# ... (rest of your code)


# Load the saved model
loaded_model = tf.keras.models.load_model("arxiv_classification_model")

# Load the text vectorizer
loaded_vectorizer = tf.keras.models.load_model("text_vectorizer")


# Load the vocabulary
loaded_vocab = np.load("vocab.npy", allow_pickle=True).tolist()

# Example usage for prediction (now using loaded components)

example_text = ["This is an example text about machine learning."]

# Preprocess using the loaded vectorizer
preprocessed_text = loaded_vectorizer(example_text)

predicted_probabilities = loaded_model.predict(preprocessed_text)

# Decode predictions using the loaded vocabulary
def invert_multi_hot(encoded_labels, vocab):
    hot_indices = np.argwhere(encoded_labels >= 0.5)[..., 0]
    return np.take(vocab, hot_indices)

predicted_labels = invert_multi_hot(predicted_probabilities[0], loaded_vocab)
print("Predicted Labels:", predicted_labels)


ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=arxiv_classification_model.

In [5]:
 df

NameError: name 'df' is not defined

In [10]:
train_df, test_df = train_test_split(arxiv_data_filtered, test_size=0.2, random_state=42, stratify=arxiv_data_filtered["terms"].values)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [11]:
train_df.head()

Unnamed: 0,titles,summaries,terms
13114,A Critical Overview of Privacy-Preserving Appr...,Cooperation between different data owners may ...,"[cs.LG, stat.ML]"
40191,SuperCoder: Program Learning Under Noisy Condi...,We propose a new method of program learning in...,[cs.LG]
33047,Glance and Focus: a Dynamic Approach to Reduci...,The accuracy of deep convolutional neural netw...,"[cs.CV, cs.AI, cs.LG]"
50658,Kernel Change-point Detection with Auxiliary D...,Detecting the emergence of abrupt property cha...,"[stat.ML, cs.LG]"
10313,Hierarchical Multimodal Transformer to Summari...,Although video summarization has achieved trem...,"[cs.CV, cs.AI]"


In [12]:
all_labels = set()
for labels in arxiv_data_filtered["terms"]:
    all_labels.update(labels)
all_labels = list(all_labels)
label_to_id = {label: i for i, label in enumerate(all_labels)}
id_to_label = {i: label for i, label in enumerate(all_labels)}
num_labels = len(all_labels)

In [15]:
all_labels[2]

'G.2.1; G.2.2'

In [16]:
def labels_to_ids(labels):
    return [label_to_id[label] for label in labels]

train_df["label_ids"] = train_df["terms"].apply(labels_to_ids)
val_df["label_ids"] = val_df["terms"].apply(labels_to_ids)
test_df["label_ids"] = test_df["terms"].apply(labels_to_ids)

In [20]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels, problem_type="multi_label_classification")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[["summaries", "label_ids"]])
val_dataset = Dataset.from_pandas(val_df[["summaries", "label_ids"]])
test_dataset = Dataset.from_pandas(test_df[["summaries", "label_ids"]])

In [31]:
import torch
def tokenize_function(examples):
    tokenized_summaries = tokenizer(examples["summaries"], padding="max_length", truncation=True, max_length=512)
    labels = []
    for label_ids in examples["label_ids"]:
        binary_label = [0] * num_labels
        for label_id in label_ids:
            binary_label[label_id] = 1
        labels.append(binary_label)
    tokenized_summaries["labels"] = [torch.tensor(label, dtype=torch.float32) for label in labels]
    return tokenized_summaries

In [32]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/39988 [00:00<?, ? examples/s]

Map:   0%|          | 0/4998 [00:00<?, ? examples/s]

Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

In [33]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [36]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

training_args = TrainingArguments(
    output_dir="bert_multi_label",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="logs"
)

def compute_metrics(p):
    pred_labels = np.round(p.predictions)
    true_labels = np.array(p.label_ids, dtype=np.float32).astype(np.float32)
    f1 = f1_score(true_labels, pred_labels, average="micro")
    precision = precision_score(true_labels, pred_labels, average="micro")
    recall = recall_score(true_labels, pred_labels, average="micro")
    return {"f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



In [1]:
trainer.train()

NameError: name 'trainer' is not defined