# AI 2024 Online Summer Internship
### Name: Rasikh Ali
### Email: rasikhali1234@gmail.com

<div class="alert alert-block alert-info">
    <h1> Libraries </h1>
</div>

In [1]:
!pip install transformers --quiet
!pip install accelerate --quiet
!pip install bitsandbytes --quiet
!pip install langchain --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.6.1 requires cubinlinker, which is not installed.
cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.6.1 requires ptxcompiler, which is not installed.
cuml 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-cv 0.9.0 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.
cudf 24.6.1 requires cuda-python<12.0a0,>=11.7.1, but you ha

In [2]:
!pip install langchain-community langchain-core --quiet

In [3]:
# General packages
import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from textwrap import fill
from IPython.display import Markdown, display # for formating Python display folowing markdown language
import warnings
warnings.filterwarnings('ignore') # avoid warning messages importing packages

In [4]:
# Mistral and LangChain packages (prompt engineering)
import torch
from langchain import PromptTemplate, HuggingFacePipeline
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

2024-08-12 12:16:49.608632: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 12:16:49.608730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 12:16:49.741768: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<div class="alert alert-block alert-info">
    <h1> Loading Model </h1>
</div>

In [5]:
# Model version of Mistral
MODEL_NAME = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Quantization is a technique used to reduce the memory and computation requirements 
# of deep learning models, typically by using fewer bits, 4 bits
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Initialization of a tokenizer for the Mistral-7b model, 
# necessary to preprocess text data for input
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Initialization of the pre-trained language Mistral-7b
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config
)

# Configuration of some generation-related settings
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 20 # maximum number of new tokens that can be generated by the model
generation_config.temperature = 0.7 # randomness of the generated tex
generation_config.top_p = 0.95 # diversity of the generated text
generation_config.do_sample = True # sampling during the generation process
generation_config.repetition_penalty = 1.15 # the degree to which the model should avoid repeating tokens in the generated text

# A pipeline is an object that works as an API for calling the model
# The pipeline is made of (1) the tokenizer instance, the model instance, and
# some post-procesing settings. Here, it's configured to return full-text outputs
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# HuggingFace pipeline
llm = HuggingFacePipeline(pipeline=pipe)

  warn_deprecated(


<div class="alert alert-block alert-info">
    <h1> Loading Dataset </h1>
</div>

In [9]:
import pandas as pd
import csv
data= pd.read_csv('/kaggle/input/assingment-5-abstract-multi-label-dataset/Test_Sample.csv')
data_t = data.copy()
#Extract text from dataset
test_texts = data_t['ABSTRACT'].tolist()

#Extract actual_values for abstract
test_labels = data_t.drop(columns=['TITLE', 'ID', 'ABSTRACT']).values

<div class="alert alert-block alert-warning">
    <h1> Defining Labels </h1>
</div>

In [7]:
import re
# Define the abstract labels
abstract_labels = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]

<div class="alert alert-block alert-warning">
    <h2> Define function to clean the text </h2>
</div>

In [8]:
def extract_abstract(text):
    """Extracts text outside [INST] and [/INST] tags and checks for abstract labels."""
    last_inst_index = text.rfind("[/INST]")
    if last_inst_index != -1:
        my_text = text[last_inst_index + len("[/INST]") :].lower().strip()
        # Normalize text for easier matching
        response_text = re.sub(r'[\[\]\.\'\"\,]', '', my_text)  # remove certain characters
        response_text = re.sub(r'\s+', ' ', response_text)  # normalize spaces
        detected_abstract = response_text.split(" ")
        return [1 if abstract.lower() in detected_abstract else 0 for abstract in abstract_labels]
    else:
        return [0] * len(abstract_labels)

<div class="alert alert-block alert-info">
    <h1> Testing Phase </h1>
</div>

In [10]:
predictions = []
for text in test_texts:
    #text = "i want to go out  of this house"
    # In English
    query = f"""[INST]You are an expert who needs to assess and detect abstract labels present in the given text.
            Please identify and list up to four prominent abstract labels present in the provided English text.
            Only consider the following abstract labels:

            Computer Science
            Physics
            Mathematics
            Statistics
            Quantitative Biology
            Quantitative Finance

            If no abstracts are detected or the text is unclear, label as: ['neutral'].

            Your answer should be in the form of a Python list containing only the above-mentioned abstract labels.
            Respond with just the labels in list format and nothing else.
            Text: {text}
            Abstract: 
            .[/INST] """

    result = llm(query)
#     display(Markdown(f"<b>{query.removeprefix('[INST]').removesuffix('[/INST]')}</b>"))
#     display(Markdown(f"<p>{result}</p>"))
    r_label = extract_abstract(result)
    print("****", r_label)
    predictions.append(r_label)

  warn_deprecated(


**** [0, 0, 1, 1, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 0, 1, 0, 0, 0]
**** [0, 0, 1, 1, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 1, 1, 1, 0, 0]
**** [0, 0, 1, 1, 0, 0]
**** [0, 0, 1, 1, 0, 0]
**** [0, 1, 1, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 1, 0, 0, 0]
**** [0, 0, 0, 1, 0, 0]


In [11]:
for i in range(20):
    print(test_labels[i],"\n",predictions[i], "\n\n")
#print(test_labels, "\n\n", predictions)

[0 0 1 1 0 0] 
 [0, 0, 1, 1, 0, 0] 


[0 1 1 0 0 0] 
 [0, 1, 1, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 1, 1, 0, 0, 0] 


[0 0 1 0 0 0] 
 [0, 0, 1, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 1, 1, 0, 0] 


[1 0 1 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 1, 1, 0, 0, 0] 


[1 0 1 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 1, 1, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 0 1 0 0 0] 
 [0, 1, 1, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 1, 1, 1, 0, 0] 


[0 0 1 0 0 0] 
 [0, 0, 1, 1, 0, 0] 


[0 0 1 1 0 0] 
 [0, 0, 1, 1, 0, 0] 


[0 1 0 0 0 0] 
 [0, 1, 1, 0, 0, 0] 


[1 0 1 1 0 0] 
 [0, 0, 0, 0, 0, 0] 




<div class="alert alert-block alert-warning">
    <h1> Processing Results </h1>
</div>

In [12]:
# Check the lengths of test_labels and predictions
print(f"Length of test_labels: {len(test_labels)}")
print(f"Length of predictions: {len(predictions)}")

Length of test_labels: 23
Length of predictions: 23


In [13]:
# If lengths are inconsistent, fix them
if len(test_labels) != len(predictions):
    # Align the arrays by trimming the longer array
    min_length = min(len(test_labels), len(predictions))
    test_labels = test_labels[:min_length]
    predictions = predictions[:min_length]
    
# Check the lengths of test_labels and predictions
print(f"Length of test_labels: {len(test_labels)}")
print(f"Length of predictions: {len(predictions)}")

Length of test_labels: 23
Length of predictions: 23


<div class="alert alert-block alert-success">
    <h1> Displaying Accuracy </h1>
</div>

In [14]:
from sklearn.metrics import f1_score, precision_score, recall_score
# Compute evaluation metrics
f1_micro = f1_score(test_labels, predictions, average='micro')
f1_macro = f1_score(test_labels, predictions, average='macro')
precision_micro = precision_score(test_labels, predictions, average='micro')
precision_macro = precision_score(test_labels, predictions, average='macro')
recall_micro = recall_score(test_labels, predictions, average='micro')
recall_macro = recall_score(test_labels, predictions, average='macro')

# Print the evaluation metrics
print(f"F1 Micro: {f1_micro}")
print(f"F1 Macro: {f1_macro}")
print(f"Precision Micro: {precision_micro}")
print(f"Precision Macro: {precision_macro}")
print(f"Recall Micro: {recall_micro}")
print(f"Recall Macro: {recall_macro}")

F1 Micro: 0.43750000000000006
F1 Macro: 0.26795199685225257
Precision Micro: 0.4827586206896552
Precision Macro: 0.2547008547008547
Recall Micro: 0.4
Recall Macro: 0.29047619047619044
