In [22]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the first dataset to get the column labels
first_df = pd.read_excel('Labeled_Data/scraped_articles_business_2_with_sentiment.xlsx')
columns = first_df.columns  # Extract column names

# Load all sheets and concatenate using the columns from the first dataset
sheets = ['Labeled_Data/scraped_articles_business_2_with_sentiment.xlsx', 'Labeled_Data/scraped_articles_business1_with_sentiment.xlsx'
          , 'Labeled_Data/scraped_articles_tech_with_sentiment.xlsx', 'Labeled_Data/scraped_articles_with_sentiment.xlsx']
dfs = [pd.read_excel(sheet, names=columns) for sheet in sheets]  # Use the same column names
df = pd.concat(dfs, ignore_index=True)
print(df.tail())

                                                  Title  \
1006  NZ will be ready to go from first ball: Daryl ...   
1007  India shouldn't send team for CWG: Ex-coach Vi...   
1008  Perth Scorchers sign pacer from Indonesia for ...   
1009  Netflix shuts down AAA game development studio...   
1010  Pep Guardiola comments on Kevin de Bruyne's re...   

                                                   Data   class sentiment  
1006  New Zealand all-rounder Daryl Mitchell said th...  sports  positive  
1007  Ex-India badminton coach Vimal Kumar criticise...  sports  negative  
1008  Perth Scorchers have signed Indonesia seamer N...  sports  positive  
1009  AAA game development studio, Team Blue by Netf...  sports  positive  
1010  Head coach Pep Guardiola spoke at the pre-matc...  sports  positive  


In [23]:
from sklearn.utils import resample
import pandas as pd

positive = df[df['sentiment'] == 'positive']
negative = df[df['sentiment'] == 'negative']
neutral = df[df['sentiment'] == 'neutral']

negative_upsampled = resample(negative, replace=True, n_samples=len(positive), random_state=42)
neutral_upsampled = resample(neutral, replace=True, n_samples=len(positive), random_state=42)

df_upsampled = pd.concat([positive, negative_upsampled, neutral_upsampled])

print(df_upsampled['sentiment'].value_counts())


sentiment
positive    629
negative    629
neutral     629
Name: count, dtype: int64


In [24]:
df_upsampled.tail(10)

Unnamed: 0,Title,Data,class,sentiment
57,Eni to sell 25% stake in biofuel unit to KKR,Italian group will use investment to help fund...,stock business,neutral
449,SpaceX wins $733 million launch contract from ...,Elon Musk-led SpaceX has won a $733 million co...,technology,neutral
483,Elon Musk donates $75 million to pro-Trump gro...,Billionaire Elon Musk donated around $75 milli...,technology,neutral
465,Netflix sees 35% QoQ jump in ads membership; Q...,Netflix has posted 35% quarter-on-quarter jump...,technology,neutral
668,Production at Tata's iPhone plant in Tamil Nad...,Tata Electronics has indefinitely suspended pr...,technology,neutral
889,No more Bazball: Rizwan teases Brook for playi...,Pakistan wicketkeeper Muhammad Rizwan teased E...,sports,neutral
112,Emerging markets are having a moment,US interest rate cuts spur reassessment of ass...,stock business,neutral
10,Politics is distorting economic data,Partisanship continues to pollute results of i...,stock business,neutral
423,Jellysmack laying off employees amid reorganis...,"Jellysmack, a SoftBank-backed creator-economy ...",technology,neutral
685,"Batted at 8 in IPL 2024 to give Jadeja, Dube c...",Discussing his decision of batting at eight in...,sports,neutral


In [25]:
df_upsampled['combined_text'] = df_upsampled['Title'] + ': ' + df_upsampled['Data']

In [26]:
df_upsampled.tail(10)

Unnamed: 0,Title,Data,class,sentiment,combined_text
57,Eni to sell 25% stake in biofuel unit to KKR,Italian group will use investment to help fund...,stock business,neutral,Eni to sell 25% stake in biofuel unit to KKR: ...
449,SpaceX wins $733 million launch contract from ...,Elon Musk-led SpaceX has won a $733 million co...,technology,neutral,SpaceX wins $733 million launch contract from ...
483,Elon Musk donates $75 million to pro-Trump gro...,Billionaire Elon Musk donated around $75 milli...,technology,neutral,Elon Musk donates $75 million to pro-Trump gro...
465,Netflix sees 35% QoQ jump in ads membership; Q...,Netflix has posted 35% quarter-on-quarter jump...,technology,neutral,Netflix sees 35% QoQ jump in ads membership; Q...
668,Production at Tata's iPhone plant in Tamil Nad...,Tata Electronics has indefinitely suspended pr...,technology,neutral,Production at Tata's iPhone plant in Tamil Nad...
889,No more Bazball: Rizwan teases Brook for playi...,Pakistan wicketkeeper Muhammad Rizwan teased E...,sports,neutral,No more Bazball: Rizwan teases Brook for playi...
112,Emerging markets are having a moment,US interest rate cuts spur reassessment of ass...,stock business,neutral,Emerging markets are having a moment: US inter...
10,Politics is distorting economic data,Partisanship continues to pollute results of i...,stock business,neutral,Politics is distorting economic data: Partisan...
423,Jellysmack laying off employees amid reorganis...,"Jellysmack, a SoftBank-backed creator-economy ...",technology,neutral,Jellysmack laying off employees amid reorganis...
685,"Batted at 8 in IPL 2024 to give Jadeja, Dube c...",Discussing his decision of batting at eight in...,sports,neutral,"Batted at 8 in IPL 2024 to give Jadeja, Dube c..."


In [24]:
! pip install transformers tensorflow datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  pid, fd = os.forkpty()




In [27]:
from datasets import Dataset

data = {
    'combined_text': df_upsampled['combined_text'],  # Use your combined text column
    'label': df_upsampled['class']  # Replace with your actual labels column
}

label_mapping = {
    'sports': 0,
    'technology': 1,
    'stock business': 2
    # Add more mappings if you have more classes
}

# Convert the labels
data['label'] = data['label'].map(label_mapping)


# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(pd.DataFrame(data))
dataset = dataset.train_test_split(test_size=0.2)

In [28]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [4]:
! pip install transformers peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.1.0-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
Downloading accelerate-1.1.0-py3-none-any.whl (333 kB)
Installing collected packages: accelerate, peft
Successfully installed accelerate-1.1.0 peft-0.13.2


In [37]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [38]:
def tokenize_function(examples):
    return tokenizer(examples["combined_text"], padding="max_length", truncation=True)

# Apply tokenization
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for TensorFlow
tokenized_train_dataset.set_format(type='tensorflow', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test_dataset.set_format(type='tensorflow', columns=['input_ids', 'attention_mask', 'label'])


train_tf_dataset = tokenized_train_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=['label'],
    shuffle=True,
    batch_size=8
)

test_tf_dataset = tokenized_test_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=['label'],
    shuffle=False,
    batch_size=8
)

Map:   0%|          | 0/1509 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

In [39]:
import tensorflow as tf

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])




In [41]:
model.fit(train_tf_dataset, epochs=3, validation_data=test_tf_dataset)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2f6a58110>

In [42]:
# Evaluate the model
"""loss, accuracy = model.evaluate(test_tf_dataset)
print(f"Test Accuracy: {accuracy}")
"""
# Get predictions
predictions = model.predict(test_tf_dataset)



Test Accuracy: 0.9523809552192688


NameError: name 'np' is not defined

In [43]:
import numpy as np
predicted_labels = np.argmax(predictions.logits, axis=1)

In [47]:
from sklearn.metrics import classification_report, confusion_matrix
# Get true labels from the test dataset
true_labels = np.concatenate([y for _, y in test_tf_dataset], axis=0)

# Print classification report
print(classification_report(true_labels, predicted_labels, target_names=['sports', 'technology', 'stock business']))
print(confusion_matrix(true_labels, predicted_labels))


                precision    recall  f1-score   support

        sports       1.00      1.00      1.00       103
    technology       0.94      0.94      0.94       150
stock business       0.93      0.93      0.93       125

      accuracy                           0.95       378
     macro avg       0.96      0.96      0.96       378
  weighted avg       0.95      0.95      0.95       378

[[103   0   0]
 [  0 141   9]
 [  0   9 116]]


In [53]:
# Save the model and tokenizer
model.save_pretrained('ArticleTag')
tokenizer.save_pretrained('ArticleTag')


('ArticleTag/tokenizer_config.json',
 'ArticleTag/special_tokens_map.json',
 'ArticleTag/vocab.txt',
 'ArticleTag/added_tokens.json',
 'ArticleTag/tokenizer.json')

In [1]:
!pwd

/Users/reetvikchatterjee/Desktop/ArticleTag


In [21]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Layer
from tensorflow.keras.models import Model

# Load the transformer model
model = TFAutoModel.from_pretrained('/Users/reetvikchatterjee/Desktop/ArticleTagModel1/')

# Define a custom layer to handle transformer model call
class TransformerLayer(Layer):
    def __init__(self, transformer_model, **kwargs):
        super().__init__(**kwargs)
        self.transformer_model = transformer_model

    def call(self, inputs):
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        transformer_output = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)
        return transformer_output.last_hidden_state

# Define a custom layer for mean pooling
class MeanPoolingLayer(Layer):
    def call(self, inputs):
        return tf.reduce_mean(inputs, axis=1)

# Input layers
input_ids = Input(shape=(None,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(None,), dtype=tf.int32, name="attention_mask")

# Pass inputs through transformer layer
transformer_layer = TransformerLayer(model)
hidden_state = transformer_layer({"input_ids": input_ids, "attention_mask": attention_mask})

# Pooling layer
pooled_output = MeanPoolingLayer()(hidden_state)

# Add a dense layer for classification
output = Dense(3, activation='softmax')(pooled_output)

# Define the full model
classification_model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model
classification_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Example usage
text = "I love business"
tokenizer = AutoTokenizer.from_pretrained('/Users/reetvikchatterjee/Desktop/ArticleTagModel1/')
inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True)

# Get prediction
predictions = classification_model.predict({"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
print(predictions)


Some layers from the model checkpoint at /Users/reetvikchatterjee/Desktop/ArticleTagModel1/ were not used when initializing TFDistilBertModel: ['dropout_79', 'pre_classifier', 'classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at /Users/reetvikchatterjee/Desktop/ArticleTagModel1/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496ms/step
[[0.48139653 0.33398494 0.18461849]]


In [11]:
# Example text input for testing
text = "I love cricket"

# Tokenize the input
inputs = tokenizer(text, return_tensors="tf")

# Get predictions
outputs = model(**inputs)
print(outputs)


TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 5, 768), dtype=float32, numpy=
array([[[-0.5596844 , -0.27533853, -0.32854778, ..., -0.15202664,
          0.28427947,  0.44161305],
        [ 0.03353662,  0.19535089, -0.13418275, ...,  0.00889991,
          0.31571028,  0.46481463],
        [ 0.99417853,  0.40234765,  0.75472903, ..., -0.43293124,
          0.42149183,  0.19452971],
        [-0.26069355, -0.16714583, -0.46940464, ...,  0.57975894,
          0.3092675 , -0.8663386 ],
        [ 0.83990693,  0.3026492 , -0.56095344, ...,  0.37157673,
         -0.03180595, -0.3456197 ]]], dtype=float32)>, hidden_states=None, attentions=None)


In [9]:
! pip install tf-keras



Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Obtaining dependency information for tf-keras from https://files.pythonhosted.org/packages/8a/ed/e08afca471299b04a34cd548e64e89d0153eda0e6cf9b715356777e24774/tf_keras-2.18.0-py3-none-any.whl.metadata
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Obtaining dependency information for keras>=3.5.0 from https://files.pythonhosted.org/packages/c2/88/eef50051a772dcb4433d1f3e4c1d6576ba450fe83e89d028d7e8b85a2122/keras-3.6.0-py3-none-any.whl.metadata
  Using cached keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached keras-3.6.0-py3-none-any.whl (1.2 MB)
Installing collected packages: keras, tf-keras
  Attempting u

In [5]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [36]:
! pip install ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Obtaining dependency information for ipywidgets from https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl.metadata
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Obtaining dependency information for widgetsnbextension~=4.0.12 from https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl.metadata
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Obtaining dependency information for jupyterlab-widgets~=3.0.12 from https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none

In [1]:
from huggingface_hub import login
login()  


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
!pip install transformers peft huggingface_hub



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/reetvikchatterjee/.cache/huggingface/token
Login successful


In [18]:
from huggingface_hub import login

hf_token = '***'
login(hf_token)
# Load PEFT configuration
peft_config = PeftConfig.from_pretrained("YaminiP/llama3.2-finetuned-newsclassify")

# Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B")

# Load the PEFT model
model = PeftModel.from_pretrained(base_model, "YaminiP/llama3.2-finetuned-newsclassify")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/reetvikchatterjee/.cache/huggingface/token
Login successful


OSError: You are trying to access a gated repo.
Make sure to request access at https://huggingface.co/meta-llama/Llama-3.2-1B and pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`.