In [None]:
from google.colab import drive

# Mount Google Drive so later cells can write the exported models under /content/drive.
drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install datasets

In [None]:
from transformers import TFAutoModel,AutoTokenizer
from datasets import load_dataset, Dataset ,DatasetDict
from sklearn.model_selection import train_test_split
import tensorflow as tf


In [None]:
# Pretrained BERT encoder (TensorFlow weights); it is wrapped by the
# classification model defined further down.
model = TFAutoModel.from_pretrained("bert-base-uncased")

In [None]:
# Tokenizer for the same "bert-base-uncased" checkpoint as the encoder.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Load the news-article categorization dataset by its Hugging Face Hub id.
category = load_dataset("valurank/News_Articles_Categorization")

In [None]:
# Display the available splits, their columns and row counts.
category

DatasetDict({
    train: Dataset({
        features: ['Text', 'Category'],
        num_rows: 3722
    })
})

In [None]:
# Category names in class-id order: the position of a name in this tuple is
# the integer id it is mapped to. Spelling and case are kept exactly as in the
# original mapping (note the lowercase 'science').
_CATEGORY_NAMES = (
    'World',
    'Entertainment',
    'science',
    'Health',
    'Business',
    'Sports',
    'Politics',
    'Tech',
)

# Maps category name -> integer class id.
label_map = {name: class_id for class_id, name in enumerate(_CATEGORY_NAMES)}

In [None]:
def convert_labels(example):
    """Attach an integer class id to one dataset example.

    Looks up ``example['Category']`` in the module-level ``label_map`` and
    stores the result under ``example['Category_int']``. Categories that are
    not in ``label_map`` are stored as ``-1``.

    Returns the same (mutated) ``example`` mapping.
    """
    category_name = example['Category']
    if category_name in label_map:
        example['Category_int'] = label_map[category_name]
    else:
        # Unknown category: keep the row but mark it with the sentinel -1.
        example['Category_int'] = -1
    return example



In [None]:
# Run convert_labels over every example so each row gains a `Category_int` column.
category = category.map(convert_labels)

Map:   0%|          | 0/3722 [00:00<?, ? examples/s]

In [None]:
# Display the dataset again to confirm the `Category_int` column was added.
category

DatasetDict({
    train: Dataset({
        features: ['Text', 'Category', 'Category_int'],
        num_rows: 3722
    })
})

In [None]:

# Show the last rows of the training split as a pandas DataFrame.
# (The original prompt asked for the head; the call below is .tail().)

category['train'].to_pandas().tail()


Unnamed: 0,Text,Category,Category_int
3717,"Credit...University of North CarolinaFeb. 3, 2...",Sports,5
3718,The vice president will preside on Wednesday w...,Politics,6
3719,Credit...Todd Heisler/The New York TimesNov. 1...,Health,3
3720,DealBook|I.P.O.s in U.S. End Weak Year With an...,Business,4
3721,An H.I.V. infection increases the odds of dyin...,Health,3


In [None]:
# Hold out 20% of the data. The intermediate result is deliberately NOT named
# `train_test_split`, so it does not shadow the sklearn function of that name
# imported at the top of the notebook.
train_holdout_split = category['train'].train_test_split(test_size=0.2, seed=42)

# Then, split the held-out 20% in half into validation and test sets
validation_test_split = train_holdout_split['test'].train_test_split(test_size=0.5, seed=42)

# Combine these splits into a DatasetDict
split_dataset = DatasetDict({
    'train': train_holdout_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})

In [None]:
# Display the train / validation / test splits and their row counts.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Category', 'Category_int'],
        num_rows: 2977
    })
    validation: Dataset({
        features: ['Text', 'Category', 'Category_int'],
        num_rows: 372
    })
    test: Dataset({
        features: ['Text', 'Category', 'Category_int'],
        num_rows: 373
    })
})

In [None]:
def tokenize(batch):
    """Tokenize the ``Text`` field of a batch with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded (``padding=True``)
    so that all sequences in the batch share one length.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=512)
    return tokenizer(batch["Text"], **tokenizer_options)

In [None]:
# batch_size=None hands each split to `tokenize` as a single batch, so every
# example within a split is padded to one common length.
category_encoded = split_dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/2977 [00:00<?, ? examples/s]

Map:   0%|          | 0/372 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

In [None]:
# Display the splits again; the tokenizer output columns should now be present.
category_encoded

DatasetDict({
    train: Dataset({
        features: ['Text', 'Category', 'Category_int', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2977
    })
    validation: Dataset({
        features: ['Text', 'Category', 'Category_int', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 372
    })
    test: Dataset({
        features: ['Text', 'Category', 'Category_int', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 373
    })
})

In [None]:
# Expose 'input_ids', 'attention_mask', 'token_type_ids' and 'Category_int'
# in TensorFlow format: accessing these columns now yields `tf.Tensor`s.
category_encoded.set_format('tf',
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'Category_int'])


def create_tf_dataset(split):
    """Build an unbatched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        split: one split of ``category_encoded`` whose 'input_ids',
            'attention_mask', 'token_type_ids' and 'Category_int' columns are
            in TensorFlow format.

    Returns:
        A dataset whose elements are ``(features_dict, label)``, where
        ``features_dict`` has the keys 'input_ids', 'attention_mask' and
        'token_type_ids'.
    """
    features = {
        name: split[name]
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    labels = split['Category_int']
    return tf.data.Dataset.from_tensor_slices((features, labels))


BATCH_SIZE = 8

# Create TensorFlow datasets.
# Shuffle BEFORE batching: shuffling after `.batch()` only reorders whole
# batches, so the same examples would stay grouped together in every epoch.
# The buffer covers the full training split so the shuffle is uniform.
train_dataset = create_tf_dataset(category_encoded['train'])
train_dataset = (
    train_dataset
    .shuffle(buffer_size=len(category_encoded['train']))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation data keeps its order; prefetch overlaps input preparation with
# model execution (this replaces the previous no-op identity `map`).
test_dataset = create_tf_dataset(category_encoded['test'])
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)








In [None]:
# Display the element spec (shapes and dtypes) of the batched training dataset.
train_dataset

<_ParallelMapDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
class BERTForClassification(tf.keras.Model):
    """BERT encoder followed by a softmax classification layer.

    Args:
        bert_model: a Hugging Face TensorFlow BERT model whose output exposes
            ``pooler_output``.
        num_classes: number of target classes (size of the softmax output).
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        # Softmax output: the layer emits class probabilities, which matches
        # the default (from_logits=False) loss used in compile() below.
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Return class probabilities for a batch.

        Args:
            inputs: mapping with the keys 'input_ids', 'attention_mask' and
                'token_type_ids'.
            training: forwarded to the BERT encoder so that its dropout is
                active while fitting and disabled for inference. Keras sets
                this automatically in fit()/evaluate()/predict(); it defaults
                to False for direct calls.

        Returns:
            Tensor of per-class probabilities, one row per input example.
        """
        outputs = self.bert(
            {
                'input_ids': inputs['input_ids'],
                'attention_mask': inputs['attention_mask'],
                'token_type_ids': inputs['token_type_ids'],
            },
            training=training,
        )
        x = outputs.pooler_output
        return self.fc(x)


classifier = BERTForClassification(model, num_classes=8)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)





In [None]:
# Fine-tune for three passes over the training data, keeping the Keras History object.
history = classifier.fit(x=train_dataset, epochs=3)

# Last expression of the cell: displays the evaluation result on the held-out test split.
classifier.evaluate(x=test_dataset)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.1403999775648117, 0.9517426490783691]

In [None]:


# Re-create the tokenizer for the same checkpoint the classifier was built on
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Example article to classify
text ="Will NASA’s Sunita Williams run out of food, oxygen till next year? Space agency explains NASA astronaut Suni Williams and Butch Wilmore will make it back from the International Space Station (ISS) in Elon Musk-owned SpaceX’s Crew Dragon spacecraft instead of the faulty Boeing Starliner."

# Encode the text as TensorFlow tensors, padded/truncated to exactly 512 tokens
inputs = tokenizer(
    text,
    return_tensors='tf',
    padding='max_length',
    truncation=True,
    max_length=512,
)

# Forward pass through the in-memory classifier (requires the training cells
# above to have been run in this kernel session)
predictions = classifier(inputs)

# Index of the highest-probability class for each input row
best_index = tf.argmax(predictions, axis=1)
predicted_class = best_index.numpy()
print(f'Predicted class: {predicted_class}')


  # 'World': 0,
  #   'Entertainment': 1,
  #   'science': 2,
  #   'Health': 3,
  #   'Business': 4,
  #   'Sports': 5,
  #   'Politics': 6,
  #   'Tech': 7



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



NameError: name 'classifier' is not defined

In [None]:
# Input signature for serving: one int32 tensor of shape (batch, 512) per
# model input, each named after its dictionary key.
_serving_input_spec = {
    input_name: tf.TensorSpec(shape=[None, 512], dtype=tf.int32, name=input_name)
    for input_name in ('input_ids', 'attention_mask', 'token_type_ids')
}


@tf.function(input_signature=[_serving_input_spec])
def serving_fn(inputs):
    """Serving entry point: run the classifier on a dict of int32 input tensors."""
    return classifier(inputs)


# Export the classifier as a SavedModel in Google Drive, with `serving_fn`
# registered as the default serving signature.
model_save_path = '/content/drive/My Drive/bloggy'  # Specify the path in Google Drive
tf.saved_model.save(
    classifier,
    model_save_path,
    signatures={'serving_default': serving_fn},
)

print(f"Model saved to {model_save_path}")





Model saved to /content/drive/My Drive/bloggy


In [None]:
# (Optional) Reload the exported SavedModel to verify it can be restored,
# and grab its default serving signature for inference.
model_save_path = '/content/drive/My Drive/bloggy'
loaded_model = tf.saved_model.load(export_dir=model_save_path)
loaded_infer = loaded_model.signatures['serving_default']

# The sample input text used to test the reloaded model is defined in a later cell.


In [None]:
import numpy as np

In [None]:
#text="My dog is sleeping"
#text="Millions of tiny space rock fragments may be on a collision course with Earth and Mars after NASA deliberately crashed a probe into a far-away asteroid two years ago, a new study reveals. The celestial shrapnel, which could start hitting our planet within a decade, poses no risk to life on Earth — but it could trigger the first ever human-caused meteor showers.On Sept. 26, 2022, NASA's Double Asteroid Redirection Test (DART) spacecraft purposefully collided with the asteroid Dimorphos, smashing right into the middle of the space rock at around 15,000 mph (24,000 km/h). The epic impact, which occurred more than 7 million miles (11 million kilometers) from Earth, was the first test of humanity's capability to redirect potentially hazardous asteroids that pose a threat to our planet."
text="This tracker provides the cumulative number of confirmed COVID-19 cases and deaths, as well as the rate of daily COVID-19 cases and deaths by country, income, region, and globally. It will be updated weekly, as new data are released. As of March 7, 2023, all data on COVID-19 cases and deaths are drawn from the World Health Organization’s (WHO) Coronavirus (COVID-19) Dashboard. Prior to March 7, 2023, this tracker relied on data provided by the Johns Hopkins University (JHU) Coronavirus Resource Center’s COVID-19 Map, which ended on March 10, 2023. Please see the Methods tab for more detailed information on data sources and notes. To prevent slow load times, the tracker only contains data from the last 200 days. However, the full data set can be downloaded from our GitHub page."
#text="VideoFujian Jinhua, a new semiconductor maker, is building a chip factory with 100,000 square feet of office space in a region formerly known for manufacturing shoes.CreditCredit...Paul Mozur/The New York TimesJune 22, 2018JINJIANG, China With a dragnet closing in, engineers at a Taiwanese chip maker holding American secrets did their best to conceal a daring case of corporate espionage.As the police raided their offices, human resources workers gave the engineers a warning to scramble and get rid of the evidence. USB drives, laptops and documents were handed to a lower-level employee, who hid them in her locker. Then she walked one engineers phone out the front door.What those devices contained was more valuable than gold or jewels: designs from an American company, Micron Technology, for microchips that have helped power the global digital revolution. According to the Taiwanese authorities, the designs were bound for China, where they would help a new, $5.7 billion microchip factory the size of several airplane hangars rumble into production.China has ambitious plans to overhaul its economy and compete head to head with the United States and other nations in the technology of tomorrow. The heist of the designs two years ago and the raids last year, which were described by Micron in court filings and the police in Taiwan, represent the dark side of that effort and explain in part why the United States is starting a trade war with China.A plan known as Made in China 2025 calls for the country to become a global competitor in an array of industries, including semiconductors, robotics and electric vehicles. China is spending heavily to both innovate and buy up technology from abroad.Politicians in Washington and American companies accuse China of veering into intimidation and outright theft to get there. 
And they see Micron, an Idaho company whose memory chips give phones and computers the critical ability to store and quickly retrieve information, as a prime example of that aggression.Three years ago, Micron spurned a $23 billion takeover offer from a state-controlled Chinese company. Today it faces a lawsuit and an investigation in China, which accounts for about half its $20 billion in annual sales.Then Micron was the target of the heist in Taiwan, according to officials there and a lawsuit the company has brought against the Taiwanese company that employed the engineers, UMC, and the Chinese company it says wanted access to the technology, Fujian Jinhua Integrated Circuit Company.Other companies may face predicaments similar to Microns, industry experts said.One state-backed factory in the city of Wuhan, owned by Yangtze Memory Technology Company, or YMTC, will be turning out chips that look similar to those made by Samsung, the South Korean chip maker, said Mark Newman, an analyst at Sanford Bernstein.ImageCredit...Tomohiro Ohsumi/BloombergThe YMTC one is virtually identical to Samsungs, which makes it pretty clear theyve been copying, Mr. Newman said.A Samsung spokeswoman declined to comment, and YMTC officials did not return calls for comment. President Xi Jinping of China visited YMTCs production facilities this year, one way Chinas leaders show their endorsement for projects.China defends Made in China 2025 as necessary for its economic survival. It still depends on other countries for crucial goods like chips and software, and China is offering funding for homegrown labs and for entrepreneurs who hope to grab a piece of the future.But Trump administration officials in a report this year recounted how Chinese officials have at times helped local companies get intellectual property from American firms, including in the energy, electronics, software and avionics sectors.American business groups worried about Made in China 2025 point to Micron. 
The account of its struggles was based on Taiwanese and American legal documents.In 2015, representatives from Tsinghua Unigroup, a Chinese chip maker with major state backing, approached Micron with an acquisition offer, which the company rejected. It later also turned down several partnership offers from Chinese companies out of concern for protecting its technology, said a person with knowledge of the situation, who asked not to be identified because the person lacked authorization to speak publicly.That was when one Chinese company resorted to theft, Micron said in documents filed last December in Federal District Court for the Northern District of California.Microns accusations focus on efforts by Fujian Jinhua Integrated Circuit, a state-backed chip maker, to build a $5.7 billion factory in Chinas Fujian Province. Two years ago, Jinhua tapped UMC, a Taiwanese company, to help it develop technology for the factory. Instead of going through the lengthy steps required to design the technology, Micron said in its suit, UMC and Jinhua decided to steal it.A UMC spokesman denied the allegations and declined to comment further. Jinhua did not respond to requests for comment.First, UMC lured away engineers from Microns Taiwan operations with promises of raises and bonuses, according to the Taiwanese authorities. Then, it asked them to take some of Microns secrets with them, according to Microns court filings and the authorities. The engineers illegally took with them more than 900 files that contained key specifications and details about Microns advanced memory chips, the authorities said.Micron grew suspicious, according to its court documents, after discovering that one of its departing engineers had turned to Google for instructions on how to wipe a company laptop. 
Later, at a recruiting event in the United States aimed at Micron employees, Jinhua and UMC showed PowerPoint slides that used Microns internal code names when discussing future chips it would make, according to the court documents.ImageCredit...Charlie Litchfield/Associated PressAlerted by Micron, the Taiwanese police tapped the phone of one Micron engineer, Kenny Wang, who was being recruited by UMC. According to an indictment in Taiwan against Mr. Wang and others, UMC reached out to Mr. Wang in early 2016 using Line, the smartphone messaging app, while he was still working for Micron. UMC explained it was having problems developing its memory chip technology. Mr. Wang then grabbed the information it needed from Microns servers, and later used it to help UMCs design. The police said Mr. Wang received a promotion at UMC.When investigators showed up at UMCs offices early last year, the police said, some employees rushed to hide what they had taken from Micron. Mr. Wang and another former Micron employee gave laptops, USB flash drives and documents to an assistant engineer, who locked them in her personal locker. She then left the office with Mr. Wangs phone the one that the police had tapped, which was quickly tracked down.UMC filed its own criminal complaint against Mr. Wang last year, which Taiwanese prosecutors rejected. Mr. Wang and other engineers who were charged said they had taken the trade secrets for personal research. Mr. Wang did not respond to emails and phone calls for comment.In January, Micron was hit with a patent infringement suit by Jinhua and UMC over several types of memory. As part of the suit, the companies requested that the court bar Micron from making and selling the products and pay them damages. The case is being heard by a court in Fujian Province. The Fujian provincial government is an investor in Jinhua.In a letter sent to President Trump, Senators Jim Risch and Michael D. 
Crapo, Republicans of Idaho, expressed concern about the entire case and specifically the rapid pace with which the patent lawsuit has proceeded. The case could block Micron from selling some products in China.If the case against Micron moves forward, and the Chinese government once again rules in favor of itself, it would cause substantial damage to Micron and the U.S. tech industry as a whole, said the letter, which was viewed by The New York Times.In May, Chinas market regulator opened a price-fixing investigation into Micron, along with the South Korean memory makers SK Hynix and Samsung Electronics. Memory prices have jumped over the past year, because of spiking demand and limited production by the three companies, which dominate the market. Another China regulator, which has said it is also monitoring the price jump, also gave a multimillion-dollar grant to Jinhua.Jinhua and other Chinese chip makers face hurdles in catching up. Production of semiconductors involves a highly complex and automated process that controls everything down to the atomic level.Jinhua and others are spending big to get there. In Jinjiang, a city in Fujian Province once known as a shoe-manufacturing center, Jinhuas new factory is almost finished. Rising five stories and stretching several football fields long, the structure boasts 100,000 square feet of new office space.Economic planners in Jinjiang said they were hoping to attract more talent from Taiwan. In addition to adding more flights there, the town was in the process of building out a bilingual international school, a hospital with international accreditation and upscale apartments. The new plant is just a short drive from the airport.Most of Made in China 2025 is likely to succeed. Not all technologies are rocket science, said Dan Wang, a technology analyst in Beijing with Gavekal Dragonomics, a research firm. With enough subsidies, Chinese firms have a good shot at catching up to the technological frontier."
# Encode the text padded/truncated to exactly 512 tokens, matching the
# [None, 512] shape declared in the exported serving signature.
inputs = tokenizer(
    text,
    return_tensors='tf',
    padding='max_length',
    truncation=True,
    max_length=512,
)

# Prepare inputs for the loaded model: the serving signature was declared
# with int32 specs, so cast every tensor to int32.
input_dict = {
    key: tf.cast(inputs[key], tf.int32)
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
}

# Get predictions from the loaded model; the signature returns a dict whose
# 'output_0' entry holds the class probabilities.
predictions = loaded_infer(**input_dict)
output_tensor = predictions['output_0']
predicted_class = tf.argmax(output_tensor, axis=1).numpy()
print(f'Predicted class: {predicted_class}')

# Convert the tensor to a NumPy array
output_array = output_tensor.numpy()

# Best class index and its probability, taken over the flattened output
predicted_class = np.argmax(output_array)
confidence = np.max(output_array)
confidence_percentage = confidence * 100

# Print results
print(f"Confidence: {confidence_percentage:.2f}%")

# Print results

  # 'World': 0,
  #   'Entertainment': 1,
  #   'science': 2,
  #   'Health': 3,
  #   'Business': 4,
  #   'Sports': 5,
  #   'Politics': 6,
  #   'Tech': 7




Predicted class: [3]
Confidence: 94.89%


In [None]:
# Display the raw output dict returned by the serving signature.
predictions

{'output_0': <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
 array([[1.0442295e-03, 2.6743047e-04, 1.5566358e-03, 1.0518782e-03,
         1.9709799e-03, 3.5410339e-04, 1.2915183e-03, 9.9246323e-01]],
       dtype=float32)>}

In [None]:
# Convert the exported SavedModel to TensorFlow Lite with no optimizations
# (the optimization list is explicitly left empty).
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir=model_save_path)
converter.optimizations = []
tflite_model = converter.convert()

# Save the TensorFlow Lite model to a file
tflite_model_path = '/content/drive/My Drive/blogfinal2.tflite'
with open(tflite_model_path, mode='wb') as f:
    f.write(tflite_model)

print(f"TFLite model saved to {tflite_model_path}")

TFLite model saved to /content/drive/My Drive/blogfinal2.tflite


In [None]:
# Load the converted model into a TFLite interpreter and allocate its tensors.
# NOTE(review): this path is spelled 'MyDrive' while the conversion cell wrote
# to 'My Drive' — confirm both resolve to the same folder in this runtime.
tflite_model_path = '/content/drive/MyDrive/blogfinal2.tflite'  # Replace with the actual path
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Get input and output tensors.
# Each entry is a dict; later cells read its 'name', 'shape' and 'index' keys.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
import numpy as np

In [None]:
# List every interpreter input with the shape it expects
print("Expected input shapes:")
for detail in input_details:
    print(detail['name'], detail['shape'], sep=': ')

# The model expects input length of 512 (based on previous error)
max_length = 512

# Prepare your input text
text = "NASA has announced that Sunita Williams and Butch Wilmore will return to Earth in February next year. They will make it back from the International Space Station (ISS) in Elon Musk-owned SpaceX’s Crew Dragon spacecraft instead of the faulty Boeing Starliner. However, the mission that was supposed to last just 8 days has been stretched to 8 months now. This brings up several questions."

# Tokenize the input text with the tokenizer loaded in an earlier cell,
# padding/truncating so it matches the expected sequence length
input_data = tokenizer(
    text,
    return_tensors='tf',
    padding='max_length',
    truncation=True,
    max_length=max_length,
)
input_data



Expected input shapes:
serving_default_attention_mask:0: [  1 512]
serving_default_input_ids:0: [  1 512]
serving_default_token_type_ids:0: [  1 512]


{'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[  101,  9274,  2038,  2623,  2008,  3103,  6590,  3766,  1998,
        17520, 19863,  5974,  2097,  2709,  2000,  3011,  1999,  2337,
         2279,  2095,  1012,  2027,  2097,  2191,  2009,  2067,  2013,
         1996,  2248,  2686,  2276,  1006, 26354,  1007,  1999,  3449,
         2239, 14163,  6711,  1011,  3079,  2686,  2595,  1521,  1055,
         3626,  5202, 12076,  2612,  1997,  1996, 28927, 10321,  2732,
        20660,  1012,  2174,  1010,  1996,  3260,  2008,  2001,  4011,
         2000,  2197,  2074,  1022,  2420,  2038,  2042,  7121,  2000,
         1022,  2706,  2085,  1012,  2023,  7545,  2039,  2195,  3980,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [None]:
# Convert the tokenizer output to NumPy int32 arrays. np.asarray is used
# because the tokenizer returned TensorFlow tensors, which have no `.astype`
# method (calling it raised AttributeError).
input_ids = np.asarray(input_data['input_ids'], dtype=np.int32)
attention_mask = np.asarray(input_data['attention_mask'], dtype=np.int32)
token_type_ids = np.asarray(input_data['token_type_ids'], dtype=np.int32)

features_by_name = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'token_type_ids': token_type_ids,
}

# Match each interpreter input to its feature BY NAME, not by position: the
# interpreter lists its inputs in its own order (e.g.
# 'serving_default_attention_mask:0' may come first), so positional
# assignment can feed a tensor into the wrong input.
feature_for_index = {}
for detail in input_details:
    matches = [name for name in features_by_name if name in detail['name']]
    if len(matches) != 1:
        raise ValueError(
            f"Cannot match interpreter input {detail['name']!r} to exactly one of "
            f"{list(features_by_name)}"
        )
    # Ensure the array matches the shape the interpreter expects
    feature_for_index[detail['index']] = np.reshape(
        features_by_name[matches[0]], detail['shape']
    )

# Debugging: Print the shapes to confirm
print("Input tensor shapes after reshaping:")
print(f"input_ids shape: {input_ids.shape}")
print(f"attention_mask shape: {attention_mask.shape}")
print(f"token_type_ids shape: {token_type_ids.shape}")

# Set the tensor to point to the input data to be inferred
for tensor_index, feature in feature_for_index.items():
    interpreter.set_tensor(tensor_index, feature)

# Run the inference

AttributeError: EagerTensor object has no attribute 'astype'. 
        If you are looking for numpy-related methods, please run the following:
        tf.experimental.numpy.experimental_enable_numpy_behavior()
      

In [None]:
# Run inference on the inputs set in the previous cell
interpreter.invoke()

# Get the predicted class (output tensor)
output_data = interpreter.get_tensor(output_details[0]['index'])

# The output is a probability distribution over the classes, so we take the argmax
predicted_class = np.argmax(output_data, axis=1)
print(f'Predicted class: {predicted_class}')

# Map the predicted class id back to the category name.
# This uses its own name instead of rebinding `label_map`: that name holds the
# category-name -> id mapping read by convert_labels, and overwriting it with
# an id -> name dict would make a re-run of convert_labels label every row -1.
id_to_label = {
    0: 'World',
    1: 'Entertainment',
    2: 'Science',
    3: 'Health',
    4: 'Business',
    5: 'Sports',
    6: 'Politics',
    7: 'Tech'
}

# int() turns the NumPy integer into a plain Python int for the lookup
predicted_category = id_to_label.get(int(predicted_class[0]), 'Unknown')
print(f'Predicted category: {predicted_category}')