In [None]:
!pip install pandas transformers torch

In [None]:
import pandas as pd
import re
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import pipeline

In [None]:
# Location of the input CSV; a relative path, so it resolves against the working directory.
test_path = 'test.csv'

In [2]:
# Load the input CSV into `test`. The file is decoded as latin1, a backslash is the
# escape character, the first row is the header, and rows the parser rejects are skipped.
test = pd.read_csv(
    test_path,
    delimiter=',',
    header=0,
    escapechar='\\',
    encoding='latin1',
    on_bad_lines='skip',
)

# Helper that strips every non-ASCII character from a text value
def remove_non_utf8(text):
    """Return ``text`` with all non-ASCII characters removed.

    Despite the name, the filter is ASCII-only: every run of characters outside
    the range ``\\x00``-``\\x7F`` is deleted, which also removes perfectly valid
    UTF-8 text such as accented letters.

    Args:
        text: The value to clean. Normally a ``str``. Missing values
            (``None``, ``NaN``, ``pd.NA``) and other non-string scalars are
            accepted so the function can be applied to a CSV column that
            contains empty cells.

    Returns:
        str: The cleaned text. Missing values become ``''``; other non-string
        scalars are converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # pandas stores empty CSV cells as NaN (a float); re.sub would raise
        # TypeError on it, so map missing values to an empty string instead.
        if pd.isna(text):
            return ''
        text = str(text)
    return re.sub(r'[^\x00-\x7F]+', '', text)

# Run the cleaner over every entry of the 'text' column, then store the result back
ascii_only_text = test['text'].apply(remove_non_utf8)
test['text'] = ascii_only_text

In [3]:
# Keep only the rows whose 'Index' value is present and not blank.
# The values are cast to str before stripping so the check also works when pandas
# parsed the column as a non-string dtype (the .str accessor raises AttributeError there).
has_index = test['Index'].notnull() & (test['Index'].astype(str).str.strip() != '')
# Renumber the surviving rows 0..n-1 so the frame's index has no gaps after the filter.
test = test[has_index].reset_index(drop=True)

In [None]:
# Display the (rows, columns) shape of the frame
test.shape

In [5]:
# Class id -> category name. Ids are the positions in the list below, starting at 0.
# NOTE(review): presumably this order matches the label ids the model was trained with - confirm.
id2labels = dict(enumerate([
    'academic interests',
    'arts and culture',
    'automotives',
    'books and literature',
    'business and finance',
    'careers',
    'family and relationships',
    'food and drinks',
    'health',
    'healthy living',
    'hobbies and interests',
    'home and garden',
    'movies',
    'music and audio',
    'news and politics',
    'personal finance',
    'pets',
    'pharmaceuticals, conditions, and symptoms',
    'real estate',
    'shopping',
    'sports',
    'style and fashion',
    'technology and computing',
    'television',
    'travel',
    'video gaming',
]))

In [None]:
import torch

# Pick the first CUDA device if PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
print(gpu)

In [8]:
# Cap this process at 90% of the current CUDA device's memory.
# The call is only valid when a CUDA device exists; without the guard this cell
# fails on CPU-only machines even though the notebook otherwise falls back to CPU.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.9)

In [None]:
# Local path handed to from_pretrained for both the tokenizer and the model
model_path = "model/1model"

# Load the tokenizer and the sequence-classification model from the same location
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# NOTE(review): the pipeline task is "feature-extraction" although the model is a
# sequence classifier; the prediction code later takes an argmax over each pipeline
# output, so this presumably relies on the pipeline returning the classifier's
# logits - confirm against the installed transformers version.
nlp = pipeline(
    "feature-extraction",
    model=model,
    tokenizer=tokenizer,
    device=gpu,
)

In [None]:
def remove_patterns(text):
    """Strip URLs, markdown links, @mentions and punctuation from ``text``.

    The patterns are removed one after another, in the order listed below, and
    the result is trimmed of leading and trailing whitespace. Whitespace inside
    the text is left as is, so removed tokens can leave doubled spaces behind.

    NOTE(review): order matters. URLs are removed first, so a markdown link
    whose target is an http(s) URL has already lost its closing parenthesis
    when the markdown-link pattern runs; that pattern can then match up to the
    next ``)`` on the same line.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned, stripped string.
    """
    patterns_in_order = (
        r'http[s]?://\S+',    # http:// and https:// URLs
        r'\[.*?\]\(.*?\)',    # markdown links of the form [label](target)
        r'@\w+',              # @mentions
        r'[^\w\s]',           # anything that is neither a word character nor whitespace
    )
    cleaned = text
    for pattern in patterns_in_order:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

In [None]:
# Strip URLs, markdown links, @mentions and punctuation from every entry of 'text'
stripped_text = test['text'].apply(remove_patterns)
test['text'] = stripped_text

In [None]:
# Single-column frames: A holds the texts, B holds the matching 'Index' values
A = test['text'].to_frame()
B = test['Index'].to_frame()

In [None]:
# Plain Python list of the texts. It is read straight from `test` so this cell can be
# re-run safely: rebinding A from its own 'text' column only works the first time,
# because afterwards A is a list and A['text'] raises TypeError.
A = test['text'].tolist()

In [None]:
# Run every text through the pipeline; a shallow copy of the list is passed in
values = nlp(
    list(A),
    truncation=True,
    padding=True,
)

In [None]:
import torch

# Turn the nested lists returned by the pipeline into a single tensor
values_tensor = torch.tensor(values)

# For each example, look up the label whose id is the position of the largest value
# in that example's (flattened) output.
pred = [
    id2labels[values_tensor[i].argmax().item()]  # .item() converts the 0-d tensor to a Python int
    for i in range(len(A))
]

In [None]:
# One-column frame of the predicted label names, in prediction order
pred_df = pd.DataFrame(data=pred, columns=["target"])

In [None]:
# Convert every entry of 'target' to a Python str
pred_df['target'] = pred_df['target'].map(str)

In [None]:
# Attach the identifiers by position, not by index label. B carries the row labels it
# inherited from `test`, while pred_df is numbered 0..n-1; assigning the Series directly
# would align on those labels and produce NaN or shifted ids whenever rows had been
# dropped from `test`. A length mismatch still raises ValueError.
pred_df['Index'] = B['Index'].to_numpy()

In [None]:
# Write the predictions to disk, leaving out the frame's row index
pred_df.to_csv(path_or_buf='Prediction.csv', index=False)