# Kaggle code

In [1]:
! pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
! cp /content/kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

# import

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# BERT model

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

In [None]:
def train_model(x,y,n_labels):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

  tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
  model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=n_labels)

  # Tokenize and preprocess the text data
  max_length = 128  # You can adjust this based on your dataset and available GPU memory

  X_train_tokenized = tokenizer(list(X_train), padding='max_length', truncation=True, max_length=max_length, return_tensors='pt', return_attention_mask=True)
  X_test_tokenized = tokenizer(list(X_test), padding='max_length', truncation=True, max_length=max_length, return_tensors='pt', return_attention_mask=True)

  y_train = torch.tensor(list(y_train))
  y_test = torch.tensor(list(y_test))

  # Create DataLoader for training and testing data
  batch_size = 16
  train_data = TensorDataset(X_train_tokenized.input_ids, X_train_tokenized.attention_mask, y_train)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  test_data = TensorDataset(X_test_tokenized.input_ids, X_test_tokenized.attention_mask, y_test)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  # Define optimizer and learning rate scheduler
  optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

  # Fine-tune the model
  num_epochs = 3  # You can adjust the number of training epochs
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  for epoch in range(num_epochs):
      model.train()
      train_loss = 0.0
      for batch in tqdm(train_dataloader, desc="Epoch"):
          input_ids, attention_mask, labels = batch
          input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

          optimizer.zero_grad()
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
          loss = outputs.loss
          loss.backward()
          optimizer.step()
          scheduler.step()

          train_loss += loss.item()

      print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss}")

  # Evaluate the model
  model.eval()
  all_predictions = []
  with torch.no_grad():
      for batch in tqdm(test_dataloader, desc="Evaluating"):
          input_ids, attention_mask, labels = batch
          input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

          outputs = model(input_ids, attention_mask=attention_mask)
          logits = outputs.logits
          predictions = torch.argmax(logits, dim=1)
          all_predictions.extend(predictions.tolist())

  # Calculate accuracy and other metrics
  y_test = y_test.tolist()
  accuracy = accuracy_score(y_test, all_predictions)
  report = classification_report(y_test, all_predictions)

  print(f"Accuracy: {accuracy}")
  print(report)

  return [model,tokenizer]

# Tamil

In [None]:
! kaggle datasets download -d sudalairajkumar/tamil-nlp

Downloading tamil-nlp.zip to /content
  0% 0.00/2.93M [00:00<?, ?B/s]
100% 2.93M/2.93M [00:00<00:00, 266MB/s]


In [None]:
! unzip /content/tamil-nlp.zip

Archive:  /content/tamil-nlp.zip
  inflating: tamil_movie_reviews_test.csv  
  inflating: tamil_movie_reviews_train.csv  
  inflating: tamil_news_test.csv     
  inflating: tamil_news_train.csv    
  inflating: tamil_thirukkural_test.csv  
  inflating: tamil_thirukkural_train.csv  




> preprocessing



In [None]:
tamil_data = pd.read_csv('/content/tamildataset/tamil_news_train.csv')
tamil_data.head()

Unnamed: 0,NewsInEnglish,NewsInTamil,Category,CategoryInTamil
0,2017/May/19941-miss-iraq-competition-after-43-...,ஈராக்கில் 43 ஆண்டுகள் கழித்து அழகிப்போட்டி,world,உலகம்
1,2018/Dec/55921-viswasam-s-rustic-folk-song-has...,இந்திய அளவில் ட்ரெண்ட் ஆன அஜித்தின் ‘தள்லே தில...,cinema,சினிமா
2,2018/Dec/55929-inspector-who-bought-insurance-...,சொந்த செலவில் வாகன காப்பீடு எடுத்து கொடுத்த கா...,tamilnadu,தமிழ்நாடு
3,2017/Jan/16161-morarji-desai-budget.txt,பிறந்தநாளில் பட்ஜெட் தாக்கல் செய்தவர்,india,இந்தியா
4,2018/Jun/46476-i-have-not-joined-bjp-or-anyoth...,“நானா? பாஜகவில் இணைந்துவிட்டேனா” - வரலட்சுமி வ...,cinema,சினிமா


In [None]:
tamil_data['CategoryId'] = tamil_data['Category'].factorize()[0]

In [None]:
category = tamil_data[['Category', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category

Unnamed: 0,Category,CategoryId
0,world,0
1,cinema,1
2,tamilnadu,2
3,india,3
10,politics,4
17,sports,5


In [None]:
text = tamil_data["NewsInTamil"]

In [None]:
category = tamil_data['Category']

In [None]:
def remove_tags(text):
  remove = re.compile(r'')
  return re.sub(remove, '', text)
tamil_data['NewsInTamil'] = tamil_data['NewsInTamil'].apply(remove_tags)

In [None]:
import re

def remove_special_characters(text):
    # Define a regex pattern to match non-alphanumeric characters (special characters)
    pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
    # Use the sub() function to replace matched special characters with spaces
    text = re.sub(pattern, ' ', text)

    # Remove extra spaces and strip leading/trailing spaces
    text = ' '.join(text.split())

    return text

# Assuming dataset is a DataFrame and 'Text' is a column in the DataFrame
tamil_data['NewsInTamil'] = tamil_data['NewsInTamil'].apply(remove_special_characters)

In [None]:
with open('/content/tamildataset/TamilStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Define a function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()

    # Filter out stopwords
    words = [word for word in words if word.lower() not in stopwords]

    # Join the remaining words back into a string
    cleaned_text = ' '.join(words)

    return cleaned_text
tamil_data['NewsInTamil'] = tamil_data['NewsInTamil'].apply(remove_stopwords)


> Training



In [None]:
df = pd.read_csv('/content/tamildataset/tamil_news_train.csv')
df['CategoryId'] = df['Category'].factorize()[0]

# Assuming your dataset has 'text' and 'category' columns
X = df['NewsInTamil']
y = df['CategoryId']

n_labels=len(df['CategoryId'].unique())

model,tokenizer =  train_model(X,y,n_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch: 100%|██████████| 817/817 [05:02<00:00,  2.70it/s]


Epoch 1/3, Loss: 903.7665749788284


Epoch: 100%|██████████| 817/817 [05:01<00:00,  2.71it/s]


Epoch 2/3, Loss: 588.6352081745863


Epoch: 100%|██████████| 817/817 [05:00<00:00,  2.71it/s]


Epoch 3/3, Loss: 430.68352922797203


Evaluating: 100%|██████████| 91/91 [00:11<00:00,  7.91it/s]

Accuracy: 0.7625602202339986
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       117
           1       0.80      0.90      0.85       186
           2       0.79      0.77      0.78       510
           3       0.73      0.71      0.72       370
           4       0.51      0.59      0.55       111
           5       0.92      0.91      0.92       159

    accuracy                           0.76      1453
   macro avg       0.75      0.76      0.75      1453
weighted avg       0.76      0.76      0.76      1453






In [None]:
output_directory = "tamil_saved_model"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

('tamil_saved_model/tokenizer_config.json',
 'tamil_saved_model/special_tokens_map.json',
 'tamil_saved_model/vocab.txt',
 'tamil_saved_model/added_tokens.json')

In [None]:
saved_model_directory = "tamil_saved_model"
tamil_model = BertForSequenceClassification.from_pretrained(saved_model_directory)
tamil_tokenizer = BertTokenizer.from_pretrained(saved_model_directory)



> Testing



In [None]:
with open('/content/tamildataset/TamilStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

def preprocessing(text):
  remove = re.compile(r'')
  remove_tags = re.sub(remove, '', text)
  pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
  remove_spl_char = re.sub(pattern, ' ', remove_tags)
  remove_spl_char = ' '.join(remove_spl_char.split())
  words = remove_spl_char.split()
  words = [word for word in words if word.lower() not in stopwords]
  cleaned_text = ' '.join(words)
  return cleaned_text

In [None]:
input_text = 'பாண்டியா நிலைமை இதுதான்.. அடுத்த மேட்ச்சில் ஆடுவதில் சிக்கல்.. தவிக்கும் கேப்டன் ரோஹித்'

input = preprocessing(input_text)

input_ids = tamil_tokenizer(input, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model1 = tamil_model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
    output = model1(**input_ids)
predictions = output.logits
max_index = torch.argmax(predictions)
target_id = max_index.item()
result = tamil_data.loc[tamil_data['CategoryId'] == target_id, 'Category'].values[0]
result

'sports'

# Telugu

In [None]:
! kaggle datasets download -d sudalairajkumar/telugu-nlp

Downloading telugu-nlp.zip to /content
 95% 84.0M/88.7M [00:00<00:00, 91.6MB/s]
100% 88.7M/88.7M [00:00<00:00, 94.2MB/s]


In [None]:
! unzip /content/telugu-nlp.zip

Archive:  /content/telugu-nlp.zip
  inflating: telugu_books/telugu_books.csv  
  inflating: telugu_news/test_telugu_news.csv  
  inflating: telugu_news/train_telugu_news.csv  




> Preprocessing



In [None]:
telugu_data = pd.read_csv('/content/telugu_dataset/telugu_news/train_telugu_news.csv')
telugu_data.head()

Unnamed: 0,SNo,date,heading,body,topic
0,414,11-05-2017 00:39:13,ఐడిబిఐపై ఆర్‌బిఐ నజర్‌,భారీ ఎత్తున మొండిబకాయిలు పెరిగిపోవడంతో ఐడిబిఐ ...,business
1,2668,12-06-2017 00:40:32,బ్యాంకింగ్‌ చీఫ్‌లతో నేడు జైట్లీ భేటీ,న్యూఢిల్లీ : ఆర్థిక మంత్రి అరుణ్‌ జైట్లీ సోమవా...,business
2,19923,19-01-2017 19:51:31,కీలక వికెట్ తీసిన జడేజా..,కటక్: ఇంగ్లండ్‌తో జరుగుతున్న సెకండ్ వన్డే మ్యా...,sports
3,15762,23-11-2017 17:29:04,మరో రెచ్చగొట్టే చర్యకు దిగిన పాకిస్థాన్,\nఇస్లామాబాద్ : పాకిస్థాన్ అంతర్జాతీయ ఉగ్రవాది...,nation
4,8624,03-04-2017 15:48:23,గోవాలో కొడుకుతో కలిసి అల్లు అర్జున్ స్విమ్మింగ్!,స్టార్‌ హీరోగా వరుస సినిమాలతో బిజీగా ఉన్నప్పటి...,entertainment


In [None]:
telugu_data['topicId'] = telugu_data['topic'].factorize()[0]

In [None]:
telugu_data['heading'] = telugu_data['heading'].astype(str)

In [None]:
telugu_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17312 entries, 0 to 17311
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   SNo      17312 non-null  int64 
 1   date     17312 non-null  object
 2   heading  17312 non-null  object
 3   body     17312 non-null  object
 4   topic    17312 non-null  object
 5   topicId  17312 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 811.6+ KB


In [None]:
topic = telugu_data[['topic', 'topicId']].drop_duplicates().sort_values('topicId')
topic

Unnamed: 0,topic,topicId
0,business,0
2,sports,1
3,nation,2
4,entertainment,3
30,editorial,4


In [None]:
text = telugu_data["heading"]
topic = telugu_data['topic']

In [None]:
import re

def remove_tags(text):
    if isinstance(text, str):
        html_tags_pattern = re.compile(r'<.*?>')
        return re.sub(html_tags_pattern, '', text)
    else:
        return text  # Return non-string values as-is

# Assuming telugu_data is a DataFrame with a column named "heading"
telugu_data["heading"] = telugu_data["heading"].apply(remove_tags)

In [None]:
import re

def remove_special_characters(text):
    # Define a regex pattern to match non-alphanumeric characters (special characters)
    pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
    # Use the sub() function to replace matched special characters with spaces
    text = re.sub(pattern, ' ', str(text))

    # Remove extra spaces and strip leading/trailing spaces
    text = ' '.join(text.split())

    return text

# Assuming dataset is a DataFrame and 'Text' is a column in the DataFrame
telugu_data["heading"] = telugu_data["heading"].apply(remove_special_characters)

In [None]:
# Read the list of stopwords from a text file
with open('/content/telugu_dataset/teluguStopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Define a function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()

    # Filter out stopwords
    words = [word for word in words if word.lower() not in stopwords]

    # Join the remaining words back into a string
    cleaned_text = ' '.join(words)

    return cleaned_text

telugu_data["heading"] = telugu_data["heading"].apply(remove_stopwords)



> Training



In [None]:
n_labels=len(df['topicId'].unique())
n_labels

5

In [None]:
df = pd.read_csv('/content/telugu_dataset/telugu_news/train_telugu_news.csv')
df['topicId'] = df['topic'].factorize()[0]
df['heading'] = df['heading'].astype(str)

# Assuming your dataset has 'text' and 'category' columns
X = df['heading']
y = df['topicId']

n_labels=len(df['topicId'].unique())

model,tokenizer =  train_model(X,y,n_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch: 100%|██████████| 974/974 [06:00<00:00,  2.70it/s]


Epoch 1/3, Loss: 845.1961494833231


Epoch: 100%|██████████| 974/974 [06:01<00:00,  2.69it/s]


Epoch 2/3, Loss: 517.8331381082535


Epoch: 100%|██████████| 974/974 [05:59<00:00,  2.71it/s]


Epoch 3/3, Loss: 362.53299644216895


Evaluating: 100%|██████████| 109/109 [00:13<00:00,  7.93it/s]

Accuracy: 0.8019630484988453
              precision    recall  f1-score   support

           0       0.79      0.75      0.77       255
           1       0.73      0.73      0.73       170
           2       0.81      0.82      0.82       678
           3       0.85      0.88      0.87       538
           4       0.56      0.48      0.52        91

    accuracy                           0.80      1732
   macro avg       0.75      0.73      0.74      1732
weighted avg       0.80      0.80      0.80      1732






In [None]:
output_directory = "telugu_saved_model"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

('telugu_saved_model/tokenizer_config.json',
 'telugu_saved_model/special_tokens_map.json',
 'telugu_saved_model/vocab.txt',
 'telugu_saved_model/added_tokens.json')

In [None]:
saved_model_directory = "telugu_saved_model"
telugu_model = BertForSequenceClassification.from_pretrained(saved_model_directory)
telugu_tokenizer = BertTokenizer.from_pretrained(saved_model_directory)



> Testing



In [None]:
with open('/content/telugu_dataset/teluguStopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

def preprocessing(text):
  remove = re.compile(r'')
  remove_tags = re.sub(remove, '', text)
  pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
  remove_spl_char = re.sub(pattern, ' ', remove_tags)
  remove_spl_char = ' '.join(remove_spl_char.split())
  words = remove_spl_char.split()
  words = [word for word in words if word.lower() not in stopwords]
  cleaned_text = ' '.join(words)
  return cleaned_text

In [None]:
input_text = 'ఆ స్టార్ ప్లేయర్‌కి రెస్ట్ తప్పదా?.. సఫారీలతో మ్యాచ్‌కు ముందు రోహిత్‌‍కు తలనొప్పి'

input = preprocessing(input_text)

input_ids = telugu_tokenizer(input, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model1 = telugu_model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
    output = model1(**input_ids)
predictions = output.logits
max_index = torch.argmax(predictions)
print(predictions)
target_id = max_index.item()
result = telugu_data.loc[telugu_data['topicId'] == target_id, 'topic'].values[0]
print(result)

tensor([[-1.3520,  4.2837, -0.7557, -1.0115, -1.7215]], device='cuda:0')
sports


# Hindi

In [25]:
! kaggle datasets download -d vigneshvit/hindi-news-category-dataset

Downloading hindi-news-category-dataset.zip to /content
  0% 0.00/8.50M [00:00<?, ?B/s]
100% 8.50M/8.50M [00:00<00:00, 103MB/s]


In [27]:
! unzip /content/hindi-news-category-dataset.zip

Archive:  /content/hindi-news-category-dataset.zip
  inflating: HINDI DATASET - Sheet2.csv  




> preprocessing



In [28]:
hindi_data = pd.read_csv('/content/hindi_dataset/HINDI DATASET - Sheet2.csv')
hindi_data.head()

Unnamed: 0,Headline,Content,News Categories
0,कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के ...,कांग्रेस नेता बलजिंदर सिंह की सोमवार को पंजाब ...,National
1,केंद्रीय मंत्री बोले- महिला आरक्षण लाने का साह...,केंद्रीय मंत्री प्रह्लाद पटेल ने लोकसभा और विध...,Politics
2,ओपीएस लागू करने से अस्थिर हो सकती है राज्यों क...,आरबीआई के 5 अधिकारियों ने एक लेख में लिखा है क...,Business
3,तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा क...,नामक्कल (तमिलनाडु) में शावरमा खाने से सोमवार क...,National
4,मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे ...,मणिपुर के मुख्यमंत्री एन बीरेन सिंह के आश्वासन...,National


In [29]:
hindi_data['CategoryId'] = hindi_data['News Categories'].factorize()[0]

In [30]:
category = hindi_data[['News Categories', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category

Unnamed: 0,News Categories,CategoryId
0,National,0
1,Politics,1
2,Business,2
11,World,3
314,Sports,4
1111,Technology,5
1388,Entertainment,6


In [None]:
text = hindi_data["Headline"]

In [None]:
category = hindi_data['News Categories']

In [None]:
def remove_tags(text):
  remove = re.compile(r'')
  return re.sub(remove, '', text)
hindi_data['Headline'] = hindi_data['Headline'].apply(remove_tags)

In [None]:
import re

def remove_special_characters(text):
    # Define a regex pattern to match non-alphanumeric characters (special characters)
    pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
    # Use the sub() function to replace matched special characters with spaces
    text = re.sub(pattern, ' ', text)

    # Remove extra spaces and strip leading/trailing spaces
    text = ' '.join(text.split())

    return text

# Assuming dataset is a DataFrame and 'Text' is a column in the DataFrame
hindi_data['Headline'] = hindi_data['Headline'].apply(remove_special_characters)

In [None]:
with open('/content/hindi_dataset/hindiStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Define a function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()

    # Filter out stopwords
    words = [word for word in words if word.lower() not in stopwords]

    # Join the remaining words back into a string
    cleaned_text = ' '.join(words)

    return cleaned_text
hindi_data['Headline'] = hindi_data['Headline'].apply(remove_stopwords)


> Training



In [None]:
df = pd.read_csv('/content/hindi_dataset/HINDI DATASET - Sheet2.csv')
df['CategoryId'] = df['Category'].factorize()[0]

# Assuming your dataset has 'text' and 'category' columns
X = df['Headline']
y = df['CategoryId']

n_labels=len(df['CategoryId'].unique())

model,tokenizer =  hindi_data(X,y,n_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch: 100%|██████████| 817/817 [05:02<00:00,  2.70it/s]


Epoch 1/3, Loss: 903.7665749788284


Epoch: 100%|██████████| 817/817 [05:01<00:00,  2.71it/s]


Epoch 2/3, Loss: 588.6352081745863


Epoch: 100%|██████████| 817/817 [05:00<00:00,  2.71it/s]


Epoch 3/3, Loss: 430.68352922797203


Evaluating: 100%|██████████| 91/91 [00:11<00:00,  7.91it/s]

Accuracy: 0.7625602202339986
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       117
           1       0.80      0.90      0.85       186
           2       0.79      0.77      0.78       510
           3       0.73      0.71      0.72       370
           4       0.51      0.59      0.55       111
           5       0.92      0.91      0.92       159

    accuracy                           0.76      1453
   macro avg       0.75      0.76      0.75      1453
weighted avg       0.76      0.76      0.76      1453






In [None]:
output_directory = "hindi_saved_model"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

('tamil_saved_model/tokenizer_config.json',
 'tamil_saved_model/special_tokens_map.json',
 'tamil_saved_model/vocab.txt',
 'tamil_saved_model/added_tokens.json')

In [None]:
saved_model_directory = "hindi_saved_model"
tamil_model = BertForSequenceClassification.from_pretrained(saved_model_directory)
tamil_tokenizer = BertTokenizer.from_pretrained(saved_model_directory)



> Testing



In [None]:
with open('/content/tamildataset/TamilStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

def preprocessing(text):
  remove = re.compile(r'')
  remove_tags = re.sub(remove, '', text)
  pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
  remove_spl_char = re.sub(pattern, ' ', remove_tags)
  remove_spl_char = ' '.join(remove_spl_char.split())
  words = remove_spl_char.split()
  words = [word for word in words if word.lower() not in stopwords]
  cleaned_text = ' '.join(words)
  return cleaned_text

In [None]:
input_text = 'பாண்டியா நிலைமை இதுதான்.. அடுத்த மேட்ச்சில் ஆடுவதில் சிக்கல்.. தவிக்கும் கேப்டன் ரோஹித்'

input = preprocessing(input_text)

input_ids = tamil_tokenizer(input, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model1 = tamil_model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
    output = model1(**input_ids)
predictions = output.logits
max_index = torch.argmax(predictions)
target_id = max_index.item()
result = hindi_data.loc[hindi_data['CategoryId'] == target_id, 'News Categories'].values[0]
result

'sports'

# Malayalam

In [None]:
! kaggle datasets download -d sabinhashmi/malayalam-news-data

Downloading malayalam-news-data.zip to /content
  0% 0.00/358k [00:00<?, ?B/s]
100% 358k/358k [00:00<00:00, 131MB/s]


In [None]:
! unzip /content/malayalam_dataset/malayalam-news-data.zip

Archive:  /content/malayalam_dataset/malayalam-news-data.zip
  inflating: Malayalam News Analytics.xlsx  


In [None]:
excel_file = pd.read_excel('/content/malayalam_dataset/Malayalam News Analytics.xlsx')
excel_file.to_csv('/content/malayalam_dataset/malayalam_news.csv', index=False)

In [None]:
malayalam_data = pd.read_csv('/content/malayalam_dataset/malayalam_news.csv')
malayalam_data.head()

Unnamed: 0,Category,Headline,Article
0,ART & STAGE,ജലച്ചായത്തിന്‍റെ നുറുങ്ങുകള്‍ പകര്‍ന്ന് നല്‍കി...,കൊച്ചി> ജലച്ചായത്തില്‍ ശ്രദ്ധിക്കേണ്ട കാര്യങ്...
1,EDUCATION,നീറ്റ്‌ ; അപേക്ഷ ഇന്നുകൂടി; 15 മുതൽ തെറ്റുതിരു...,"തിരുവനന്തപുരം \n\nരാജ്യത്ത്‌ എംബിബിഎസ്‌, ബിഡി..."
2,CINEMA,ഇതാ ഒറിജിനൽ ആൻഡ്രോയിഡ്‌ കുഞ്ഞപ്പൻ; റോബോട്ടിനുള...,2019 ല്‍ പുറത്തിറങ്ങിയ സിനിമകളില്‍ ഏറ്റവും കൂട...
3,ARTICLES,ഒമാൻ നവോത്ഥാനത്തിന്റെ ശിൽപ്പി,അറബ് ലോകത്തെ ഏറ്റവും പ്രിയങ്കരനായ ഭരണാധികാരിയാ...
4,ART & STAGE,കല്ലിൽ വിരിഞ്ഞ കവിത,ഒരു നൂറ്റാണ്ടിന്റെ ഋതുപ്പച്ചയേറ്റ‌് മലമുകളിൽ ക...


In [None]:
malayalam_data['CategoryId'] = malayalam_data['Category'].factorize()[0]

In [None]:
category = malayalam_data[['Category', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category

Unnamed: 0,Category,CategoryId
0,ART & STAGE,0
1,EDUCATION,1
2,CINEMA,2
3,ARTICLES,3
5,Kerala,4
7,SPORTS,5
8,PRAVASI,6
10,EDITORIAL,7
14,WORLD,8
17,National,9


In [None]:
text = malayalam_data["Headline"]
category = malayalam_data['Category']

In [None]:
def remove_tags(text):
  remove = re.compile(r'')
  return re.sub(remove, '', text)
malayalam_data['Headline'] = malayalam_data['Headline'].apply(remove_tags)

In [None]:
import re

def remove_special_characters(text):
    # Define a regex pattern to match non-alphanumeric characters (special characters)
    pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
    # Use the sub() function to replace matched special characters with spaces
    text = re.sub(pattern, ' ', text)

    # Remove extra spaces and strip leading/trailing spaces
    text = ' '.join(text.split())

    return text

# Assuming dataset is a DataFrame and 'Text' is a column in the DataFrame
malayalam_data['Headline'] = malayalam_data['Headline'].apply(remove_special_characters)

In [None]:
with open('/content/malayalam_dataset/malayalamStopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Define a function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()

    # Filter out stopwords
    words = [word for word in words if word.lower() not in stopwords]

    # Join the remaining words back into a string
    cleaned_text = ' '.join(words)

    return cleaned_text
malayalam_data['Headline'] = malayalam_data['Headline'].apply(remove_stopwords)

In [None]:
df = pd.read_csv('/content/malayalam_dataset/malayalam_news.csv')
df['CategoryId'] = df['Category'].factorize()[0]

# Assuming your dataset has 'text' and 'category' columns
X = df['Headline']
y = df['CategoryId']

n_labels=len(df['CategoryId'].unique())

model,tokenizer =  train_model(X,y,n_labels)

In [None]:
output_directory = "malayalam_saved_model"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

In [None]:
saved_model_directory = "malayalam_saved_model"
tamil_model = BertForSequenceClassification.from_pretrained(saved_model_directory)
tamil_tokenizer = BertTokenizer.from_pretrained(saved_model_directory)

In [None]:
with open('/content/tamildataset/TamilStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

def preprocessing(text):
  remove = re.compile(r'')
  remove_tags = re.sub(remove, '', text)
  pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
  remove_spl_char = re.sub(pattern, ' ', remove_tags)
  remove_spl_char = ' '.join(remove_spl_char.split())
  words = remove_spl_char.split()
  words = [word for word in words if word.lower() not in stopwords]
  cleaned_text = ' '.join(words)
  return cleaned_text

In [None]:
input_text = 'பாண்டியா நிலைமை இதுதான்.. அடுத்த மேட்ச்சில் ஆடுவதில் சிக்கல்.. தவிக்கும் கேப்டன் ரோஹித்'

input = preprocessing(input_text)

input_ids = tamil_tokenizer(input, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model1 = tamil_model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
    output = model1(**input_ids)
predictions = output.logits
max_index = torch.argmax(predictions)
target_id = max_index.item()
result = tamil_data.loc[tamil_data['CategoryId'] == target_id, 'Category'].values[0]
result

# Kannada

In [None]:
! kaggle datasets download -d disisbig/kannada-news-dataset

Downloading tamil-nlp.zip to /content
  0% 0.00/2.93M [00:00<?, ?B/s]
100% 2.93M/2.93M [00:00<00:00, 266MB/s]


In [None]:
! unzip /content/kannada_dataset/kannada-news-dataset.zip

Archive:  /content/tamil-nlp.zip
  inflating: tamil_movie_reviews_test.csv  
  inflating: tamil_movie_reviews_train.csv  
  inflating: tamil_news_test.csv     
  inflating: tamil_news_train.csv    
  inflating: tamil_thirukkural_test.csv  
  inflating: tamil_thirukkural_train.csv  




> preprocessing



In [None]:
kannada_data = pd.read_csv('/content/kannada_dataset/train.csv')
kannada_data.head()

Unnamed: 0,NewsInEnglish,NewsInTamil,Category,CategoryInTamil
0,2017/May/19941-miss-iraq-competition-after-43-...,ஈராக்கில் 43 ஆண்டுகள் கழித்து அழகிப்போட்டி,world,உலகம்
1,2018/Dec/55921-viswasam-s-rustic-folk-song-has...,இந்திய அளவில் ட்ரெண்ட் ஆன அஜித்தின் ‘தள்லே தில...,cinema,சினிமா
2,2018/Dec/55929-inspector-who-bought-insurance-...,சொந்த செலவில் வாகன காப்பீடு எடுத்து கொடுத்த கா...,tamilnadu,தமிழ்நாடு
3,2017/Jan/16161-morarji-desai-budget.txt,பிறந்தநாளில் பட்ஜெட் தாக்கல் செய்தவர்,india,இந்தியா
4,2018/Jun/46476-i-have-not-joined-bjp-or-anyoth...,“நானா? பாஜகவில் இணைந்துவிட்டேனா” - வரலட்சுமி வ...,cinema,சினிமா


In [None]:
kannada_data['CategoryId'] = kannada_data['label'].factorize()[0]

In [None]:
category = kannada_data[['label', 'CategoryId']].drop_duplicates().sort_values('CategoryId')
category

Unnamed: 0,Category,CategoryId
0,world,0
1,cinema,1
2,tamilnadu,2
3,india,3
10,politics,4
17,sports,5


In [None]:
text = tamil_data["headline"]

In [None]:
category = kannada_data['label']

In [None]:
def remove_tags(text):
  remove = re.compile(r'')
  return re.sub(remove, '', text)
kannada_data['headline'] = kannada_data['headline'].apply(remove_tags)

In [None]:
import re

def remove_special_characters(text):
    # Define a regex pattern to match non-alphanumeric characters (special characters)
    pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
    # Use the sub() function to replace matched special characters with spaces
    text = re.sub(pattern, ' ', text)

    # Remove extra spaces and strip leading/trailing spaces
    text = ' '.join(text.split())

    return text

# Assuming dataset is a DataFrame and 'Text' is a column in the DataFrame
kannada_data['headline'] = kannada_data['headline'].apply(remove_special_characters)

In [None]:
with open('/content/kannada_dataset/kannadaStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Define a function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()

    # Filter out stopwords
    words = [word for word in words if word.lower() not in stopwords]

    # Join the remaining words back into a string
    cleaned_text = ' '.join(words)

    return cleaned_text
kannada_data['headline'] = kannada_data['headline'].apply(remove_stopwords)


> Training



In [None]:
df = pd.read_csv('/content/kannada_dataset/train.csv')
df['CategoryId'] = df['label'].factorize()[0]

# Assuming your dataset has 'text' and 'category' columns
X = df['headline']
y = df['CategoryId']

n_labels=len(df['CategoryId'].unique())

model,tokenizer =  train_model(X,y,n_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch: 100%|██████████| 817/817 [05:02<00:00,  2.70it/s]


Epoch 1/3, Loss: 903.7665749788284


Epoch: 100%|██████████| 817/817 [05:01<00:00,  2.71it/s]


Epoch 2/3, Loss: 588.6352081745863


Epoch: 100%|██████████| 817/817 [05:00<00:00,  2.71it/s]


Epoch 3/3, Loss: 430.68352922797203


Evaluating: 100%|██████████| 91/91 [00:11<00:00,  7.91it/s]

Accuracy: 0.7625602202339986
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       117
           1       0.80      0.90      0.85       186
           2       0.79      0.77      0.78       510
           3       0.73      0.71      0.72       370
           4       0.51      0.59      0.55       111
           5       0.92      0.91      0.92       159

    accuracy                           0.76      1453
   macro avg       0.75      0.76      0.75      1453
weighted avg       0.76      0.76      0.76      1453






In [None]:
output_directory = "kannada_saved_model"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

('tamil_saved_model/tokenizer_config.json',
 'tamil_saved_model/special_tokens_map.json',
 'tamil_saved_model/vocab.txt',
 'tamil_saved_model/added_tokens.json')

In [None]:
saved_model_directory = "kannada_saved_model"
tamil_model = BertForSequenceClassification.from_pretrained(saved_model_directory)
tamil_tokenizer = BertTokenizer.from_pretrained(saved_model_directory)



> Testing



In [None]:
with open('/content/kannada_dataset/kannadaStopWords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

def preprocessing(text):
  remove = re.compile(r'')
  remove_tags = re.sub(remove, '', text)
  pattern =r"[“”‘’!@#$%^&*()_+{}\[\]:;<>,.?/~\\|=-]"
  remove_spl_char = re.sub(pattern, ' ', remove_tags)
  remove_spl_char = ' '.join(remove_spl_char.split())
  words = remove_spl_char.split()
  words = [word for word in words if word.lower() not in stopwords]
  cleaned_text = ' '.join(words)
  return cleaned_text

In [None]:
input_text = 'பாண்டியா நிலைமை இதுதான்.. அடுத்த மேட்ச்சில் ஆடுவதில் சிக்கல்.. தவிக்கும் கேப்டன் ரோஹித்'

input = preprocessing(input_text)

input_ids = tamil_tokenizer(input, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model1 = tamil_model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
    output = model1(**input_ids)
predictions = output.logits
max_index = torch.argmax(predictions)
target_id = max_index.item()
result = kannada_data.loc[kannada_data['CategoryId'] == target_id, 'label'].values[0]
result

'sports'