In [1]:
import numpy as np
import pandas as pd
from helpers.clean_data import IndianNewsDataCleaner
from helpers.tokenizer_indian import tokenize_function
from datasets import Dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw news table; the first CSV column is used as the row index.
df = pd.read_csv("data/Indian_Financial_News.csv", index_col=0)

# Build the project cleaner for this frame, then run its steps one at a time
# in a fixed order. Each call's return value is the receiver of the next call,
# and get_clean_data() yields the final result.
cleaner = IndianNewsDataCleaner(df, country='India', label = 'Sentiment')
cleaning_step = cleaner.map_sentiment()
cleaning_step = cleaning_step.add_country()
cleaning_step = cleaning_step.clean_text()
cleaning_step = cleaning_step.filter_data()
df_clean = cleaning_step.get_clean_data()

In [3]:
# Preview the first rows of `df`.
# NOTE(review): this displays `df` (the frame handed to the cleaner), not
# `df_clean`. The recorded output already contains a `country` column, so the
# cleaner presumably modifies `df` in place -- confirm that is intended.
df.head()

Unnamed: 0,URL,Content,Summary,Sentiment,country
0,https://www.moneycontrol.com/news/business/eco...,US consumer spending dropped by a record in Ap...,consumer spending plunges 13.6 percent in Apri...,0,India
1,https://www.businesstoday.in/top-story/state-r...,State-run lenders require an urgent Rs 1.2 tri...,government will have to take a bulk of the tab...,0,India
2,https://www.financialexpress.com/economy/covid...,Apparel exporters on Wednesday urged the gover...,exporters are facing issues in terms of raw ma...,0,India
3,https://www.moneycontrol.com/news/business/mar...,Asian shares battled to extend a global reboun...,the dollar loses some ground on the safe haven...,0,India
4,https://www.financialexpress.com/industry/six-...,After India’s sovereign credit rating fell to ...,six Indian public-sector undertakings have tak...,0,India


In [4]:
# Whole cleaned frame as a single Dataset.
dataset = Dataset.from_pandas(df_clean)

# Stratified hold-out splits on the 'Sentiment' label with a fixed seed:
#   1. 20% of df_clean is set aside as the test split;
#   2. 10% of the remaining rows become the validation split.
train_val_df, test_df = train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42,
    stratify=df_clean['Sentiment'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    random_state=42,
    stratify=train_val_df['Sentiment'],
)

In [5]:
# Convert each split into a Hugging Face Dataset.
# The frames returned by train_test_split keep the shuffled row labels of
# df_clean as their pandas index. Without preserve_index=False, from_pandas
# would store that index as an extra "__index_level_0__" column in every
# example, which then travels through map() and into any downstream consumer.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


In [6]:
# Spot-check a single converted record (position 1 of the training split).
train_dataset[1]

{'Content': 'The fourth-generation Kia Rio has made its debut internationally with a list of new tech features along with the brand’s latest line-up of engines. Europe will get the updated hatchback later this year, but is it destined for the Indian market? The Kia Rio is currently in its fourth generation, and Kia has now revealed an updated model for 2021. The compact hatchback gets a new line of engines and all the latest technologies that now make it at par with the latest-generation model, its cousin — the Hyundai i20. One of the most exciting changes to the Rio is with the new line f powertrain options which include a version with a mild-hybrid system. There is also a new ‘clutch-by-wire’ semi-automatic transmission on offer which Kia is calling the IMT or Intelligent Manual Transmission. The new Rio is also equipped with Kia’s second-generation UVO Connect internet-enabled features which include telematics and advanced infotainment system. The exterior styling has been updated f

In [7]:

# Apply the project tokenizer to the training split, passing examples in batches.
# The recorded run took roughly six minutes for 19,411 examples.
train_tokenized = train_dataset.map(tokenize_function, batched=True)
# NOTE(review): the validation and test splits are NOT tokenized here; the two
# calls below are disabled. Re-enable them (or confirm they run elsewhere)
# before any step that needs `val_tokenized` / `test_tokenized`.
# val_tokenized = val_dataset.map(tokenize_function, batched=True)
# test_tokenized = test_dataset.map(tokenize_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19411/19411 [06:07<00:00, 52.80 examples/s]


In [3]:
import torch

# Resolve the compute device once and keep it in `device`, so the outcome of
# this check is available as a value instead of only as printed text.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("CUDA is available. Running on GPU.")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is NOT available. Running on CPU.")
    # torch.version.cuda is None when the installed PyTorch is a CPU-only build.
    # Printing it tells a missing GPU/driver apart from a PyTorch install that
    # was built without CUDA support.
    print(f"PyTorch {torch.__version__} (built with CUDA: {torch.version.cuda})")

CUDA is NOT available. Running on CPU.


In [5]:
# Ask the NVIDIA driver directly which GPUs it can see (independent of PyTorch).
# NOTE(review): the recorded output lists a GeForce GTX 1650 even though the
# torch check reported no CUDA; presumably the installed PyTorch build is
# CPU-only -- confirm.
!nvidia-smi

Sat May 24 20:47:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.61                 Driver Version: 572.61         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   51C    P0             15W /   50W |     241MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                