In [1]:
!pip install GoogleNews

from GoogleNews import GoogleNews
import pandas as pd
import time

# Initialize Google News
googlenews = GoogleNews(lang='en', period='7d')  # News from last 7 days
googlenews.search('Walmart supply chain disruption')

results = googlenews.results()
print(f"Found {len(results)} articles")

# Extract Headlines
label=[]
headlines = []
for article in results:
    headlines.append(article['title'])

# Store in DataFrame
df = pd.DataFrame({'headline': headlines})
df.to_csv('index.csv', index=False)
print(df.head())


Found 10 articles
                                            headline
0  Walmart and the Trump Administration: A New Er...
1  Let them eat junk food: Major organic supplier...
2  Walmart tells Chinese suppliers to resume ship...
3   Amazon and Walmart Battle for the Hybrid Shopper
4  Walmart, Target And Home Depot CEOs Meet With ...


In [2]:
!pip install GoogleNews
from GoogleNews import GoogleNews
import pandas as pd
import time  # To avoid rate limiting

search_terms = [
    "Port closure",
    "Shipping delays",
    "Geopolitical unrest supply chain",
    "Strikes at logistics hubs"
]

all_headlines = []

googlenews = GoogleNews(lang='en', period='7d')  # News from last 7 days

for term in search_terms:
    print(f"Searching for: {term}")
    googlenews.clear()  # Clear previous search results
    googlenews.search(term)
    results = googlenews.results()
    
    for article in results:
        headline = article['title']
        link = article['link']
        all_headlines.append({"search_term": term, "headline": headline })
    
    time.sleep(2)  # Pause to avoid overwhelming servers

# Convert to DataFrame
df = pd.DataFrame(all_headlines)
print(df.head())
df['label'] = ''  # Adds a column with empty strings as placeholder labels
df.to_csv('index.csv', index=False)
print(f"Total headlines collected: {len(df)}")


Searching for: Port closure
Searching for: Shipping delays
Searching for: Geopolitical unrest supply chain
Searching for: Strikes at logistics hubs
    search_term                                           headline
0  Port closure  Kids punching a port-a-potty and complaints ab...
1  Port closure  New Veracruz screwworm case cause border closu...
2  Port closure  CHS Superior Terminal to close according to Po...
3  Port closure  Port Authority: Closure Of CHS Superior Termin...
4  Port closure  Southern Border Livestock Ports Closed After N...
Total headlines collected: 40


In [27]:
'''df = df.dropna(subset=['label'])               # Remove rows with missing labels
df['label'] = df['label'].astype(str)          # Make sure labels are strings
df['label_id'] = encoder.fit_transform(df['label'])'''
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 1. Drop missing labels
df = df.dropna(subset=['label'])

# 2. Strip whitespace and force all labels to strings
df['label'] = df['label'].astype(str).str.strip()

# 3. Filter out any empty labels that may have snuck in
df = df[df['label'] != '']

# 4. Re-encode clean labels
encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])

# (Optional) View mapping
label_map = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Label map:", label_map)


Label map: {}


In [29]:

import pandas as pd

# Load your uploaded file
df = pd.read_csv('index.csv')

# Preview
print(df.head())
print(df.columns)  # Check if the label column exists
# Drop missing labels
df = df.dropna(subset=['label'])

# Ensure labels are integers
df['label'] = df['label'].astype(str)
df['headline'] = df['headline'].astype(str)
texts = list(df['headline'])


df.loc[0, 'label'] = 'Port closure'
df.loc[1, 'label'] = 'Shipping delays'
df.loc[3, 'label'] = 'Geopolitical unrest supply chain'
df.loc[2, 'label'] = 'Strikes at logistics hubs'


from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label']) 

    search_term                                           headline  label
0  Port closure  Kids punching a port-a-potty and complaints ab...    NaN
1  Port closure  New Veracruz screwworm case cause border closu...    NaN
2  Port closure  CHS Superior Terminal to close according to Po...    NaN
3  Port closure  Port Authority: Closure Of CHS Superior Termin...    NaN
4  Port closure  Southern Border Livestock Ports Closed After N...    NaN
Index(['search_term', 'headline', 'label'], dtype='object')


In [31]:
import torch
# Convert labels to tensor
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])
labels = torch.tensor(df['label_id'].tolist())
#labels = torch.tensor(df['label'].tolist())

# Show tensor shapes
print("Input IDs shape:", encodings['input_ids'].shape)
print("Labels shape:", labels.shape)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])

Input IDs shape: torch.Size([4, 3])
Labels shape: torch.Size([4])


In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])
label_map = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_map)  # This will show your label-to-ID mapping

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['headline'],
    df['label_id'],  # <-- Use label_id here
    test_size=0.2,
    random_state=42
)


{'Geopolitical unrest supply chain': 0, 'Port closure': 1, 'Shipping delays': 2, 'Strikes at logistics hubs': 3}


In [35]:
from transformers import BertTokenizerFast


# Force all headlines to strings
df['headline'] = df['headline'].astype(str)

# Drop any missing or empty values just in case
df = df.dropna(subset=['headline'])
df = df[df['headline'].str.strip() != '']

# Convert to a list of strings
texts = list(df['headline'])  # Guaranteed to be list[str]

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
encodings = tokenizer(
    list(df['headline']),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)


In [37]:
import torch
from torch.utils.data import Dataset

class HeadlineDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        for k, v in item.items():
            print(f"{k}: {v.dtype}")  # Should all be Long
        return item
  
dataset = HeadlineDataset(encodings, df['label_id'].tolist())
remove_unused_columns=False

# Trigger some samples to invoke __getitem__
for i in range(3):
    _ = dataset[i]

input_ids: torch.int64
token_type_ids: torch.int64
attention_mask: torch.int64
labels: torch.int64
input_ids: torch.int64
token_type_ids: torch.int64
attention_mask: torch.int64
labels: torch.int64
input_ids: torch.int64
token_type_ids: torch.int64
attention_mask: torch.int64
labels: torch.int64


  item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])

train_texts, val_texts, train_labels_raw, val_labels_raw = train_test_split(
    df['headline'], df['label_id'], test_size=0.2, random_state=42
)
# Tokenize using previously split text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, return_tensors='pt')

# Use these correct labels
train_labels = torch.tensor(train_labels_raw.tolist(), dtype=torch.long)
val_labels = torch.tensor(val_labels_raw.tolist(), dtype=torch.long)




'''# Tokenize
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, return_tensors='pt')

train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())'''


In [41]:
print(type(train_labels[0]))  # Should be int, not float

<class 'torch.Tensor'>


In [43]:
!pip install transformers torch scikit-learn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df['label_id'] = df['label_id'].astype(int)

# Load your CSV
df = pd.read_csv('index.csv')

# Encode string labels as integers
encoder = LabelEncoder()
df['label_id'] = encoder.fit_transform(df['label'])

# Save the label-to-ID mapping
label_map = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Label mapping:", label_map)
# Split data
texts = list(df['headline'])
labels = list(df['label_id'])
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize headlines
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

import torch

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # ✅ This should be Long
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3) # num_labels=len(label_map)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train(resume_from_checkpoint=False)

trainer.evaluate()

model.save_pretrained('./bert_model')
tokenizer.save_pretrained('./bert_model')




Label mapping: {nan: 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.0138




('./bert_model\\tokenizer_config.json',
 './bert_model\\special_tokens_map.json',
 './bert_model\\vocab.txt',
 './bert_model\\added_tokens.json',
 './bert_model\\tokenizer.json')

In [58]:

import numpy as np
 # Convert NumPy int64 keys to Python int
label_map = {
    int(k): v
    for k, v in zip(encoder.transform(encoder.classes_), encoder.classes_)
}
'''
np_int = np.int64(42)
np_float = np.float64(3.14)
py_int=int(np_int)
py_float=float(np_float)
# Save to JSON
with open("label_map.json", "w") as f:
    json.dump(label_map, f)

#my_numpy_int = np.int64(42)
my_dict = {int(my_numpy_int): "Label A"}  # ✅ Works fine
import json
label_map = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))  # ID to label
with open("label_map.json", "w") as f:
    json.dump(label_map, f)
    '''

'\nnp_int = np.int64(42)\nnp_float = np.float64(3.14)\npy_int=int(np_int)\npy_float=float(np_float)\n# Save to JSON\nwith open("label_map.json", "w") as f:\n    json.dump(label_map, f)\n\n#my_numpy_int = np.int64(42)\nmy_dict = {int(my_numpy_int): "Label A"}  # ✅ Works fine\nimport json\nlabel_map = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))  # ID to label\nwith open("label_map.json", "w") as f:\n    json.dump(label_map, f)\n    '

In [109]:
pip show transformers accelerate torch

Name: transformers
Version: 4.53.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\Manya verma\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
---
Name: accelerate
Version: 1.8.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: C:\Users\Manya verma\anaconda3\Lib\site-packages
Requires: huggingface_hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
---
Name: torch
Version: 2.7.1
Summary: Tensors and Dynamic neural networks in Py

In [149]:
from torch.utils.data import DataLoader

loader = DataLoader(train_dataset, batch_size=1)
batch = next(iter(loader))

for k, v in batch.items():
    print(f"{k}: {v.dtype}")  # Should all be torch.int64

output = model(**batch)
print(output.logits)  # If this runs, Trainer should work too

input_ids: torch.int64
token_type_ids: torch.int64
attention_mask: torch.int64
labels: torch.int64
tensor([[0.1501]], grad_fn=<AddmmBackward0>)


In [152]:
trainer.train(resume_from_checkpoint=False)



Step,Training Loss
10,0.2067


TrainOutput(global_step=12, training_loss=0.18203331157565117, metrics={'train_runtime': 35.9662, 'train_samples_per_second': 2.669, 'train_steps_per_second': 0.334, 'total_flos': 1282677911424.0, 'train_loss': 0.18203331157565117, 'epoch': 3.0})

outputs = trainer.predict(val_dataset)
logits = outputs.predictions  # Raw scores from the model
import torch

predicted_ids = torch.argmax(torch.tensor(logits), dim=1)
from sklearn.preprocessing import LabelEncoder

predicted_labels = encoder.inverse_transform(predicted_ids.tolist())
true_labels = encoder.inverse_transform(val_labels)
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score: {f1:.4f}")
for i in range(5):
    print(f"Headline: {val_texts[i]}")
    print(f"Predicted Label: {predicted_labels[i]}")
    print(f"True Label: {true_labels[i]}")
    print()