In [None]:
# Location of the news dataset CSV read by the loading cell.
filepath = "news.csv"

In [None]:
import pandas as pd

# Read the whole CSV into memory as the working DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
# Dimensions of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Merge headline and body into one text column; a missing part counts as an
# empty string so the concatenation never produces NaN.
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['NewsText'] = title_part + " " + body_part

In [None]:
# Check the first three rows, now including the combined 'NewsText' column.
df.head(3)

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources the cleaning step relies on: tokenizer models,
# WordNet data for the lemmatizer and the English stop-word list.
# 'punkt_tab' is the tokenizer resource nltk.word_tokenize loads on
# NLTK >= 3.8.2; older releases do not know that id and nltk.download simply
# reports it and returns False, so requesting both ids is safe everywhere.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

In [None]:
# English stop words held in a set for fast membership checks, plus one
# lemmatizer instance shared by every clean_text call.
stop_words = {word for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [None]:
# Cleaning Function
def clean_text(text):
    """Normalise one raw news string into space-joined, lemmatized tokens.

    Steps, in order: lowercase; strip URLs, HTML tags and e-mail addresses;
    drop every character that is not an ASCII letter or whitespace; collapse
    whitespace; tokenize; discard stop words; lemmatize each remaining token.

    Relies on the notebook-level ``stop_words`` collection and ``lemmatizer``.

    Args:
        text: Raw text. Anything that is not a ``str`` (``None``, a float NaN
            coming from a missing cell, ...) is treated as empty text.

    Returns:
        The cleaned text, or ``""`` when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # lowercase
    # The dot after "www" is escaped so only a literal "www." prefix is treated
    # as a URL; unescaped it matched any character (e.g. the word "wwwx...").
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'\S+@\S+', '', text)  # remove emails
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove digits & punctuations
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces

    if not text:
        return ""  # nothing left to tokenize

    # Tokenization + Stopword removal + Lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(tokens)

In [None]:
# Run clean_text on every combined article and store the result in a new column.
df['NewsContent'] = df['NewsText'].apply(clean_text)

In [None]:
# Preview the frame with the cleaned 'NewsContent' column added.
df.head()

In [None]:
# Drop the columns that are no longer needed now that 'NewsContent' exists.
# columns= states the intent directly (the previous axis=True only worked
# because True == 1). errors='ignore' keeps the cell re-runnable and tolerant
# of a CSV that has no 'Unnamed: 0' index column.
df = df.drop(columns=['Unnamed: 0', 'title', 'text', 'NewsText'], errors='ignore')

In [None]:
# Confirm which columns remain after the drop.
df.head()

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)
model.eval()  # Switch the encoder to evaluation mode

In [None]:
def bert_embeddings(text, tokenizer, model, max_length=128, device='cuda'):
    """Return the [CLS] embedding(s) of ``text`` as a NumPy array.

    Args:
        text: Input handed straight to ``tokenizer``.
        tokenizer: Callable producing a batch object that supports
            ``.to(device)`` and ``**`` unpacking into ``model``.
        model: Module whose output exposes ``last_hidden_state``.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.
        device: Preferred torch device. If a CUDA device is requested but CUDA
            is not available, the computation silently runs on the CPU
            instead of raising.

    Returns:
        ``numpy.ndarray`` containing position 0 of ``last_hidden_state`` for
        each row of the tokenized batch.
    """
    # Resolve the device up front so CPU-only machines still work with the default.
    target = torch.device(device)
    if target.type == 'cuda' and not torch.cuda.is_available():
        target = torch.device('cpu')

    inputs = tokenizer(text, return_tensors='pt', max_length=max_length,
                       truncation=True, padding=True).to(target)

    model.to(target)
    model.eval()  # Makes sure model runs in inference mode

    with torch.no_grad():  # no gradients are needed for feature extraction
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
    return cls_embedding.cpu().numpy()  # Move result back to CPU for numpy

        

In [None]:
# Encode every cleaned article one at a time, then stack the per-article
# arrays row-wise into a single matrix.
embeddings = np.vstack([
    bert_embeddings(article, tokenizer, model)
    for article in df['NewsContent']
])


In [None]:
import torch

# Environment report. torch.cuda.get_device_name(0) raises on a machine
# without a usable CUDA device, so it is only queried when CUDA is available.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("CUDA device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA device name:", "n/a (no CUDA device)")
print("Torch CUDA version:", torch.version.cuda)
print("Torch version:", torch.__version__)


In [None]:
# Display the stacked embedding matrix.
embeddings

In [None]:
# Class balance: number of rows per label value.
df['label'].value_counts()

In [None]:
# Features are the BERT embeddings; targets are the raw label values.
x, y = embeddings, df['label'].values

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE(review): `classes` is reassigned from np.unique(y) in the class-weight
# cell, so this placeholder looks redundant — confirm before removing.
classes = np.array([0, 1])

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import numpy as np


# Unwrap a pandas object to its underlying array; plain arrays pass through.
y = getattr(y, 'values', y)

classes = np.unique(y)
print(f"Found {len(classes)} classes: {classes}")

# One weight per entry of `classes`, using scikit-learn's 'balanced' scheme.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
print(f"Class weights: {weights}")

# Float tensor of the weights on the selected device, used to weight the loss.
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the label and seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [None]:
# Inspect the encoded test labels.
y_test

In [None]:
# Inspect the training feature matrix.
x_train

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate gradient-boosting classifiers, keyed by the short name that also
# indexes param_grid.
models = {
    "xgbc": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "cbc": CatBoostClassifier(verbose=0),
    "lgbm": LGBMClassifier()
}

# Hyper-parameter candidates per model for the randomized search.
param_grid = {
    "xgbc": {
        "n_estimators": [20, 50, 100, 200],
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6],
        # This key was previously listed twice; the second entry silently
        # overwrote the first, so one entry is equivalent.
        # NOTE(review): the lgbm grid also sets "colsample_bytree" — the
        # duplicate may have been meant to be that parameter; confirm.
        "subsample": [0.8],
    },
    "cbc": {
        "iterations": [20, 50, 100, 200],
        "learning_rate": [0.01, 0.1],
        "depth": [4, 6],
        "l2_leaf_reg": [3, 5],
        "subsample": [0.8]
    },
    "lgbm": {
        "n_estimators": [20, 50, 100, 200],
        "learning_rate": [0.01, 0.1],
        "num_leaves": [10, 21],
        "max_depth": [5, 7],
        "subsample": [0.8],
        "colsample_bytree": [0.8]
    }
}

In [None]:
# Extra XGBoost settings applied on top of the estimator's constructor arguments.
xgb_memory_params = dict(
    tree_method="hist",
    grow_policy="lossguide",
    max_bin=256,
    single_precision_histogram=True,
)
models["xgbc"].set_params(**xgb_memory_params)

# Extra LightGBM settings (CPU training, histogram binning controls).
lgbm_memory_params = dict(
    boosting_type="gbdt",
    device_type="cpu",
    max_bin=255,
    bin_construct_sample_cnt=20000,
)
models["lgbm"].set_params(**lgbm_memory_params)

# Extra CatBoost settings (row subsampling and a RAM limit).
catboost_memory_params = dict(
    bootstrap_type="Bernoulli",
    subsample=0.8,
    used_ram_limit="3gb",
)
models["cbc"].set_params(**catboost_memory_params)

In [None]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

# Weighted F1 is the selection metric for every search below.
scorer = make_scorer(f1_score, average='weighted')

models["xgbc"].set_params(tree_method="hist", grow_policy="lossguide")
models["lgbm"].set_params(verbose=-1, device_type="cpu")


def _run_search(estimator, distributions, n_iter, **search_kwargs):
    """Fit one RandomizedSearchCV on the training split and summarise its winner.

    Args:
        estimator: Unfitted classifier to tune.
        distributions: Mapping of parameter name -> list of candidate values.
        n_iter: Number of sampled parameter settings.
        **search_kwargs: Extra RandomizedSearchCV arguments (e.g. error_score).

    Returns:
        Dict with keys "best_estimator", "best_score" and "best_params".
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=distributions,
        n_iter=n_iter,
        scoring=scorer,
        cv=2,
        random_state=42,
        verbose=1,
        n_jobs=1,
        **search_kwargs,
    )
    search.fit(x_train, y_train)
    return {
        "best_estimator": search.best_estimator_,
        "best_score": search.best_score_,
        "best_params": search.best_params_
    }


def _fallback_grid(grid):
    """Reduce a grid to its first candidate per parameter, with 50 boosting rounds.

    The round count is written under whichever key the grid already uses
    ("n_estimators" or "iterations"). Adding "n_estimators" next to CatBoost's
    "iterations" would set two aliases of the same parameter at once.
    """
    simple = {key: [values[0]] for key, values in grid.items()}
    for count_key in ("n_estimators", "iterations"):
        if count_key in simple:
            simple[count_key] = [50]
    return simple


best_models = {}

# The loop variable is deliberately not called `model`: that name holds the
# BERT encoder used by the embedding cells and must not be overwritten.
for name, estimator in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    try:
        # error_score='raise' surfaces the first failing fit so the fallback can run.
        best_models[name] = _run_search(
            estimator, param_grid[name], n_iter=3, error_score='raise'
        )
        print(f"Completed {name} with best score: {best_models[name]['best_score']:.4f}")

    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        try:
            print(f"Trying simpler parameters for {name}...")
            best_models[name] = _run_search(
                estimator, _fallback_grid(param_grid[name]), n_iter=1
            )
            print(f"Completed {name} with simple parameters. Score: {best_models[name]['best_score']:.4f}")

        except Exception as e2:
            print(f"Failed to train {name} even with simple parameters: {str(e2)}")
            # Record the failure so the summary can report it.
            best_models[name] = {
                "best_estimator": None,
                "best_score": 0,
                "best_params": {},
                "error": str(e2)
            }

print(f"\n{'='*50}")
print("TRAINING SUMMARY")
print(f"{'='*50}")

for name, result in best_models.items():
    print(f"\n{name}")
    if result["best_estimator"] is not None:
        print(f"Best F1 Score: {result['best_score']:.4f}")
        print(f"Best Params: {result['best_params']}")
    else:
        print(f"Failed to train. Error: {result.get('error', 'Unknown error')}")

In [None]:
# Final LightGBM classifier with fixed hyper-parameters, balanced class
# weights and a fixed seed.
lgbm = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=10,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
)

In [None]:
# Train the final classifier on the training split.
lgbm.fit(x_train,y_train)

In [None]:
# Score on the held-out split (presumably mean accuracy for a classifier — confirm).
lgbm.score(x_test,y_test)

In [None]:
# Predicted labels for the held-out features.
predict = lgbm.predict(x_test)

In [None]:
# scikit-learn's signature is f1_score(y_true, y_pred): ground truth goes first.
# NOTE(review): this uses the default averaging, while the model search scored
# with average='weighted' — confirm which metric should be reported.
evaluation = f1_score(y_test, predict)

In [None]:
# Display the F1 score of the final classifier on the test split.
evaluation

In [None]:
import joblib

# Persist the fitted classifier. A failure is reported instead of raised so
# the in-memory model stays usable.
# NOTE(review): the LabelEncoder `le` is not saved with the model; decoding
# predictions back to the original labels elsewhere would need it — confirm.
try:
    joblib.dump(lgbm, 'lightgbm.pkl')
    print("Model saved to 'lightgbm.pkl'.")
except Exception as e:
    print(f"Error saving files: {e}")