Import Required Libraries & Load Dataset

In [13]:
import pandas as pd
import numpy as np
import re
import spacy
import nltk
from nltk.corpus import stopwords

# download stopwords if not downloaded
nltk.download('stopwords')

# Load dataset
df = pd.read_csv(
    r"D:\Anomaly_hypertext_news_detection\Dataset-20251117T051821Z-1-001\Dataset\Articles.csv",
    encoding="latin1"
)

df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ssudh_o0coipj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,01-01-2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,01-02-2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,01-05-2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,01-06-2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,01-06-2015,us oil prices slip below 50 a barr,business


Fix Missing Data

In [14]:
# Check missing values
df.isnull().sum()



Article     0
Date        0
Heading     0
NewsType    0
dtype: int64

In [15]:
# Fill missing Articles with empty string
df['Article'] = df['Article'].fillna("")

# Drop rows where both Heading and Article are empty
df = df[~((df['Heading'].isna()) & (df['Article']==""))]


Normalize Text (Lowercase + Remove Noise)

In [16]:
def basic_clean(text):
    text = str(text).lower()                               # lowercase
    text = re.sub(r'\d+', '', text)                       # remove numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)               # remove symbols
    text = re.sub(r'\s+', ' ', text).strip()              # remove extra spaces
    return text

df['clean_basic'] = df['Article'].apply(basic_clean)

In [17]:
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic
0,KARACHI: The Sindh government has decided to b...,01-01-2015,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...
1,HONG KONG: Asian markets started 2015 on an up...,01-02-2015,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...
2,HONG KONG: Hong Kong shares opened 0.66 perce...,01-05-2015,hong kong stocks open 0.66 percent lower,business,hong kong hong kong shares opened percent lowe...
3,HONG KONG: Asian markets tumbled Tuesday follo...,01-06-2015,asian stocks sink euro near nine year,business,hong kong asian markets tumbled tuesday follow...
4,NEW YORK: US oil prices Monday slipped below $...,01-06-2015,us oil prices slip below 50 a barr,business,new york us oil prices monday slipped below a ...


Remove Stopwords

In [18]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

df['no_stopwords'] = df['clean_basic'].apply(remove_stopwords)


In [19]:
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords
0,KARACHI: The Sindh government has decided to b...,01-01-2015,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...
1,HONG KONG: Asian markets started 2015 on an up...,01-02-2015,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...
2,HONG KONG: Hong Kong shares opened 0.66 perce...,01-05-2015,hong kong stocks open 0.66 percent lower,business,hong kong hong kong shares opened percent lowe...,hong kong hong kong shares opened percent lowe...
3,HONG KONG: Asian markets tumbled Tuesday follo...,01-06-2015,asian stocks sink euro near nine year,business,hong kong asian markets tumbled tuesday follow...,hong kong asian markets tumbled tuesday follow...
4,NEW YORK: US oil prices Monday slipped below $...,01-06-2015,us oil prices slip below 50 a barr,business,new york us oil prices monday slipped below a ...,new york us oil prices monday slipped barrel f...


Lemmatization (SpaCy NLP)

In [20]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df['lemma_text'] = df['no_stopwords'].apply(lemmatize_text)


Create Text-Based Features

In [21]:
df['word_count'] = df['Article'].apply(lambda x: len(str(x).split()))
df['sentence_count'] = df['Article'].apply(lambda x: x.count('.') + 1)
df['avg_word_length'] = df['Article'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))


Detect Numbers Mentioned (Fake news often exaggerates)

In [22]:
df['digit_count'] = df['Article'].apply(lambda x: sum(char.isdigit() for char in x))


Detect Emotional / Exaggeration Words

In [23]:
emotional_words = ["breaking", "shocking", "urgent", "panic", "exclusive", "alert"]

def emotional_score(text):
    return sum(text.lower().count(word) for word in emotional_words)

df['emotion_score'] = df['Article'].apply(emotional_score)


In [24]:
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,digit_count,emotion_score
0,KARACHI: The Sindh government has decided to b...,01-01-2015,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,3,0
1,HONG KONG: Asian markets started 2015 on an up...,01-02-2015,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...,hong kong asian market start upswe limited tra...,736,94,5.487772,267,0
2,HONG KONG: Hong Kong shares opened 0.66 perce...,01-05-2015,hong kong stocks open 0.66 percent lower,business,hong kong hong kong shares opened percent lowe...,hong kong hong kong shares opened percent lowe...,hong kong hong kong share open percent lower m...,37,6,4.864865,15,0
3,HONG KONG: Asian markets tumbled Tuesday follo...,01-06-2015,asian stocks sink euro near nine year,business,hong kong asian markets tumbled tuesday follow...,hong kong asian markets tumbled tuesday follow...,hong kong asian market tumble tuesday follow p...,523,46,5.141491,109,0
4,NEW YORK: US oil prices Monday slipped below $...,01-06-2015,us oil prices slip below 50 a barr,business,new york us oil prices monday slipped below a ...,new york us oil prices monday slipped barrel f...,new york us oil price monday slip barrel first...,606,40,5.166667,69,0


In [25]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,digit_count,emotion_score
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,3,0
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...,hong kong asian market start upswe limited tra...,736,94,5.487772,267,0
2,HONG KONG: Hong Kong shares opened 0.66 perce...,2015-01-05,hong kong stocks open 0.66 percent lower,business,hong kong hong kong shares opened percent lowe...,hong kong hong kong shares opened percent lowe...,hong kong hong kong share open percent lower m...,37,6,4.864865,15,0
3,HONG KONG: Asian markets tumbled Tuesday follo...,2015-01-06,asian stocks sink euro near nine year,business,hong kong asian markets tumbled tuesday follow...,hong kong asian markets tumbled tuesday follow...,hong kong asian market tumble tuesday follow p...,523,46,5.141491,109,0
4,NEW YORK: US oil prices Monday slipped below $...,2015-01-06,us oil prices slip below 50 a barr,business,new york us oil prices monday slipped below a ...,new york us oil prices monday slipped barrel f...,new york us oil price monday slip barrel first...,606,40,5.166667,69,0


In [26]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayName'] = df['Date'].dt.day_name()          # Monday, Tuesday...
df['MonthName'] = df['Date'].dt.month_name()      # January, February...
df['Week'] = df['Date'].dt.isocalendar().week      # ISO week number
df['Quarter'] = df['Date'].dt.quarter             # Q1, Q2, Q3, Q4


In [27]:
df.head(2)

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,digit_count,emotion_score,Year,Month,Day,DayName,MonthName,Week,Quarter
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,3,0,2015.0,1.0,1.0,Thursday,January,1,1.0
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...,hong kong asian market start upswe limited tra...,736,94,5.487772,267,0,2015.0,1.0,2.0,Friday,January,1,1.0


In [28]:
# Cyclic Encoding (Useful for ML models)

# Dates are cyclical ‚Äî Monday is close to Sunday, and January close to December, but numbers don't reflect that.

import numpy as np

df['Month_sin'] = np.sin(2 * np.pi * df['Month']/12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month']/12)

df['Day_sin'] = np.sin(2 * np.pi * df['Day']/31)
df['Day_cos'] = np.cos(2 * np.pi * df['Day']/31)

df['Week_sin'] = np.sin(2 * np.pi * df['Week']/52)
df['Week_cos'] = np.cos(2 * np.pi * df['Week']/52)


In [29]:
df.head(2)

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,...,DayName,MonthName,Week,Quarter,Month_sin,Month_cos,Day_sin,Day_cos,Week_sin,Week_cos
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,...,Thursday,January,1,1.0,0.5,0.866025,0.201299,0.97953,0.120537,0.992709
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...,hong kong asian market start upswe limited tra...,736,94,5.487772,...,Friday,January,1,1.0,0.5,0.866025,0.394356,0.918958,0.120537,0.992709


In [30]:
# Special Calendar-Based Features

# These help detect events, crisis spikes, or holiday-related news anomalies.
df['Is_Weekend'] = df['DayName'].isin(['Saturday', 'Sunday']).astype(int)
df['Is_Start_of_Month'] = (df['Day'] <= 3).astype(int)
df['Is_End_of_Month'] = (df['Day'] >= 28).astype(int)
df['Is_Quarter_Start'] = df['Month'].isin([1, 4, 7, 10]).astype(int)
df['Is_Quarter_End'] = df['Month'].isin([3, 6, 9, 12]).astype(int)


In [31]:
df.head(1)

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,...,Month_cos,Day_sin,Day_cos,Week_sin,Week_cos,Is_Weekend,Is_Start_of_Month,Is_End_of_Month,Is_Quarter_Start,Is_Quarter_End
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,...,0.866025,0.201299,0.97953,0.120537,0.992709,0,1,0,1,0


In [32]:
# Advanced Temporal Features (Important for Anomaly Detection)
# Days Since Previous Article 
df = df.sort_values(by='Date')
df['Days_Since_Last'] = df['Date'].diff().dt.days
df['Days_Since_Last'] = df['Days_Since_Last'].fillna(0)


In [33]:
df.head(1)

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,...,Day_sin,Day_cos,Week_sin,Week_cos,Is_Weekend,Is_Start_of_Month,Is_End_of_Month,Is_Quarter_Start,Is_Quarter_End,Days_Since_Last
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,...,0.201299,0.97953,0.120537,0.992709,0,1,0,1,0,0.0


In [34]:
# Publication Frequency per Location / Category (future step ‚Äî after extracting location) 
df['Publish_Count_By_Month'] = df.groupby(['Year','Month'])['Article'].transform('count')


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2692 entries, 0 to 2691
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Article                 2692 non-null   object        
 1   Date                    1164 non-null   datetime64[ns]
 2   Heading                 2692 non-null   object        
 3   NewsType                2692 non-null   object        
 4   clean_basic             2692 non-null   object        
 5   no_stopwords            2692 non-null   object        
 6   lemma_text              2692 non-null   object        
 7   word_count              2692 non-null   int64         
 8   sentence_count          2692 non-null   int64         
 9   avg_word_length         2692 non-null   float64       
 10  digit_count             2692 non-null   int64         
 11  emotion_score           2692 non-null   int64         
 12  Year                    1164 non-null   float64      

In [36]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return ', '.join(set(locations)) if locations else None

df["Location"] = df["Article"].apply(extract_location)
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,...,Week_sin,Week_cos,Is_Weekend,Is_Start_of_Month,Is_End_of_Month,Is_Quarter_Start,Is_Quarter_End,Days_Since_Last,Publish_Count_By_Month,Location
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,...,0.120537,0.992709,0,1,0,1,0,0.0,8.0,"Sindh, Karachi"
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,hong kong asian markets started on an upswing ...,hong kong asian markets started upswing limite...,hong kong asian market start upswe limited tra...,736,94,5.487772,...,0.120537,0.992709,0,1,0,1,0,1.0,8.0,"Singapore, Thailand, Jakarta, Philippines, US,..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,2015-01-05,hong kong stocks open 0.66 percent lower,business,hong kong hong kong shares opened percent lowe...,hong kong hong kong shares opened percent lowe...,hong kong hong kong share open percent lower m...,37,6,4.864865,...,0.239316,0.970942,0,0,0,1,0,3.0,8.0,"Hong Kong, HONG KONG"
3,HONG KONG: Asian markets tumbled Tuesday follo...,2015-01-06,asian stocks sink euro near nine year,business,hong kong asian markets tumbled tuesday follow...,hong kong asian markets tumbled tuesday follow...,hong kong asian market tumble tuesday follow p...,523,46,5.141491,...,0.239316,0.970942,0,0,0,1,0,1.0,8.0,"Greece, Athens, Seoul, Milan, Europe, US, Toky..."
4,NEW YORK: US oil prices Monday slipped below $...,2015-01-06,us oil prices slip below 50 a barr,business,new york us oil prices monday slipped below a ...,new york us oil prices monday slipped barrel f...,new york us oil price monday slip barrel first...,606,40,5.166667,...,0.239316,0.970942,0,0,0,1,0,0.0,8.0,"NEW YORK, Saudi Arabia, Iraq, Vienna, Russia, ..."


In [37]:
df.head(1)

Unnamed: 0,Article,Date,Heading,NewsType,clean_basic,no_stopwords,lemma_text,word_count,sentence_count,avg_word_length,...,Week_sin,Week_cos,Is_Weekend,Is_Start_of_Month,Is_End_of_Month,Is_Quarter_Start,Is_Quarter_End,Days_Since_Last,Publish_Count_By_Month,Location
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,karachi the sindh government has decided to br...,karachi sindh government decided bring public ...,karachi sindh government decide bring public t...,106,6,5.433962,...,0.120537,0.992709,0,1,0,1,0,0.0,8.0,"Sindh, Karachi"


In [38]:
import pandas as pd

# Show current columns
print("Before Cleaning:", df.columns.tolist())

# Optional: merge heading + article into a single text field
df["full_text"] = df["Heading"].astype(str) + " " + df["Article"].astype(str)

# Columns to drop
columns_to_drop = [
    "Date", 
    "clean_basic", 
    "no_stopwords", 
    "lemma_text",
    "MonthName",
    "DayName",
    "Hour", "Minute", "Second"
]

# Drop columns if present (avoids errors)
df = df.drop(columns=[c for c in columns_to_drop if c in df.columns], errors="ignore")

print("\nAfter Cleaning:", df.columns.tolist())

# Rearranging columns for final dataset
final_column_order = [
    "full_text", "Heading", "Article", "NewsType",
    "Location",  
    "digit_count", "emotion_score", "Sentiment", "Readability",
    "Year", "Month", "Day", "Quarter", "Week", "Is_Weekend",
    "Month_sin", "Month_cos", "Week_sin", "Week_cos",
    "Days_Since_Last", "Publish_Count_By_Month"
]

df = df[[col for col in final_column_order if col in df.columns]]




Before Cleaning: ['Article', 'Date', 'Heading', 'NewsType', 'clean_basic', 'no_stopwords', 'lemma_text', 'word_count', 'sentence_count', 'avg_word_length', 'digit_count', 'emotion_score', 'Year', 'Month', 'Day', 'DayName', 'MonthName', 'Week', 'Quarter', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'Week_sin', 'Week_cos', 'Is_Weekend', 'Is_Start_of_Month', 'Is_End_of_Month', 'Is_Quarter_Start', 'Is_Quarter_End', 'Days_Since_Last', 'Publish_Count_By_Month', 'Location']

After Cleaning: ['Article', 'Heading', 'NewsType', 'word_count', 'sentence_count', 'avg_word_length', 'digit_count', 'emotion_score', 'Year', 'Month', 'Day', 'Week', 'Quarter', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'Week_sin', 'Week_cos', 'Is_Weekend', 'Is_Start_of_Month', 'Is_End_of_Month', 'Is_Quarter_Start', 'Is_Quarter_End', 'Days_Since_Last', 'Publish_Count_By_Month', 'Location', 'full_text']


In [39]:
# Preview final dataset
df.head(2)

Unnamed: 0,full_text,Heading,Article,NewsType,Location,digit_count,emotion_score,Year,Month,Day,Quarter,Week,Is_Weekend,Month_sin,Month_cos,Week_sin,Week_cos,Days_Since_Last,Publish_Count_By_Month
0,sindh govt decides to cut public transport far...,sindh govt decides to cut public transport far...,KARACHI: The Sindh government has decided to b...,business,"Sindh, Karachi",3,0,2015.0,1.0,1.0,1.0,1,0,0.5,0.866025,0.120537,0.992709,0.0,8.0
1,asia stocks up in new year trad HONG KONG: Asi...,asia stocks up in new year trad,HONG KONG: Asian markets started 2015 on an up...,business,"Singapore, Thailand, Jakarta, Philippines, US,...",267,0,2015.0,1.0,2.0,1.0,1,0,0.5,0.866025,0.120537,0.992709,1.0,8.0


In [40]:
import numpy as np
print(np.__version__)


1.26.4


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df['full_text'])

df['top_keywords'] = df['full_text'].apply(
    lambda x: ", ".join(sorted(tfidf.vocabulary_, key=lambda w: tfidf.vocabulary_[w])[:5])
)


In [46]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embedding = model.encode(["Hello world"])

print("Embedding length:", len(embedding[0]))


Embedding length: 384


In [51]:
import os

sub_path = r"D:\Anomaly_hypertext_news_detection\Dataset-20251117T051821Z-1-001\Dataset"
print(os.listdir(sub_path))


['Articles.csv', 'Articles.xlsx']


In [54]:
df.head(1)

Unnamed: 0,Article,Date,Heading,NewsType,embedding,cluster
0,KARACHI: The Sindh government has decided to b...,01-01-2015,sindh govt decides to cut public transport far...,business,"[0.015569633804261684, 0.05348445102572441, 0....",6


In [57]:
# import os
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from sklearn.cluster import KMeans
# from transformers import pipeline
# import spacy
# import tqdm

# # -------------------------------
# # Assume df already exists
# # -------------------------------
# text_column = "Article" if "Article" in df.columns else df.columns[0]

# # -------------------------------
# # 1Ô∏è‚É£ Embeddings (already done? skip)
# # -------------------------------
# if "embedding" not in df.columns:
#     print("\nüîπ Generating Embeddings...")
#     model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

#     texts = df[text_column].fillna("").tolist()

#     embeddings = model.encode(
#         texts,
#         batch_size=32,                # Increased batch = faster
#         show_progress_bar=True,
#         convert_to_numpy=True
#     )

#     df["embedding"] = embeddings.tolist()

# # -------------------------------
# # 2Ô∏è‚É£ Clustering (already done? skip)
# # -------------------------------
# if "cluster" not in df.columns:
#     print("\nüîπ Clustering articles...")
#     NUM_CLUSTERS = 8
#     kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
#     df["cluster"] = kmeans.fit_predict(df["embedding"].tolist())

# # -------------------------------
# # 3Ô∏è‚É£ FAST Zero-Shot Topic Labeling (Batch Mode)
# # -------------------------------
# print("\n‚ö° Assigning topic labels (Optimized Mode)...")

# classifier = pipeline(
#     "zero-shot-classification",
#     model="facebook/bart-large-mnli",   # Much faster and accurate
#     device="cpu"
# )

# candidate_labels = [
#     "politics", "sports", "technology", "crime",
#     "business", "entertainment", "health", "science", "war"
# ]

# topics = []
# batch_size = 16

# for i in tqdm.tqdm(range(0, len(df), batch_size)):
#     batch = df[text_column].iloc[i:i+batch_size].fillna("").tolist()

#     results = classifier(batch, candidate_labels)

#     for res in results:
#         topics.append(res["labels"][0])

# df["topic_label"] = topics

# # -------------------------------
# # 4Ô∏è‚É£ Location Extraction
# # -------------------------------
# print("\nüîπ Extracting locations...")

# nlp = spacy.load("en_core_web_sm")

# def extract_location(text):
#     doc = nlp(str(text))
#     locs = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
#     return locs[0] if locs else "Unknown"

# df["location"] = df[text_column].apply(extract_location)

# # -------------------------------
# # 5Ô∏è‚É£ Save File
# # -------------------------------
# output_path = r"D:\Anomaly_hypertext_news_detection\Dataset-20251117T051821Z-1-001\Dataset\Cleaned_Dataset.csv"
# df.to_csv(output_path, index=False, encoding="utf-8")

# print("\n‚úÖ Completed Successfully!")
# print(f"üìÅ File saved at:\n‚û° {output_path}")


In [62]:
df.columns


Index(['Article', 'Date', 'Heading', 'NewsType', 'embedding', 'cluster'], dtype='object')

In [63]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import tqdm

# Auto detect text column
possible_cols = ["cleaned_text", "text", "content", "article", "body", "news", "processed_text"]
text_column = None

for col in df.columns:
    if col.lower() in possible_cols:
        text_column = col
        break

if text_column is None:
    raise Exception("‚ùå No valid text column found. Please rename your text column to 'cleaned_text' or tell me its name.")

print(f"üìå Using text column: {text_column}")

print("üìå Loading lightweight topic classifier...")
model_name = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cpu")

labels = model.config.id2label
print("üîç Model labels:", labels)

def classify_topic(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        output = model(**tokens)
    scores = torch.softmax(output.logits, dim=1).numpy()[0]
    return labels[int(np.argmax(scores))]

print("‚ö° Assigning topics...")

df["topic_label"] = [
    classify_topic(text) if isinstance(text, str) and text.strip() else "Unknown"
    for text in tqdm.tqdm(df[text_column])
]

print("üéâ Topic labeling completed!")

# Save
df.to_csv("cleaned_final_dataset.csv", index=False)
print("üíæ Saved as cleaned_final_dataset.csv")


üìå Using text column: Article
üìå Loading lightweight topic classifier...
üîç Model labels: {0: 'arts_&_culture', 1: 'business_&_entrepreneurs', 2: 'celebrity_&_pop_culture', 3: 'diaries_&_daily_life', 4: 'family', 5: 'fashion_&_style', 6: 'film_tv_&_video', 7: 'fitness_&_health', 8: 'food_&_dining', 9: 'gaming', 10: 'learning_&_educational', 11: 'music', 12: 'news_&_social_concern', 13: 'other_hobbies', 14: 'relationships', 15: 'science_&_technology', 16: 'sports', 17: 'travel_&_adventure', 18: 'youth_&_student_life'}
‚ö° Assigning topics...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2692/2692 [06:46<00:00,  6.62it/s]


üéâ Topic labeling completed!
üíæ Saved as cleaned_final_dataset.csv


In [42]:
import numpy as np
import torch

print("NumPy:", np.__version__)
print("Torch:", torch.__version__)
print("Tensor ‚Üí NumPy test:", torch.tensor([1,2,3]).numpy())


NumPy: 1.26.4
Torch: 2.9.1+cpu
Tensor ‚Üí NumPy test: [1 2 3]


In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import numpy as np
import accelerate
print(accelerate.__version__)

model_name = "roberta-base"

# Load dataset
dataset = load_dataset("imdb")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized_ds = dataset.map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Training Settings (FIXED: removed unsupported evaluation_strategy)
training_args = TrainingArguments(
    output_dir="./news_class_model",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    logging_steps=20,
    save_steps=1000,
    no_cuda=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"].select(range(5000)),  # Smaller subset for CPU
    eval_dataset=tokenized_ds["test"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Manual evaluation (since old version doesn't auto_eval)
print("\nEvaluating model...")
results = trainer.evaluate()
print("Evaluation Results:", results)


1.12.0


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:16<00:00, 3052.34 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
20,0.0923
40,0.0002
60,0.0001
80,0.0001
100,0.0001
120,0.0001
140,0.0
160,0.0
180,0.0
200,0.0



Evaluating model...


Evaluation Results: {'eval_loss': 4.059545517520746e-06, 'eval_accuracy': 1.0, 'eval_runtime': 397.4412, 'eval_samples_per_second': 2.516, 'eval_steps_per_second': 0.315, 'epoch': 2.0}


In [38]:
import os

print(os.listdir())


['Cleaning.ipynb', 'news_class_model']


In [40]:
import os

print(os.listdir("news_class_model"))


['checkpoint-1000', 'checkpoint-1250']


In [42]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "news_class_model/checkpoint-1250"
save_dir = "news_class_model_final"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)

print("Model saved successfully to:", save_dir)


Model saved successfully to: news_class_model_final


In [44]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="news_class_model_final",
    tokenizer="news_class_model_final",
    device="cpu"
)

text = "India reports major breakthrough in science and technology..."
result = classifier(text)

print(result)


Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.9999954700469971}]


In [2]:
from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli"
)

candidate_labels = [
    "politics", "sports", "technology", "crime", 
    "business", "entertainment", "health", "science", "war"
]

text = "Apple releases new AI powered chip for iPhones."

result = classifier(text, candidate_labels)
print(result)


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


{'sequence': 'Apple releases new AI powered chip for iPhones.', 'labels': ['technology', 'business', 'crime', 'science', 'sports', 'war', 'politics', 'health', 'entertainment'], 'scores': [0.3246600925922394, 0.16670523583889008, 0.11062277853488922, 0.10744946449995041, 0.08850058913230896, 0.07212882488965988, 0.05216379091143608, 0.0392000637948513, 0.03856910765171051]}


In [11]:
import os

hf_cache_dir = "D:/Anomaly_hypertext_news_detection/hf_cache"
os.environ["HF_HOME"] = hf_cache_dir          # Hugging Face main cache
os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir
os.environ["HF_DATASETS_CACHE"] = hf_cache_dir
os.environ["HF_METRICS_CACHE"] = hf_cache_dir


In [18]:
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# 1Ô∏è‚É£ News Type Classification
# -----------------------------
# HF token optional if model is public
hf_token = "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"  # Optional if model is public

news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,  # optional
    device=-1  # CPU
)

# Candidate labels for News Type
candidate_labels = [
    "technology news", 
    "sports news", 
    "political news", 
    "crime report", 
    "entertainment news",
    "financial/business news",
    "health and medicine",
    "science discovery",
    "war or conflict"
]

# -----------------------------
# 2Ô∏è‚É£ Sentiment Analysis (VADER)
# -----------------------------
sentiment_analyzer = SentimentIntensityAnalyzer()

# -----------------------------
# 3Ô∏è‚É£ Example Article
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."

# Predict News Type
news_result = news_classifier(article, candidate_labels)
news_type = news_result['labels'][0]
news_score = news_result['scores'][0]

# Predict Sentiment
scores = sentiment_analyzer.polarity_scores(article)
compound = scores['compound']

if compound >= 0.05:
    sentiment_label = "Positive"
elif compound <= -0.05:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

# -----------------------------
# 4Ô∏è‚É£ Display Results
# -----------------------------
print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
print(f"Predicted Sentiment: {sentiment_label} (compound score: {compound})")


Device set to use cpu


Predicted News Type: war or conflict (score: 0.42)
Predicted Sentiment: Positive (compound score: 0.4019)


In [20]:
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# 1Ô∏è‚É£ Hugging Face Token (Optional if model is public)
# -----------------------------
hf_token = "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"

# -----------------------------
# 2Ô∏è‚É£ Zero-Shot News Type Classification
# -----------------------------
news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,  # optional
    device=-1  # CPU
)

# Refined candidate labels
candidate_labels = [
    "technology news",
    "science news",
    "business news",
    "politics news",
    "sports news",
    "entertainment news",
    "health news",
    "crime news",
    "environment news",
    "war or conflict news"
]

# -----------------------------
# 3Ô∏è‚É£ Sentiment Analysis (VADER)
# -----------------------------
sentiment_analyzer = SentimentIntensityAnalyzer()

# -----------------------------
# 4Ô∏è‚É£ Function to Analyze Article
# -----------------------------
def analyze_article(article_text):
    # News Type Prediction
    news_result = news_classifier(article_text, candidate_labels)
    news_type = news_result['labels'][0]
    news_score = news_result['scores'][0]

    # Sentiment Prediction
    scores = sentiment_analyzer.polarity_scores(article_text)
    compound = scores['compound']
    if compound >= 0.05:
        sentiment_label = "Positive"
    elif compound <= -0.05:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    # Display Results
    print(f"Article: {article_text}")
    print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
    print(f"Predicted Sentiment: {sentiment_label} (compound score: {compound:.4f})")

# -----------------------------
# 5Ô∏è‚É£ Example Usage
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."
analyze_article(article)


Device set to use cpu


Article: Apple releases new AI-powered chip for iPhones, promising faster performance.
Predicted News Type: technology news (score: 0.43)
Predicted Sentiment: Positive (compound score: 0.4019)


In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# Paths
# -----------------------------
model_path = "./news_class_model_final"

# -----------------------------
# 1Ô∏è‚É£ Load your fine-tuned model
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Create a sentiment analysis pipeline using your fine-tuned model
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1  # CPU
)

# -----------------------------
# 2Ô∏è‚É£ News Type Zero-Shot Classifier
# -----------------------------
hf_token = "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"  # Optional if model public
news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,
    device=-1  # CPU
)

candidate_labels = [
    "technology news", 
    "sports news", 
    "political news", 
    "crime report", 
    "entertainment news",
    "financial/business news",
    "health and medicine",
    "science discovery",
    "war or conflict"
]

# -----------------------------
# 3Ô∏è‚É£ Example Article
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."

# Predict News Type
news_result = news_classifier(article, candidate_labels)
news_type = news_result['labels'][0]
news_score = news_result['scores'][0]

# Predict Sentiment
sentiment_result = sentiment_analyzer(article)
sentiment_label = sentiment_result[0]['label']
sentiment_score = sentiment_result[0]['score']

# -----------------------------
# 4Ô∏è‚É£ Display Results
# -----------------------------
print(f"Article: {article}")
print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
print(f"Predicted Sentiment: {sentiment_label} (score: {sentiment_score:.2f})")


Device set to use cpu
Device set to use cpu


Article: Apple releases new AI-powered chip for iPhones, promising faster performance.
Predicted News Type: war or conflict (score: 0.42)
Predicted Sentiment: LABEL_0 (score: 1.00)


In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# -----------------------------
# Paths
# -----------------------------
model_path = "./news_class_model_final"

# -----------------------------
# 1Ô∏è‚É£ Load your fine-tuned sentiment model
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Map label IDs to sentiment names (IMDB style)
label_mapping = {0: "Negative", 1: "Positive"}

# Sentiment pipeline
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1  # CPU
)

# -----------------------------
# 2Ô∏è‚É£ News Type Zero-Shot Classifier
# -----------------------------
hf_token = "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"  # optional if model public
news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,
    device=-1  # CPU
)

candidate_labels = [
    "technology news", 
    "sports news", 
    "political news", 
    "crime report", 
    "entertainment news",
    "financial/business news",
    "health and medicine",
    "science discovery",
    "war or conflict"
]

# -----------------------------
# 3Ô∏è‚É£ Example Article
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."

# -----------------------------
# 4Ô∏è‚É£ Predict News Type
# -----------------------------
news_result = news_classifier(article, candidate_labels)
news_type = news_result['labels'][0]
news_score = news_result['scores'][0]

# -----------------------------
# 5Ô∏è‚É£ Predict Sentiment
# -----------------------------
sentiment_result = sentiment_analyzer(article)
# Convert LABEL_0 / LABEL_1 to human-readable label
pred_label_id = int(sentiment_result[0]['label'].split("_")[1])
sentiment_label = label_mapping[pred_label_id]
sentiment_score = sentiment_result[0]['score']

# -----------------------------
# 6Ô∏è‚É£ Display Results
# -----------------------------
print(f"Article: {article}")
print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
print(f"Predicted Sentiment: {sentiment_label} (score: {sentiment_score:.2f})")


Device set to use cpu
Device set to use cpu


Article: Apple releases new AI-powered chip for iPhones, promising faster performance.
Predicted News Type: war or conflict (score: 0.42)
Predicted Sentiment: Negative (score: 1.00)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# Paths
# -----------------------------
model_path = "./news_class_model_final"  # Your fine-tuned sentiment model

# -----------------------------
# 1Ô∏è‚É£ Load Fine-Tuned Sentiment Model
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Create sentiment analysis pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1  # CPU
)

# VADER fallback analyzer
vader_analyzer = SentimentIntensityAnalyzer()

# Map numeric labels from your fine-tuned model
label_map = {"LABEL_0": "Negative", "LABEL_1": "Positive"}

# -----------------------------
# 2Ô∏è‚É£ Load Zero-Shot News Classifier
# -----------------------------
hf_token = "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"  # Optional if model public
news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,
    device=-1  # CPU
)

candidate_labels = [
    "technology news", 
    "sports news", 
    "political news", 
    "crime report", 
    "entertainment news",
    "financial/business news",
    "health and medicine",
    "science discovery",
    "war or conflict"
]

# -----------------------------
# 3Ô∏è‚É£ Example Article
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."

# -----------------------------
# 4Ô∏è‚É£ Predict News Type
# -----------------------------
news_result = news_classifier(article, candidate_labels)
news_type = news_result['labels'][0]
news_score = news_result['scores'][0]

# -----------------------------
# 5Ô∏è‚É£ Predict Sentiment
# -----------------------------
sentiment_result = sentiment_pipeline(article)
raw_label = sentiment_result[0]['label']
sentiment_label = label_map.get(raw_label, raw_label)
sentiment_score = sentiment_result[0]['score']

# Use VADER as fallback if confidence is low
if sentiment_score < 0.6:
    vader_scores = vader_analyzer.polarity_scores(article)
    compound = vader_scores['compound']
    if compound >= 0.05:
        sentiment_label = "Positive"
    elif compound <= -0.05:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"
    sentiment_score = compound

# -----------------------------
# 6Ô∏è‚É£ Display Results
# -----------------------------
print(f"Article: {article}")
print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
print(f"Predicted Sentiment: {sentiment_label} (score: {sentiment_score:.2f})")


Device set to use cpu
Device set to use cpu


Article: Apple releases new AI-powered chip for iPhones, promising faster performance.
Predicted News Type: war or conflict (score: 0.42)
Predicted Sentiment: Negative (score: 1.00)


In [3]:
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# 1Ô∏è‚É£ Hugging Face Zero-Shot Classifier
# -----------------------------
hf_token = "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"  # Optional if model public

news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,
    device=-1  # CPU
)

# Candidate labels (more relevant to tech/news)
candidate_labels = [
    "technology news", 
    "sports news", 
    "political news", 
    "crime report", 
    "entertainment news",
    "financial/business news",
    "health and medicine",
    "science discovery"
]

# -----------------------------
# 2Ô∏è‚É£ Sentiment Analysis (VADER)
# -----------------------------
sentiment_analyzer = SentimentIntensityAnalyzer()

# -----------------------------
# 3Ô∏è‚É£ Example Article
# -----------------------------
article = "Apple releases new AI-powered chip for iPhones, promising faster performance."

# -----------------------------
# 4Ô∏è‚É£ Predict News Type
# -----------------------------
news_result = news_classifier(article, candidate_labels)
news_type = news_result['labels'][0]
news_score = news_result['scores'][0]

# -----------------------------
# 5Ô∏è‚É£ Predict Sentiment
# -----------------------------
scores = sentiment_analyzer.polarity_scores(article)
compound = scores['compound']

if compound >= 0.05:
    sentiment_label = "Positive"
elif compound <= -0.05:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

# -----------------------------
# 6Ô∏è‚É£ Display Results
# -----------------------------
print(f"Article: {article}")
print(f"Predicted News Type: {news_type} (score: {news_score:.2f})")
print(f"Predicted Sentiment: {sentiment_label} (compound score: {compound:.2f})")


Device set to use cpu


Article: Apple releases new AI-powered chip for iPhones, promising faster performance.
Predicted News Type: technology news (score: 0.56)
Predicted Sentiment: Positive (compound score: 0.40)


In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "MoritzLaurer/deberta-v3-base-mnli"

# Load model from HF
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Correct path to save
save_path = r"D:\Anomaly_hypertext_news_detection\Models"

# Save model + tokenizer locally
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("üéâ Model saved successfully at:", save_path)


üéâ Model saved successfully at: D:\Anomaly_hypertext_news_detection\Models


In [9]:
from transformers import pipeline

saved_model_path = r"D:\Anomaly_hypertext_news_detection\Models"

news_classifier = pipeline(
    "zero-shot-classification",
    model=saved_model_path,
    tokenizer=saved_model_path,
    device=-1  # CPU
)

print("Model Loaded Locally!")


The tokenizer you are loading from 'D:\Anomaly_hypertext_news_detection\Models' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Device set to use cpu


Model Loaded Locally!


In [None]:
# ==============================
# üì∞ Advanced News Analysis Script
# ==============================

from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re

# ----------------------------
# 1Ô∏è‚É£ ZERO-SHOT MULTI-CATEGORY MODEL
# ----------------------------
hf_token = "#######################"

news_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-mnli",
    use_auth_token=hf_token,
    device=-1  # CPU mode
)

candidate_labels = [
    "technology news", "sports news", "political news", "crime report",
    "entertainment news","financial/business news",
    "health and medicine","science discovery","travel news",
    "breaking news","disaster news","protest news",
    "environmental news","political/social news"
]

# ----------------------------
# 2Ô∏è‚É£ SENTIMENT ANALYZER
# ----------------------------
sentiment_analyzer = SentimentIntensityAnalyzer()

# ----------------------------
# 3Ô∏è‚É£ ANOMALY DETECTOR
# ----------------------------
BASELINE_PATH = r"D:\Anomaly_hypertext_news_detection\Preprocessing\cleaned_final_dataset.csv"

class NewsAnomalyDetector:
    def __init__(self):
        df = pd.read_csv(BASELINE_PATH)
        possible_cols = ["cleaned_text", "text", "content", "article", "news"]
        text_col = next((c for c in df.columns if c.lower() in possible_cols), None)
        if text_col is None:
            raise Exception("No usable text column found in CSV")
        self.baseline_news = df[text_col].astype(str).tolist()
        self.vectorizer = TfidfVectorizer(stop_words="english")
        self.baseline_vectors = self.vectorizer.fit_transform(self.baseline_news)

    def detect(self, article):
        article_vector = self.vectorizer.transform([article])
        similarity_scores = cosine_similarity(article_vector, self.baseline_vectors).flatten()
        avg_similarity = np.mean(similarity_scores)
        anomaly_score = 1 - avg_similarity
        return anomaly_score, float(avg_similarity)

detector = NewsAnomalyDetector()

# ----------------------------
# 4Ô∏è‚É£ CONTENT TYPE DETECTION
# ----------------------------
def detect_content_type(article):
    if len(article.split()) < 20:
        if re.search(r"[.!?]$", article.strip()):
            return "üü¢ Short News Headline"
        else:
            return "‚ö™ Personal / Non-News Statement"
    else:
        return "üîµ Normal News Article"

# ----------------------------
# 5Ô∏è‚É£ ANOMALY SCORING FUNCTION
# ----------------------------
def compute_anomaly(article, detector, multi_labels):
    base_anomaly, similarity = detector.detect(article)
    
    # Personal/clickbait detection
    personal_keywords = ["I", "my", "we", "our", "us", "personal", "exclusive", "shock"]
    clickbait_factor = 0.05 if any(word.lower() in article.lower() for word in personal_keywords) else 0.0

    # Multi-perspective weighting
    low_confidence_labels = [score for _, score in multi_labels if score < 0.1]
    label_factor = sum(low_confidence_labels) * 0.2

    # Final anomaly score
    anomaly_score = base_anomaly + clickbait_factor + label_factor
    anomaly_score = min(anomaly_score, 1.0)

    # Map to overall status
    if anomaly_score <= 0.3:
        status = "üü¢ Normal News"
    elif anomaly_score <= 0.6:
        status = "üü° Rare / Needs Review"
    elif anomaly_score <= 0.85:
        status = "üîµ Unusual / Rare Event"
    else:
        status = "üî¥ Highly Deviating / Possibly Fake"

    return anomaly_score, similarity, status

# ----------------------------
# 6Ô∏è‚É£ USER INPUT
# ----------------------------
article = input("\nEnter News Article:\n")

# ----------------------------
# 7Ô∏è‚É£ CATEGORY PREDICTION
# ----------------------------
news_result = news_classifier(article, candidate_labels)
multi_labels = list(zip(news_result['labels'], news_result['scores']))
predicted_type = multi_labels[0][0]
confidence = round(multi_labels[0][1], 3)

# ----------------------------
# 8Ô∏è‚É£ SENTIMENT ANALYSIS
# ----------------------------
sentiment_scores = sentiment_analyzer.polarity_scores(article)
compound_score = sentiment_scores['compound']

if compound_score >= 0.05:
    sentiment_label = "Positive"
elif compound_score <= -0.05:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

# ----------------------------
# 9Ô∏è‚É£ CONTENT TYPE
# ----------------------------
content_type = detect_content_type(article)

# ----------------------------
# üîü ANOMALY SCORING
# ----------------------------
anomaly_score, similarity_score, overall_status = compute_anomaly(article, detector, multi_labels)

# ----------------------------
# 1Ô∏è‚É£1Ô∏è‚É£ FINAL REPORT
# ----------------------------
print("\n================= üìå NEWS ANALYSIS REPORT =================\n")
print(f"üì∞ Text Analyzed: {article}\n")
print(f"üìå Predicted Category: {predicted_type}  (Confidence: {confidence})")
print(f"üí¨ Sentiment: {sentiment_label}  (VADER Score: {compound_score})")
print(f"üö® Content Assessment: {content_type}")
print(f"üîç Similarity Score: {similarity_score:.3f}")
print(f"‚ö† Anomaly Score: {anomaly_score:.3f}")
print(f"üìå Overall Status: {overall_status}")
print(f"üìò Multi-Perspective Categories: {multi_labels}")
print("\n===========================================================\n")






üì∞ Text Analyzed: Madhya Pradesh Farmers' Day-Long Protest At Highway Over MSP Suspended

üìå Predicted Category: protest news  (Confidence: 0.65)
üí¨ Sentiment: Negative  (VADER Score: -0.6249)
üö® Content Assessment: ‚ö™ Personal / Non-News Statement
üîç Similarity Score: 0.005
‚ö† Anomaly Score: 1.000
üìå Overall Status: üî¥ Highly Deviating / Possibly Fake
üìò Multi-Perspective Categories: [('protest news', 0.6504061818122864), ('travel news', 0.09086829423904419), ('political/social news', 0.08618099242448807), ('financial/business news', 0.05310504138469696), ('political news', 0.04636052995920181), ('technology news', 0.016080709174275398), ('breaking news', 0.012706955894827843), ('health and medicine', 0.008851170539855957), ('crime report', 0.007686261087656021), ('environmental news', 0.006726773921400309), ('science discovery', 0.0060768332332372665), ('disaster news', 0.005817578174173832), ('entertainment news', 0.005346919875591993), ('sports news', 0.00378574

In [None]:
# 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