# 70 · NLP Email Anomalies
        
Detect phishing/abuse emails with a baseline TF-IDF + logistic regression model.

- Build a labeled dataset from the Kaggle Fake/Real news dump (label 1 = fake, 0 = real).
- Inspect class balance and sample messages.
- Train the baseline used elsewhere in the codebase.


In [None]:
from pathlib import Path
import sys
import pandas as pd
import matplotlib.pyplot as plt

# Resolve the repository root as the parent of the current working directory
# (the notebook is expected to be launched from a sub-folder of the project).
project_root = Path('..').resolve()

# Make the in-repo `src` directory importable, adding it to sys.path only once
# so re-running this cell does not pile up duplicate entries.
src_path = project_root.joinpath('src')
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.append(src_entry)

# Project-local import: must come after the sys.path tweak above.
from uais.nlp.train_text_classifier import NLPConfig, run_text_experiment
print(f'Project root: {project_root}')


In [None]:
# Assemble one labeled table from the Kaggle Fake/True news CSVs (fetched via
# KaggleHub) and save it as a single CSV for the training cell to consume.
nlp_raw_dir = project_root / 'data' / 'raw' / 'nlp' / 'fakenews'
fake_root = nlp_raw_dir.joinpath(
    'datasets', 'clmentbisaillon', 'fake-and-real-news-dataset', 'versions', '1'
)
fake_path = fake_root / 'Fake.csv'
true_path = fake_root / 'True.csv'

# Fail fast with a clear message when either source file has not been downloaded.
if not fake_path.exists() or not true_path.exists():
    raise FileNotFoundError(f'Missing Fake/True news CSVs under {fake_root}')

# Rows from Fake.csv get label 1, rows from True.csv get label 0.
labeled_sources = ((fake_path, 1), (true_path, 0))
labeled_frames = [
    pd.read_csv(csv_path).assign(label=flag) for csv_path, flag in labeled_sources
]
combined = pd.concat(labeled_frames, ignore_index=True)

# Keep only the article body (renamed from 'text' to 'content') and the label.
df = combined.rename(columns={'text': 'content'})[['content', 'label']]
print('Rows:', len(df), ' | Columns:', list(df.columns))
print(df.head())

# Write the combined table next to the raw download so later runs can reuse it.
out_csv = nlp_raw_dir / 'fake_news_labeled.csv'
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=False)


In [None]:
# Class balance: count rows per label, ordered by label value (0 then 1) so the
# bars line up with the 'real' / 'fake' tick labels set below.
counts = df['label'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(5, 3))
counts.plot.bar(ax=ax, color=['steelblue', 'tomato'])
ax.set_xticklabels(['real', 'fake'], rotation=0)
ax.set(ylabel='count', title='Label distribution')
plt.show()

# Print a fixed-seed sample of three rows for a quick look at the raw content.
print('Sample texts:')
print(df.sample(n=3, random_state=42))


In [None]:
# Train the baseline text classifier on the combined CSV written earlier.
# NOTE(review): the notebook describes this as TF-IDF + logistic regression,
# yet model_name names a DistilBERT checkpoint — confirm how NLPConfig uses it.
config_kwargs = {
    'dataset_path': out_csv,
    'text_column': 'content',
    'label_column': 'label',
    'model_name': 'distilbert-base-uncased',
}
config = NLPConfig(**config_kwargs)

# Run the experiment and show whatever it returns.
results = run_text_experiment(config)
print('Metrics:', results)
