<a href="https://colab.research.google.com/github/Osakhra/ITAI2373-NewsBot-Final/blob/main/notebooks/08_System_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 08 – System Integration

I’m wiring everything together so one call can analyze a news article from start to finish: preprocessing → features → classification → sentiment → entities → topic → summary → nicely packaged result. I’ll also save a small batch of integrated results to `data/results/` for later use.


In [1]:
!pip install -q langdetect spacy nltk scikit-learn pyldavis textblob transformers torch sumy sentence-transformers googletrans==4.0.0-rc1

import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
!git clone https://github.com/Osakhra/ITAI2373-NewsBot-Final.git
import sys, os, pathlib
PROJECT_ROOT = "/content/ITAI2373-NewsBot-Final"
sys.path.append(f"{PROJECT_ROOT}/src")

# Make sure models dir exists for saves
os.makedirs(f"{PROJECT_ROOT}/data/models", exist_ok=True)


fatal: destination path 'ITAI2373-NewsBot-Final' already exists and is not an empty directory.


In [3]:
import pandas as pd, os

CSV_PATH = f"{PROJECT_ROOT}/data/processed/news_cleaned.csv"

if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
else:
    from google.colab import files
    print("I didn't find data/processed/news_cleaned.csv in the repo. Upload it now.")
    uploaded = files.upload()  # choose your news_cleaned.csv
    up_name = list(uploaded.keys())[0]
    df = pd.read_csv(up_name)

# Basic checks
assert {'content','category','clean_content'}.issubset(df.columns), "Expected columns not found."
df.head(2)


Unnamed: 0,ArticleId,content,category,clean_content
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defend ...
1,154,german business confidence slides german busin...,business,german business confidence slide german busine...


In [4]:
from data_processing.feature_extractor import FeatureExtractor
from analysis.classifier import NewsClassifier
from analysis.sentiment_analyzer import SentimentAnalyzer
from analysis.ner_extractor import NERExtractor
from analysis.topic_modeler import TopicModeler
from language_models.summarizer import Summarizer
from conversation.query_processor import QueryProcessor


In [5]:
from sklearn.model_selection import train_test_split
from pathlib import Path

# Vectorizer
extractor = FeatureExtractor(max_features=2000, ngram_range=(1,2))
X = extractor.fit_transform(df['clean_content'])
y = df['category']

# Split (not strictly needed if already validated, but useful for quick check)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classifier (NB by default)
clf = NewsClassifier(model_type='nb')
clf.train(X_train, y_train)
print("Classifier trained.")
clf.evaluate(X_test, y_test)

# Save artifacts
VEC_PATH = f"{PROJECT_ROOT}/data/models/tfidf_vectorizer.pkl"
CLF_PATH = f"{PROJECT_ROOT}/data/models/news_classifier_nb.pkl"

extractor.save(VEC_PATH)
clf.save(CLF_PATH)
print("Saved:", VEC_PATH, "and", CLF_PATH)


Classifier trained.
               precision    recall  f1-score   support

     business       0.97      0.97      0.97        75
entertainment       1.00      0.98      0.99        46
     politics       0.93      0.95      0.94        56
        sport       0.97      1.00      0.98        63
         tech       0.96      0.93      0.95        58

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

Accuracy: 96.64%
Saved: /content/ITAI2373-NewsBot-Final/data/models/tfidf_vectorizer.pkl and /content/ITAI2373-NewsBot-Final/data/models/news_classifier_nb.pkl


In [6]:
# Topic model
topic_modeler = TopicModeler(n_topics=5, method='lda', max_features=1500)
topic_modeler.fit_transform(df['clean_content'])

# Sentiment, NER, Summarizer
sentiment_analyzer = SentimentAnalyzer()
ner_extractor = NERExtractor()
summarizer = Summarizer()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [7]:
qp = QueryProcessor(
    classifier=clf,
    sentiment_analyzer=sentiment_analyzer,
    ner_extractor=ner_extractor,
    topic_modeler=topic_modeler,
    summarizer=summarizer,
    feature_extractor=extractor
)
print("QueryProcessor ready.")


QueryProcessor ready.


In [8]:
sample_article = df['content'].iloc[0]

print("User: What category is this article about?")
print("NewsBot:", qp.process("What category is this article?", sample_article), "\n")

print("User: What is the sentiment of this news story?")
print("NewsBot:", qp.process("What is the sentiment?", sample_article), "\n")

print("User: Who or what is mentioned in this article?")
print("NewsBot:", qp.process("List the entities in this article.", sample_article), "\n")

print("User: What is the main topic here?")
print("NewsBot:", qp.process("What topic is this about?", sample_article), "\n")

print("User: Summarize this article.")
print("NewsBot:", qp.process("Summarize this article.", sample_article), "\n")


User: What category is this article about?
Detected Intent: category
NewsBot: Predicted Category: business 

User: What is the sentiment of this news story?
Detected Intent: sentiment
NewsBot: Sentiment: neutral (polarity: 0.02) 

User: Who or what is mentioned in this article?
Detected Intent: entities
NewsBot: Entities found: first [ORDINAL], cynthia cooper [PERSON], us [GPE], 2002 [DATE], 5.7bn [MONEY], new york [GPE], wednesday [DATE], arthur andersen [PERSON], early 2001 and [DATE], 2002 [DATE], scott sullivan [PERSON], sullivan [PERSON], worldcom s accounting [ORG], 2001 [DATE], 85 years [DATE], 2004 [DATE], mci [ORG], last week [DATE], mci [ORG], 6.75bn [MONEY] 

User: What is the main topic here?
Detected Intent: topic
NewsBot: Main topic #2: say, year, mr, company, market, firm, rise, sale 

User: Summarize this article.
Detected Intent: summary



In [9]:
# (Optional) Save everything you'd want to load in a minimal app
# We’ll keep individual pickles already saved; this is just a note:
print("Artifacts available:")
print(" - Vectorizer:", VEC_PATH)
print(" - Classifier:", CLF_PATH)
print(" - Topic model: in-memory (add save() there if you implemented it)")


Artifacts available:
 - Vectorizer: /content/ITAI2373-NewsBot-Final/data/models/tfidf_vectorizer.pkl
 - Classifier: /content/ITAI2373-NewsBot-Final/data/models/news_classifier_nb.pkl
 - Topic model: in-memory (add save() there if you implemented it)


In [10]:
def analyze_article_full(text: str):
    # category
    X_one = extractor.transform([text])
    category = clf.predict(X_one)[0]
    # sentiment
    s = sentiment_analyzer.analyze(text)
    s_label = sentiment_analyzer.label_sentiment(s['polarity'])
    # entities
    ents = ner_extractor.extract(text)
    # topic
    topic_id = topic_modeler.assign_topic(text)
    # summary
    summ = summarizer.summarize(text)
    return {
        "category": category,
        "sentiment": {"label": s_label, "polarity": s['polarity']},
        "entities": ents,
        "topic_id": topic_id,
        "summary": summ
    }

# smoke test
analyze_article_full(df['content'].iloc[1])


{'category': np.str_('business'),
 'sentiment': {'label': 'neutral', 'polarity': 0.07791666666666669},
 'entities': [('german', 'NORP'),
  ('german', 'NORP'),
  ('february', 'DATE'),
  ('europe', 'LOC'),
  ('munich', 'GPE'),
  ('research institute ifo', 'ORG'),
  ('95.5', 'CARDINAL'),
  ('february', 'DATE'),
  ('97.5', 'CARDINAL'),
  ('january', 'DATE'),
  ('first', 'ORDINAL'),
  ('three months', 'DATE'),
  ('bernd weidensteiner', 'PERSON'),
  ('wolfgang', 'PERSON'),
  ('february', 'DATE'),
  ('germany', 'GPE'),
  ('1.6%', 'PERCENT'),
  ('last year', 'DATE'),
  ('2003', 'DATE'),
  ('0.2%', 'PERCENT'),
  ('the last three months of 2004', 'DATE'),
  ('hans-werner sinn', 'PERSON'),
  ('german', 'NORP'),
  ('the first half of 2004', 'DATE'),
  ('german', 'NORP'),
  ('close to 10%', 'PERCENT'),
  ('siemens', 'ORG'),
  ('volkswagen', 'ORG'),
  ('germany', 'GPE'),
  ('the european central bank', 'ORG'),
  ('2%', 'PERCENT')],
 'topic_id': np.int64(2),
 'summary': 'German business confidence fe