In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re

In [None]:
# --- Step 1: Data Loading and Preparation ---

# Read the training set from a CSV file; the relative path is resolved
# against the current working directory. Point it at a different file to
# train on another dataset. Later cells read the 'text' and 'label' columns.
df = pd.read_csv(r'training_data.csv')

# Preview the first rows, followed by blank lines to separate the output.
print("--- Sample Dataset ---", df.head(), "\n", sep="\n")

In [None]:
# --- Step 2: Text Preprocessing ---

def preprocess_text(text):
    """
    Normalize raw text before vectorization.

    Lowercases the input and deletes every character that is not an ASCII
    letter (a-z), a digit, or whitespace. This strips more than punctuation:
    accented and other non-ASCII letters are removed as well.

    Parameters
    ----------
    text : str, None, float or other
        The raw text. Missing values (``None`` or a float NaN, which is how
        pandas represents an empty CSV cell) are treated as empty text. Any
        other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Characters are deleted, not replaced by a space, so "S&P" becomes "sp".
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Clean every entry of the 'text' column element by element, overwriting
# the column with the result.
df['text'] = df['text'].map(preprocess_text)

# Preview the cleaned rows, followed by blank lines to separate the output.
print("--- Preprocessed Text ---", df.head(), "\n", sep="\n")

In [None]:
# --- Step 3: Feature Extraction ---

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF (Term Frequency-Inverse Document Frequency) turns each text into a
# row of numeric features, weighting terms that are distinctive for a document
# above terms that appear everywhere. English stop words such as "the" or "a"
# are dropped, and the vocabulary is capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that *same* fitted vectorizer on the test split so no information
# from the test data leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(
    f"Shape of training data matrix: {X_train_vec.shape}",
    f"Shape of testing data matrix: {X_test_vec.shape}",
    "\n",
    sep="\n",
)

In [None]:
# --- Step 4: Model Training ---

# Multinomial Naive Bayes is a good baseline for text classification tasks.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vec, y_train)

print("--- Model Training Complete ---", "\n", sep="\n")

In [None]:
# --- Step 5: Model Evaluation ---

# Score the held-out test split with the trained model.
y_pred = model.predict(X_test_vec)

# Overall accuracy (two decimals) followed by the per-class report.
print(
    "--- Model Evaluation ---",
    f"Accuracy: {accuracy_score(y_test, y_pred):.2f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# --- Step 6: Prediction on New Data ---

def predict_headline(headline_text):
    """
    Classify a single headline and return its category name.

    The headline is cleaned with ``preprocess_text``, vectorized with the
    already-fitted module-level ``vectorizer`` and scored by the module-level
    ``model``.

    Returns 'World', 'Sports' or 'Business' for predicted labels 0, 1 and 2;
    every other predicted label is reported as 'Tech/Sci-fi'.
    """
    category_by_label = {0: 'World', 1: 'Sports', 2: 'Business'}

    # Use transform(), never fit_transform(), so the vocabulary learned from
    # the training data is reused instead of being refitted on this one text.
    features = vectorizer.transform([preprocess_text(headline_text)])
    predicted_label = model.predict(features)[0]

    # Any label other than 0, 1 or 2 falls through to the last category.
    return category_by_label.get(predicted_label, 'Tech/Sci-fi')

In [24]:
# --- Step 7: Test Cases---
print("--- Predictions on New Headlines ---")

new_headline_1 = "The S&P 500 futures rise on tech stock gains"
new_headline_2 = "Olympic champion breaks world record in swimming"
new_headline_3 = "Researchers study the effects of a new medical treatment"

# --- Tricky Test Case ---
# Mixes research vocabulary with stock-market vocabulary in one headline.
tricky_headline = "Breakthrough in renewable energy research drives global stock market surge"

# Run every sample through the classifier and report it in a common format.
for headline in (new_headline_1, new_headline_2, new_headline_3, tricky_headline):
    print(f"Headline: '{headline}' -> Predicted Label: {predict_headline(headline)}")

--- Predictions on New Headlines ---
Headline: 'The S&P 500 futures rise on tech stock gains' -> Predicted Label: Business
Headline: 'Olympic champion breaks world record in swimming' -> Predicted Label: Sports
Headline: 'Researchers study the effects of a new medical treatment' -> Predicted Label: Tech/Sci-fi
Headline: 'Breakthrough in renewable energy research drives global stock market surge' -> Predicted Label: Business
