# 📌 Extended Example: Unstructured Data (Online News Articles Dataset - 20 Newsgroups)

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import re
import string

## ✅ Load Dataset

In [None]:
# Restrict the corpus to two newsgroups so the later task is a two-class problem.
categories = ['sci.space', 'rec.sport.baseball']

# Training subset only; headers, footers and quoted replies are stripped
# from each post via the `remove` argument.
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# One row per post: the raw text and its class label.
df = pd.DataFrame(
    {
        'text': newsgroups.data,
        'target': newsgroups.target,
    }
)
df.head(3)

## ✅ Clean and Preprocess Text

In [None]:
import re

def clean_text(text):
    """Reduce *text* to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, accented letters, ...) is replaced by a space, the text is
    lowercased, runs of whitespace are collapsed to one space, and leading
    and trailing whitespace is removed.
    """
    # Blank out everything except a-z / A-Z, then fold case.
    letters_only = re.sub(r"[^a-zA-Z]", " ", text).lower()
    # Squeeze the gaps left behind into single spaces and trim the ends.
    return re.sub(r"\s+", " ", letters_only).strip()

# Normalize every raw post with clean_text and keep the result alongside it.
df['clean_text'] = df['text'].map(clean_text)

# Whitespace tokenization: each cleaned post becomes a list of words.
df['tokens'] = df['clean_text'].str.split()

# Preview the two derived columns.
df[['clean_text', 'tokens']].head()


## ✅ Word Frequency Analysis

In [None]:
from collections import Counter

# Flatten the per-document token lists into a single corpus-wide word list.
all_words = [token for doc_tokens in df['tokens'] for token in doc_tokens]

# Tally occurrences, then show the ten most frequent words with their counts.
word_freq = Counter()
word_freq.update(all_words)
word_freq.most_common(10)

## ✅ TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 1000 terms and drop scikit-learn's built-in English
# stop-word list before weighting.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Learn the vocabulary and produce the (sparse) document-term TF-IDF matrix.
X_tfidf = vectorizer.fit_transform(df['clean_text'])

# Densify into a DataFrame with one column per vocabulary term for inspection.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df.head()

## ✅ Classification Example (Logistic Regression)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Split the cleaned text *before* vectorizing. Training on features from a
# TF-IDF model fitted on the full corpus would let the held-out documents
# influence the vocabulary and IDF weights (data leakage), which makes the
# reported scores optimistic. test_size and random_state are unchanged, so
# the same rows end up in each split as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['target'], test_size=0.2, random_state=42
)

# Fit a dedicated vectorizer on the training documents only (same settings
# as the exploratory vectorizer), then merely transform the test documents.
clf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
train_matrix = clf_vectorizer.fit_transform(text_train)
test_matrix = clf_vectorizer.transform(text_test)

# Keep X_train / X_test as DataFrames with named feature columns and the
# original row index, as they were when sliced from a DataFrame.
feature_names = clf_vectorizer.get_feature_names_out()
X_train = pd.DataFrame(
    train_matrix.toarray(), columns=feature_names, index=text_train.index
)
X_test = pd.DataFrame(
    test_matrix.toarray(), columns=feature_names, index=text_test.index
)

# max_iter is raised above the library default to give the solver room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out 20%.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))