In [None]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# === NLTK setup ===
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# === Step 1: Load JSON lines ===
with open('news_categories.json', 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df = df[['headline', 'category']].dropna()

# === Step 2: Preprocess headlines ===
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(t) for t in tokens if t.isalpha() and t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['clean'] = df['headline'].apply(preprocess)

# === Step 3: Encode categories ===
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# === Step 4: Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# === Step 5: Build pipeline with TF-IDF and LinearSVC ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(
        LinearSVC(class_weight='balanced', max_iter=5000)
    ))
])

# === Step 6: Train and evaluate ===
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# precision    recall  f1-score   support

#           ARTS       0.19      0.23      0.21       151
# ARTS & CULTURE       0.32      0.22      0.26       134
#   BLACK VOICES       0.47      0.50      0.48       458
#       BUSINESS       0.39      0.46      0.43       599
#        COLLEGE       0.34      0.34      0.34       114
#         COMEDY       0.43      0.44      0.44       540
#          CRIME       0.52      0.51      0.51       356
# CULTURE & ARTS       0.32      0.22      0.26       107
#        DIVORCE       0.66      0.68      0.67       343
#      EDUCATION       0.31      0.31      0.31       101
#  ENTERTAINMENT       0.67      0.66      0.66      1736
#    ENVIRONMENT       0.34      0.26      0.29       144
#          FIFTY       0.17      0.19      0.18       140
#   FOOD & DRINK       0.59      0.62      0.60       634
#      GOOD NEWS       0.21      0.18      0.19       140
#          GREEN       0.33      0.39      0.36       262
# HEALTHY LIVING       0.25      0.29      0.26       670
#  HOME & LIVING       0.65      0.62      0.64       432
#         IMPACT       0.22      0.31      0.26       348
#  LATINO VOICES       0.45      0.31      0.37       113
#          MEDIA       0.41      0.41      0.41       294
# ...
#       accuracy                           0.54     20953
#      macro avg       0.43      0.42      0.42     20953
#   weighted avg       0.55      0.54      0.54     20953


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:

                precision    recall  f1-score   support

          ARTS       0.19      0.23      0.21       151
ARTS & CULTURE       0.32      0.22      0.26       134
  BLACK VOICES       0.47      0.50      0.48       458
      BUSINESS       0.39      0.46      0.43       599
       COLLEGE       0.34      0.34      0.34       114
        COMEDY       0.43      0.44      0.44       540
         CRIME       0.52      0.51      0.51       356
CULTURE & ARTS       0.32      0.22      0.26       107
       DIVORCE       0.66      0.68      0.67       343
     EDUCATION       0.31      0.31      0.31       101
 ENTERTAINMENT       0.67      0.66      0.66      1736
   ENVIRONMENT       0.34      0.26      0.29       144
         FIFTY       0.17      0.19      0.18       140
  FOOD & DRINK       0.59      0.62      0.60       634
     GOOD NEWS       0.21      0.18      0.19       140
         GREEN       0.33      0.39      0.36       262
HEALTHY LIVING       0.

In [None]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.sparse import hstack, identity
from scipy.sparse.linalg import lsqr

# === NLTK setup ===
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# === Step 1: Load JSON lines ===
with open('news_categories.json', 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df = df[['headline', 'category']].dropna()

# === Step 2: Preprocess headlines ===
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(t) for t in tokens if t.isalpha() and t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['clean'] = df['headline'].apply(preprocess)

# === Step 3: Encode categories ===
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# === Step 4: Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# === Step 5: Custom Sparse-Compatible LS-TWSVM ===
class LSTWSVM(BaseEstimator, ClassifierMixin):
    def __init__(self, c1=0.01, c2=0.01):
        self.c1 = c1
        self.c2 = c2

    def fit(self, X, y):
        y = np.where(y == 1, 1, -1)
        X1 = X[y == 1]
        X2 = X[y == -1]

        e1 = np.ones((X1.shape[0], 1))
        e2 = np.ones((X2.shape[0], 1))

        A = hstack([X1, e1])
        B = hstack([X2, e2])
        I = identity(A.shape[1])

        # Solve least squares with regularization
        lhs1 = B.T @ B + self.c1 * I
        rhs1 = B.T @ e2
        self.w1 = lsqr(lhs1, rhs1)[0]

        lhs2 = A.T @ A + self.c2 * I
        rhs2 = A.T @ e1
        self.w2 = lsqr(lhs2, rhs2)[0]

        return self

    def decision_function(self, X):
        bias = np.ones((X.shape[0], 1))
        X_aug = hstack([X, bias])
        f1 = np.abs(X_aug @ self.w1)
        f2 = np.abs(X_aug @ self.w2)
        return f2 - f1

    def predict(self, X):
        return np.where(self.decision_function(X) < 0, 1, 0)

# === Step 6: Build Pipeline with TF-IDF and One-vs-Rest ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(LSTWSVM()))
])

# === Step 7: Train and evaluate ===
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

#                 precision    recall  f1-score   support

#           ARTS       0.13      0.17      0.14       302
# ARTS & CULTURE       0.17      0.12      0.14       268
#   BLACK VOICES       0.23      0.24      0.23       917
#       BUSINESS       0.20      0.28      0.23      1198
#        COLLEGE       0.26      0.17      0.21       229
#         COMEDY       0.15      0.22      0.18      1080
#          CRIME       0.38      0.26      0.30       712
# CULTURE & ARTS       0.24      0.20      0.22       215
#        DIVORCE       0.45      0.32      0.37       685
#      EDUCATION       0.22      0.20      0.21       203
#  ENTERTAINMENT       0.42      0.45      0.43      3473
#    ENVIRONMENT       0.28      0.19      0.23       289
#          FIFTY       0.06      0.09      0.07       280
#   FOOD & DRINK       0.45      0.38      0.41      1268
#      GOOD NEWS       0.16      0.12      0.14       280
#          GREEN       0.28      0.24      0.26       524
# HEALTHY LIVING       0.13      0.19      0.16      1339
#  HOME & LIVING       0.52      0.34      0.41       864
#         IMPACT       0.11      0.13      0.12       697
#  LATINO VOICES       0.21      0.15      0.18       226
#          MEDIA       0.27      0.31      0.29       589
# ...
#       accuracy                           0.32     41906
#      macro avg       0.28      0.25      0.26     41906
#   weighted avg       0.35      0.32      0.33     41906

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:

                precision    recall  f1-score   support

          ARTS       0.13      0.17      0.14       302
ARTS & CULTURE       0.17      0.12      0.14       268
  BLACK VOICES       0.23      0.24      0.23       917
      BUSINESS       0.20      0.28      0.23      1198
       COLLEGE       0.26      0.17      0.21       229
        COMEDY       0.15      0.22      0.18      1080
         CRIME       0.38      0.26      0.30       712
CULTURE & ARTS       0.24      0.20      0.22       215
       DIVORCE       0.45      0.32      0.37       685
     EDUCATION       0.22      0.20      0.21       203
 ENTERTAINMENT       0.42      0.45      0.43      3473
   ENVIRONMENT       0.28      0.19      0.23       289
         FIFTY       0.06      0.09      0.07       280
  FOOD & DRINK       0.45      0.38      0.41      1268
     GOOD NEWS       0.16      0.12      0.14       280
         GREEN       0.28      0.24      0.26       524
HEALTHY LIVING       0.

In [2]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# === NLTK setup ===
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# === Step 1: Load JSON lines ===
with open('news_categories.json', 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df = df[['headline', 'category']].dropna()

# === Step 2: Preprocess headlines ===
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(t) for t in tokens if t.isalpha() and t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['clean'] = df['headline'].apply(preprocess)

# === Step 3: Encode categories ===
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# === Step 4: Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# === Step 5: TF-IDF + RidgeClassifier (LS-SVM Approximation) ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(
        RidgeClassifier(alpha=1.0)
    ))
])

# === Step 6: Train and evaluate ===
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:

                precision    recall  f1-score   support

          ARTS       0.42      0.21      0.28       302
ARTS & CULTURE       0.35      0.15      0.21       268
  BLACK VOICES       0.56      0.42      0.48       917
      BUSINESS       0.50      0.44      0.47      1198
       COLLEGE       0.52      0.45      0.48       229
        COMEDY       0.60      0.41      0.49      1080
         CRIME       0.51      0.54      0.52       712
CULTURE & ARTS       0.59      0.22      0.32       215
       DIVORCE       0.78      0.65      0.71       685
     EDUCATION       0.47      0.32      0.38       203
 ENTERTAINMENT       0.62      0.74      0.67      3473
   ENVIRONMENT       0.48      0.20      0.29       289
         FIFTY       0.44      0.13      0.20       280
  FOOD & DRINK       0.59      0.71      0.64      1268
     GOOD NEWS       0.37      0.13      0.19       280
         GREEN       0.40      0.32      0.36       524
HEALTHY LIVING       0.

In [10]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# === NLTK setup ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# === Step 1: Load JSON lines ===
with open('news_categories.json', 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df = df[['headline', 'category']].dropna()

# === Step 2: Preprocess headlines (with lemmatization) ===
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['clean'] = df['headline'].apply(preprocess)

# === Step 3: Encode categories ===
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# === Step 4: Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# === Step 5: TF-IDF + RidgeClassifier (LS-SVM Approximation) ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(
        RidgeClassifier(alpha=1.0)
    ))
])

# === Step 6: Train and evaluate ===
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Classification Report:

                precision    recall  f1-score   support

          ARTS       0.41      0.21      0.27       302
ARTS & CULTURE       0.38      0.16      0.23       268
  BLACK VOICES       0.55      0.42      0.48       917
      BUSINESS       0.50      0.44      0.47      1198
       COLLEGE       0.53      0.45      0.49       229
        COMEDY       0.60      0.41      0.49      1080
         CRIME       0.51      0.53      0.52       712
CULTURE & ARTS       0.62      0.21      0.32       215
       DIVORCE       0.77      0.63      0.69       685
     EDUCATION       0.47      0.33      0.39       203
 ENTERTAINMENT       0.62      0.75      0.68      3473
   ENVIRONMENT       0.47      0.19      0.27       289
         FIFTY       0.44      0.15      0.23       280
  FOOD & DRINK       0.58      0.71      0.64      1268
     GOOD NEWS       0.32      0.12      0.17       280
         GREEN       0.41      0.32      0.36       524
HEALTHY LIVING       0.

In [9]:
import json
import pandas as pd
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# === NLTK setup ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# === Load JSON lines ===
with open('news_categories.json', 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df = df[['headline', 'category']].dropna()

# === Preprocessing Function ===
ALLOWED_POS = {'NN', 'NNS', 'NNP', 'JJ', 'VB', 'VBD', 'VBG'}

def preprocess(text):
    # Lowercase and remove non-word characters
    text = re.sub(r'\W+', ' ', text.lower())

    # Tokenize and POS tag
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    # Lemmatize and filter
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word, tag in tagged
        if word.isalpha() and word not in stop_words and len(word) > 2 and tag in ALLOWED_POS
    ]

    return ' '.join(clean_tokens)

# === Apply preprocessing ===
df['clean'] = df['headline'].apply(preprocess)

# === Encode labels ===
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# === Classification pipeline ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        max_features=10000,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(
        RidgeClassifier(alpha=1.0)
    ))
])

# === Train and evaluate ===
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cecepasinechka/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Classification Report:

                precision    recall  f1-score   support

          ARTS       0.39      0.14      0.21       302
ARTS & CULTURE       0.39      0.17      0.24       268
  BLACK VOICES       0.51      0.37      0.43       917
      BUSINESS       0.48      0.39      0.43      1198
       COLLEGE       0.49      0.41      0.45       229
        COMEDY       0.57      0.34      0.43      1080
         CRIME       0.46      0.48      0.47       712
CULTURE & ARTS       0.54      0.20      0.29       215
       DIVORCE       0.78      0.59      0.67       685
     EDUCATION       0.43      0.30      0.35       203
 ENTERTAINMENT       0.57      0.71      0.63      3473
   ENVIRONMENT       0.44      0.17      0.24       289
         FIFTY       0.42      0.12      0.18       280
  FOOD & DRINK       0.55      0.69      0.61      1268
     GOOD NEWS       0.32      0.09      0.14       280
         GREEN       0.41      0.32      0.36       524
HEALTHY LIVING       0.