<a href="https://colab.research.google.com/github/Noahlie07/Newspaper-Article-Classification-Model/blob/main/Newspaper_Article_Classification_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Newspaper Article Classification Model

## Part I: Text Preprocessing

In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Read the line-delimited JSON dataset and, in one chain, discard the columns
# the model does not use, exact duplicate rows, and rows with missing values.
df = (
    pd.read_json("News_Category_Dataset_v3.json", lines=True)
    .drop(columns=["link", "authors", "date"])
    .drop_duplicates()
    .dropna()
)

# Fetch the NLTK corpora backing the stop-word list and the lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Build the English stop-word lookup set and the lemmatizer that
# text_preprocessing relies on.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text Preprocessing
def text_preprocessing(text):
    """Normalise one piece of text for bag-of-words vectorization.

    Steps, in order:
      1. Lowercase the text.
      2. Delete every character that is not an ASCII letter or whitespace.
         Characters are removed, not replaced by a space, so words joined
         by punctuation are merged (e.g. "well-known" becomes "wellknown").
      3. Split on whitespace and drop tokens found in the module-level
         ``stop_words`` set.
      4. Lemmatize each remaining token with the module-level ``lemmatizer``
         (called with its default arguments).

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)

# Clean both free-text columns with text_preprocessing: the description is
# processed first, then the headline, and each cleaned column replaces the
# raw one.
df["short_description"], df["headline"] = (
    df["short_description"].apply(text_preprocessing),
    df["headline"].apply(text_preprocessing),
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Part II: Feature Engineering

In [7]:
# Label Encoding for target variable: category names -> integer class ids
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Splitting the Data: hold out 20% of the rows for evaluation (fixed seed so
# the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df[['short_description', 'headline']],
    df['category_encoded'],
    test_size=0.2,
    random_state=7,
)

# Converting Description and Headlines into numerical representations.
# Each text column gets its OWN CountVectorizer. Re-fitting a single shared
# vectorizer on the headlines would overwrite the vocabulary learned from the
# descriptions, leaving no way to vectorize the description column of unseen
# data consistently with the trained model. Both vectorizers are fitted on the
# training split only, so no test-set vocabulary leaks into the features.
short_desc_vectorizer = CountVectorizer()
X_train_short_desc = short_desc_vectorizer.fit_transform(X_train['short_description'])
X_test_short_desc = short_desc_vectorizer.transform(X_test['short_description'])

headline_vectorizer = CountVectorizer()
X_train_headline = headline_vectorizer.fit_transform(X_train['headline'])
X_test_headline = headline_vectorizer.transform(X_test['headline'])

# Backward-compatible alias: code that still refers to `vectorizer` keeps
# getting the headline-fitted vectorizer.
vectorizer = headline_vectorizer

## Part III: Machine Learning

In [8]:
# Combining the feature matrices column-wise: description features first,
# then headline features (train and test must use the same order)
X_train_combined = hstack([X_train_short_desc, X_train_headline])
X_test_combined = hstack([X_test_short_desc, X_test_headline])

# Logistic Regression model training (fixed seed; iteration cap raised to 1000)
model = LogisticRegression(max_iter=1000, random_state=7)
model.fit(X_train_combined, y_train)

# Generating predictions
y_pred = model.predict(X_test_combined)

# Calculating Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Generating classification report.
# `labels` is passed explicitly so every entry of `target_names` is paired with
# its own class id (LabelEncoder assigns id i to classes_[i]). Without it the
# report infers the label set from y_test/y_pred and raises a ValueError when
# some category is absent from both, because the number of names no longer
# matches the number of labels.
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    )
)

Model Accuracy: 0.6039
Classification Report:
                precision    recall  f1-score   support

          ARTS       0.34      0.24      0.28       290
ARTS & CULTURE       0.39      0.24      0.30       254
  BLACK VOICES       0.49      0.40      0.44       909
      BUSINESS       0.48      0.46      0.47      1237
       COLLEGE       0.52      0.34      0.41       234
        COMEDY       0.50      0.46      0.48      1087
         CRIME       0.54      0.55      0.54       729
CULTURE & ARTS       0.60      0.25      0.36       232
       DIVORCE       0.82      0.69      0.75       695
     EDUCATION       0.41      0.29      0.34       199
 ENTERTAINMENT       0.62      0.74      0.67      3508
   ENVIRONMENT       0.43      0.26      0.32       289
         FIFTY       0.34      0.16      0.22       280
  FOOD & DRINK       0.65      0.72      0.68      1275
     GOOD NEWS       0.42      0.27      0.33       270
         GREEN       0.39      0.34      0.36       539
H