# News Classification Based on Headlines

### This notebook outlines the process for classifying news articles by their headlines using text mining and NLP techniques.

# Import necessary libraries

In [96]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


# Load the dataset

In [97]:
# Read uci-news-aggregator.csv, loading only the two columns used below:
# the headline text (TITLE) and its category label (CATEGORY).
dataframe = pd.read_csv(
    'uci-news-aggregator.csv',
    usecols=['TITLE', 'CATEGORY'],
    encoding="utf8",
)
dataframe.columns

Index(['TITLE', 'CATEGORY'], dtype='object')

In [98]:
# Show the loaded frame as the cell output; the notebook renders an
# abbreviated view (first and last rows) rather than every row.
dataframe

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b
...,...,...
422414,Surgeons to remove 4-year-old's rib to rebuild...,m
422415,Boy to have surgery on esophagus after battery...,m
422416,Child who swallowed battery to have reconstruc...,m
422417,Phoenix boy undergoes surgery to repair throat...,m


# Data preprocessing

In [99]:
# Preprocessing
# Report whether any cell is null; when some are, list the null count per column.
if not dataframe.isnull().values.any():
    print('NO missing data')
else:
    print('Missing Data\n')
    print(dataframe.isnull().sum())
    

NO missing data


In [100]:
# Drop rows that repeat an earlier row (the first occurrence is kept),
# then renumber the index so it is contiguous again.
if not dataframe.duplicated().any():
    print('NO duplicate data')
else:
    print('Duplicate rows found')
    print('Number of duplicate rows= ', dataframe.duplicated().sum())
    # Both calls modify `dataframe` in place, so later cells see the cleaned frame.
    dataframe.drop_duplicates(keep='first', inplace=True)
    dataframe.reset_index(drop=True, inplace=True)
    print('Dropping duplicates\n')
    print(dataframe.shape)
    

Duplicate rows found
Number of duplicate rows=  15112
Dropping duplicates

(407307, 2)


In [101]:
# Download the NLTK resources used by the cleaning step:
#   stopwords - English stop-word list used to filter tokens
#   punkt     - tokenizer models used by word_tokenize (NLTK < 3.9)
#   punkt_tab - tokenizer tables that word_tokenize loads in NLTK >= 3.9;
#               without it, recent NLTK releases raise LookupError
#   wordnet   - lexical database used by WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Parameow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Parameow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Parameow\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import set_config

# Globally request pandas containers from scikit-learn transformers' output.
# NOTE(review): the vectorised result below is later converted with
# .toarray(), which suggests it is a sparse matrix - confirm whether this
# setting has any effect on this pipeline.
set_config(transform_output="pandas")

# One shared lemmatizer instance, reused by tokenize() for every headline.
wnl = WordNetLemmatizer()


# Resources for tokenize(), built once instead of once per token/call.
# A frozenset gives O(1) membership tests; the original re-read the
# stop-word list and scanned it linearly for every single token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tokenize(doc):
    """Clean a headline and split it into lemmatized, stop-word-free tokens.

    Steps, in order: lowercase the text, delete every run of digits, delete
    all ASCII punctuation characters, strip leading/trailing whitespace,
    tokenize with NLTK's ``word_tokenize``, discard English stop words, and
    lemmatize each remaining token with the shared ``wnl`` lemmatizer.

    Parameters
    ----------
    doc : str
        Raw headline text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    document = _DIGITS_RE.sub('', doc.lower())
    document = document.translate(_PUNCTUATION_TABLE).strip()
    return [
        wnl.lemmatize(token)
        for token in word_tokenize(document)
        if token not in _ENGLISH_STOPWORDS
    ]


# The preprocessing pipeline: raw headline -> token counts -> TF-IDF weights.
preprocessor = Pipeline([
    # The custom tokenizer replaces the built-in regex tokenization, so
    # token_pattern is set to None explicitly; leaving the default makes
    # scikit-learn warn that token_pattern will be ignored.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),
])

# NOTE(review): the pipeline is fitted on every headline, and the train/test
# split happens afterwards, so the vocabulary and IDF weights are learned
# from rows that later land in the test set. To avoid that leakage, split
# first, fit_transform on the training titles, and only transform the test
# titles.
tfidf_dataset = preprocessor.fit_transform(dataframe["TITLE"].values)



# Training Model

## Label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the CATEGORY strings as integer class ids for the classifiers.
# After fitting, `le.classes_` holds the original labels and
# `le.inverse_transform` maps ids back to them.
# (The unused `from tkinter.constants import Y` import was dropped: nothing
# in the notebook uses it, and it fails on Python builds without Tk.)
le = LabelEncoder()
class_label = le.fit_transform(dataframe["CATEGORY"])
class_label

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# The TF-IDF features are passed as-is instead of being densified with
# .toarray(): a dense rows x vocabulary array for a corpus this size is far
# too large for memory. train_test_split and the three classifiers below
# all accept sparse input.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_dataset,
    class_label,
    test_size=0.3,
    random_state=42
)


# Decision Tree Classifier

In [None]:
# Decision tree using Gini impurity and best-split selection.
# random_state is fixed (same seed as the train/test split) so that the
# fitted tree and the reported accuracy are reproducible between runs.
dt_classifier = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=42)
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)

# Evaluation: fraction of held-out headlines assigned the correct category
print("accuracy score of Decision Tree:")
print(accuracy_score(y_test, dt_predictions))


# Multinomial Naive Bayes Classifier

In [None]:
# Fit multinomial Naive Bayes on the training split (fit returns the
# estimator itself), then predict labels for the held-out split.
nb_classifier = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)

# Evaluation: heading on one line, accuracy on the next
print(
    "accuracy score of Multinomial Naive Bayes:",
    accuracy_score(y_test, nb_predictions),
    sep="\n",
)


# Artificial Neural Network

In [None]:
# Multi-layer perceptron with scikit-learn's default architecture.
# random_state is fixed (same seed as the train/test split) so that weight
# initialisation and batch shuffling, and therefore the reported accuracy,
# are reproducible between runs.
nn_classifier = MLPClassifier(random_state=42)
nn_classifier.fit(X_train, y_train)
nn_predictions = nn_classifier.predict(X_test)

# Evaluation: fraction of held-out headlines assigned the correct category
print("accuracy score of Artificial Neural Network:")
print(accuracy_score(y_test, nn_predictions))
