In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# News Articles Categorization
**Thomas Bohn**   --   **2023-08-25**

A report on modeling news article categorization for BBC News, focused on the application of natural language processing, unsupervised learning with matrix factorization, and a comparison to supervised learning.

--  [Main Report](https://github.com/TOM-BOHN/MsDS-news-articles-categorization/blob/main/news-articles-categorization.ipynb)  --  [Github Repo](https://github.com/TOM-BOHN/MsDS-news-articles-categorization)  --  [Presentation Slides](TBD)  --  [Presentation Video](TBD) --

# 1.&nbsp;Introduction

## Python Libraries

The following Python libraries are used in this notebook.

In [2]:
# File Connection and File Manipulation
import os
# from google.colab import drive

# Basic Data Science Toolkits
import pandas as pd
import numpy as np
import math
import time

# Data Visualization
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib import ticker
import seaborn as sns

# Text Mining
import string
from textblob import TextBlob


# Data Models
import sklearn
from sklearn.ensemble import RandomForestClassifier

# Data Model Scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve

# Import Data Model Evaluations
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import make_scorer

# Cross Validation, Grid Search, and K-Fold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold



# 2.&nbsp;Data Source

In [3]:
# Kaggle mounts competition data read-only under "../input/"
# Walk that directory tree and print the full path of every file found
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv


In [4]:
# Build the full path of each competition file from the shared dataset folder
path_dir = '/kaggle/input/learn-ai-bbc/'
train_path = f'{path_dir}BBC News Train.csv'
test_path = f'{path_dir}BBC News Test.csv'
sample_solution_path = f'{path_dir}BBC News Sample Solution.csv'

In [5]:
# Read each competition CSV into its own DataFrame
df_train = pd.read_csv(filepath_or_buffer=train_path)
df_test = pd.read_csv(filepath_or_buffer=test_path)
df_sample_solution = pd.read_csv(filepath_or_buffer=sample_solution_path)

In [6]:
# Create a function to print key table details
def quick_table_details (df_name, df, level_of_detail = 10):
  """Print a summary of a DataFrame whose depth grows with ``level_of_detail``.

  Levels are cumulative; each level adds one more section:
    1 - table name plus column and row counts
    2 - number of columns per dtype
    3 - the column index
    4 - ``describe(include='all')`` followed by ``info()``
    5 - the first and last three rows

  Args:
    df_name: Label printed in the header line.
    df: The pandas DataFrame to summarize.
    level_of_detail: Highest section to print; values below 1 print nothing.

  Returns:
    None. Everything is written to standard output.
  """
  separator = '----------------------------'
  # describe the shape and column summary
  if level_of_detail >= 1:
    print('\n####', df_name, '####')
    num_rows, num_cols = df.shape
    print('number of features (columns) = ' + str(num_cols))
    print('number of observations (rows) = ' + str(num_rows))
    print(separator, '\n')
  # print the datatype counts
  if level_of_detail >= 2:
    print('DataType Counts:')
    print(df.dtypes.value_counts())
    print(separator, '\n')
  # print a full list of column names
  if level_of_detail >= 3:
    print('Columns:')
    print(df.columns)
    print(separator, '\n')
  #  expanded table details
  if level_of_detail >= 4:
    print('Description:')
    print(df.describe(include='all'))
    print(separator, '\n')
    print('Info:')
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line after the report.
    df.info()
    print(separator, '\n')
  #  table records preview
  if level_of_detail >= 5:
    print('Table Preview:')
    x_records = 3
    print(df.head(x_records))
    print('....')
    print(df.tail(x_records))
    print(separator, '\n')
        

In [7]:
# Summarize every loaded table at the fullest level of detail
for table_name, table in [
    ('df_train', df_train),
    ('df_test', df_test),
    ('df_sample_solution', df_sample_solution),
]:
    quick_table_details (table_name, table, level_of_detail = 10)


#### df_train ####
number of features (columns) = 3
number of observations (rows) = 1490
---------------------------- 

DataType Counts:
object    2
int64     1
dtype: int64
---------------------------- 

Columns:
Index(['ArticleId', 'Text', 'Category'], dtype='object')
---------------------------- 

Description:
          ArticleId                                               Text  \
count   1490.000000                                               1490   
unique          NaN                                               1440   
top             NaN  microsoft seeking spyware trojan microsoft is ...   
freq            NaN                                                  2   
mean    1119.696644                                                NaN   
std      641.826283                                                NaN   
min        2.000000                                                NaN   
25%      565.250000                                                NaN   
50%     1112.50000

# 3.&nbsp;Exploratory Data Analysis (EDA)

# 4.&nbsp;Data Cleansing

# 5.&nbsp;Feature Engineering

# 6.&nbsp;Model: Unsupervised: Non-Negative Matrix Factorization (NMF)

# 7.&nbsp;Model: Unsupervised: Singular Value Decomposition (SVD)

# 8.&nbsp; Model: Supervised: TBD

# 9.&nbsp; Model Comparison

# 10.&nbsp; Results

# 11.&nbsp; References

**Competition**
- [] BBC News Classification: News Articles Categorization https://www.kaggle.com/competitions/learn-ai-bbc/overview

**Documentation and References**
- [] sklearn.decomposition.TruncatedSVD https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
- []
- []
- []
- []

**Other Kaggle Reports**
- [] Kaggle: bbc article classification https://www.kaggle.com/code/scrambledgabs/bbc-article-classification
- [] Kaggle: BBC News Classification https://www.kaggle.com/code/conradkleykamp/bbc-news-classification#Unsupervised-Model:-Matrix-Factorization-(NMF)
- [] Kaggle: Unsupervised - Matrix Factorization https://www.kaggle.com/code/mattison/unsupervised-matrix-factorization
- [] Kaggle: Make a Classification for BBC News BY LSTM https://www.kaggle.com/code/mohamedbakrey/make-a-classification-for-bbc-news-by-lstm