# Data Science Assessment: News Article Analysis & Classification

### Data Preprocessing & Storage

In [54]:
!pip install datasets



In [43]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
from datasets import load_dataset
import pandas as pd

# Load the AG News dataset (train and test splits)
dataset = load_dataset("fancyzhx/ag_news")

# Convert each split to a pandas DataFrame, keeping the raw column names
df_train_raw = dataset["train"].to_pandas()
df_test_raw = dataset["test"].to_pandas()

# Verify column names as delivered by the dataset
print("Columns in dataset:", df_train_raw.columns)


def _with_title_and_text(raw_frame):
    """Return a frame with columns ``Title``, ``label`` and ``text``.

    The dataset's ``text`` column is exposed under the name ``Title`` and a
    ``text`` column with the same content is kept for the downstream cells.
    Columns are selected and renamed *by name* (not by position), so an
    unexpected column order or a missing column raises a ``KeyError``
    instead of silently mislabelling the data.
    """
    return (
        raw_frame[["text", "label"]]
        .rename(columns={"text": "Title"})
        .assign(text=lambda frame: frame["Title"])
    )


df_train = _with_title_and_text(df_train_raw)
df_test = _with_title_and_text(df_test_raw)

# Display dataset sample
print(df_train.head())


Columns in dataset: Index(['text', 'label'], dtype='object')
                                               Title  label  \
0  Wall St. Bears Claw Back Into the Black (Reute...      2   
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2   
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2   
3  Iraq Halts Oil Exports from Main Southern Pipe...      2   
4  Oil prices soar to all-time record, posing new...      2   

                                                text  
0  Wall St. Bears Claw Back Into the Black (Reute...  
1  Carlyle Looks Toward Commercial Aerospace (Reu...  
2  Oil and Economy Cloud Stocks' Outlook (Reuters...  
3  Iraq Halts Oil Exports from Main Southern Pipe...  
4  Oil prices soar to all-time record, posing new...  


### Understanding the dataset structure


In [66]:
# Human-readable names for the integer class labels. Defined here so the cell
# runs on a fresh kernel (Restart & Run All) instead of relying on a variable
# left over from another cell. Ids follow the AG News label order.
category_map = {0: "World", 1: "Sports", 2: "Business", 3: "Science/tech"}

# Check dataset information (row count, dtypes, non-null counts)
df_train.info()

# Check unique labels
print("-----------------------------------------------------------------")
print("Unique Labels:", df_train["label"].unique())

print("-----------------------------------------------------------------")
# Check class distribution, shown with readable category names
df_train["label"].map(category_map).value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Title   120000 non-null  object
 1   label   120000 non-null  int64 
 2   text    120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB
-----------------------------------------------------------------
Unique Labels: [2 3 1 0]
-----------------------------------------------------------------


Business        30000
Science/tech    30000
Sports          30000
World           30000
Name: label, dtype: int64

### Text cleaning pipeline

#### This cleaning pipeline lowercases the article text and removes special characters, numbers, and English stopwords from the title/description of each article


In [68]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# Cache so the NLTK stopword corpus is read once, not once per word
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


# Function to clean text
def clean_text(text, stop_words=None):
    """Normalise a piece of text for downstream analysis.

    Steps, in order: lowercase the text, replace every non-word character
    with a space, delete runs of digits, then drop stopwords and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to remove. When omitted, the NLTK English stopword list is
        used (loaded once and cached as a set, so membership tests are O(1)
        instead of re-reading the corpus for every word).

    Returns
    -------
    str
        The cleaned text; an empty string if nothing remains.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove stopwords; split()/join also collapses repeated whitespace
    return ' '.join(word for word in text.split() if word not in stop_words)

# Add a "clean_text" column to both splits by running clean_text on each row
for split_frame in (df_train, df_test):
    split_frame["clean_text"] = split_frame["text"].apply(clean_text)

# Show the raw text next to its cleaned version for the first training rows
text_preview = df_train[["text", "clean_text"]].head()
print(text_preview)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  Wall St. Bears Claw Back Into the Black (Reute...   
1  Carlyle Looks Toward Commercial Aerospace (Reu...   
2  Oil and Economy Cloud Stocks' Outlook (Reuters...   
3  Iraq Halts Oil Exports from Main Southern Pipe...   
4  Oil prices soar to all-time record, posing new...   

                                          clean_text  
0  wall st bears claw back black reuters reuters ...  
1  carlyle looks toward commercial aerospace reut...  
2  oil economy cloud stocks outlook reuters reute...  
3  iraq halts oil exports main southern pipeline ...  
4  oil prices soar time record posing new menace ...  
