# Problem Statement:
In today’s world, data is power. With News companies having terabytes of data stored in
servers, everyone is in the quest to discover insights that add value to the organization.
With various examples to quote in which analytics is being used to drive actions, one that
stands out is news article classification.
Nowadays on the Internet there are a lot of sources that generate immense amounts of
daily news. In addition, the demand for information by users has been growing
continuously, so it is crucial that the news is classified to allow users to access the
information of interest quickly and effectively. This way, the machine learning model for
automated news classification could be used to identify topics of untracked news and/or
make individual suggestions based on the user’s prior interests.

## EDA

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords
# import gensim
import wordcloud
import textblob
# import spacy
# import textstat
# import pyLDAvis

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/v/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/v/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv("/home/v/news-article-classification/source/train/train.csv", names=['articleid', 'text', 'category'])
# df.columns = [column.lower() for column in df.columns]
df.head()
# df.columns

Unnamed: 0,articleid,text,category
0,ArticleId,Text,Category
1,1833,worldcom ex-boss launches defence lawyers defe...,business
2,154,german business confidence slides german busin...,business
3,1101,bbc poll indicates economic gloom citizens in ...,business
4,1976,lifestyle governs mobile choice faster bett...,tech


In [4]:
if df['articleid'].astype(str).str.isnumeric().all():
    # Convert the values in the column to integer
    df['articleid'] = df['articleid'].astype(int)

In [6]:
df['articleid'] = pd.to_numeric(df['articleid'])

ValueError: Unable to parse string "ArticleId" at position 0

In [1]:
if df['articleid'].astype(str).str.isnumeric().all():
    # Convert the values in the column to integer
    df['articleid'] = df['articleid'].astype(int)

NameError: name 'df' is not defined

In [55]:
df['articleid'] = df['articleid'].astype(int)

ValueError: invalid literal for int() with base 10: 'ArticleId'

In [18]:
df = pd.read_csv('/home/v/news-article-classification/artifacts/02_04_2023_20_58_26/data_ingestion/ingested/train.csv')

In [19]:
df.dtypes

ArticleId     int64
Category     object
Text         object
dtype: object

In [15]:
from typing import List

In [34]:
def duplicate_articleids(dataframe)-> List[int]:
    duplicates = dataframe[dataframe.duplicated(subset='ArticleId')]['ArticleId'].values
    # number_of_duplicates = len(duplicates)
    return list(duplicates)

In [35]:
duplicate_articleids(df)

[1988, 252, 474]

In [78]:
import yaml
from news.utils.main_utils import read_yaml_file

yaml_content = read_yaml_file('/home/v/news-article-classification/config/schema.yaml')
# print(file['columns'])
columns = {column['name']: column['type'] for column in yaml_content['columns']}
columns = {column["name"]: column["type"] for column in yaml_content["columns"]}
columns

{'articleId': 'int', 'Text': 'object', 'Category': 'object'}

In [73]:
[column['name'] for column in yaml_content['columns']]


['ArticleId', 'Text', 'Category']

In [77]:
def column_name_validation(dataframe)-> bool:
    excpected_columns = [column['name'] for column in yaml_content['columns']]
    if set(dataframe.columns) != set(excpected_columns):
        return False
    return True


In [79]:
column_name_validation(df)

False

In [82]:
list(df['Category'].unique())

['politics', 'business', 'entertainment', 'tech', 'sport']

In [85]:
def target_label_validation(dataframe, target)-> bool:
    expected_labels = ['politics', 'business', 'entertainment', 'tech', 'sport']
    if set(dataframe[target].unique()) != set(expected_labels):
        return False
    return True

In [86]:
target_label_validation(df,'Category')

False

In [66]:
def datatype_validation(dataframe)-> bool:
        columns = {column['name']: column['type'] for column in yaml_content['columns']}
        for column, expected_type in columns.items():
            if dataframe[column].dtypes != expected_type:
                return False
        return True

In [67]:
datatype_validation(df)

False

In [43]:
# Columns and their expected data types from the YAML file
columns = {
    "ArticleId": int,
    "Text": object,
    "Category": object
}

# Create a new data frame to hold the mismatched records
mismatched = pd.DataFrame(columns=df.columns)

# Check if the data types of the columns in the data frame match the expected data types
for column, expected_type in columns.items():
    if df[column].dtypes != expected_type:
        mismatched = mismatched.append(df[column])

# Write the mismatched records to a CSV file
mismatched.to_csv("mismatched_records.csv", index=False)

In [36]:
df.drop_duplicates(subset='ArticleId', inplace=True)

In [37]:
duplicate_articleids(df)

[]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1491 entries, 0 to 1490
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   articleid  1491 non-null   object
 1   text       1491 non-null   object
 2   category   1491 non-null   object
dtypes: object(3)
memory usage: 35.1+ KB


In [4]:
def null_count(df):
    return df.isna().sum()

null_count(df)

articleid    0
text         0
category     0
dtype: int64

### Count the Stop Words

In [5]:
df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['text', 'stopwords']]


Unnamed: 0,text,stopwords
0,Text,0
1,worldcom ex-boss launches defence lawyers defe...,108
2,german business confidence slides german busin...,120
3,bbc poll indicates economic gloom citizens in ...,220
4,lifestyle governs mobile choice faster bett...,276
...,...,...
1486,double eviction from big brother model caprice...,97
1487,dj double act revamp chart show dj duo jk and ...,237
1488,weak dollar hits reuters revenues at media gro...,87
1489,apple ipod family expands market apple has exp...,230


### Number of Punctuations

In [6]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

df['punctuations'] = df['text'].apply(lambda x: count_punct(x))

In [7]:
df[['text','punctuations']]

Unnamed: 0,text,punctuations
0,Text,0
1,worldcom ex-boss launches defence lawyers defe...,22
2,german business confidence slides german busin...,25
3,bbc poll indicates economic gloom citizens in ...,36
4,lifestyle governs mobile choice faster bett...,42
...,...,...
1486,double eviction from big brother model caprice...,22
1487,dj double act revamp chart show dj duo jk and ...,35
1488,weak dollar hits reuters revenues at media gro...,24
1489,apple ipod family expands market apple has exp...,33


### Number of Numerics

In [8]:
df['numerics'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['text','numerics']]

Unnamed: 0,text,numerics
0,Text,0
1,worldcom ex-boss launches defence lawyers defe...,4
2,german business confidence slides german busin...,2
3,bbc poll indicates economic gloom citizens in ...,17
4,lifestyle governs mobile choice faster bett...,4
...,...,...
1486,double eviction from big brother model caprice...,2
1487,dj double act revamp chart show dj duo jk and ...,7
1488,weak dollar hits reuters revenues at media gro...,7
1489,apple ipod family expands market apple has exp...,9


### Number of Upper Case Words

In [9]:
df['upper'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['text', 'upper']]

Unnamed: 0,text,upper
0,Text,0
1,worldcom ex-boss launches defence lawyers defe...,0
2,german business confidence slides german busin...,0
3,bbc poll indicates economic gloom citizens in ...,0
4,lifestyle governs mobile choice faster bett...,0
...,...,...
1486,double eviction from big brother model caprice...,0
1487,dj double act revamp chart show dj duo jk and ...,0
1488,weak dollar hits reuters revenues at media gro...,0
1489,apple ipod family expands market apple has exp...,0


## Data Cleaning

### Text to Lower Case

In [10]:
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['text'].head()

0                                                 text
1    worldcom ex-boss launches defence lawyers defe...
2    german business confidence slides german busin...
3    bbc poll indicates economic gloom citizens in ...
4    lifestyle governs mobile choice faster better ...
Name: text, dtype: object

### Removing Punctuations

In [11]:
df['text'] = df['text'].str.replace("[^\w\s]","")
df['text'].head()

0                                                 text
1    worldcom exboss launches defence lawyers defen...
2    german business confidence slides german busin...
3    bbc poll indicates economic gloom citizens in ...
4    lifestyle governs mobile choice faster better ...
Name: text, dtype: object

### Removing Stop Words

In [12]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['text'].head()

0                                                 text
1    worldcom exboss launches defence lawyers defen...
2    german business confidence slides german busin...
3    bbc poll indicates economic gloom citizens maj...
4    lifestyle governs mobile choice faster better ...
Name: text, dtype: object

### Remove URLs

In [13]:
import re
def remove_url(text):
    return re.sub(r'\S*https?:\S*','', text)

In [14]:
df['text'] = df['text'].apply(lambda x: remove_url(x))
df['text'].head()

0                                                 text
1    worldcom exboss launches defence lawyers defen...
2    german business confidence slides german busin...
3    bbc poll indicates economic gloom citizens maj...
4    lifestyle governs mobile choice faster better ...
Name: text, dtype: object

### Remove HTML Tags

In [15]:
def remove_html(text):
    return re.sub(r"<.*?>","", text)

In [16]:
df['text'] = df['text'].apply(lambda x: remove_html(x))
df['text']

0                                                    text
1       worldcom exboss launches defence lawyers defen...
2       german business confidence slides german busin...
3       bbc poll indicates economic gloom citizens maj...
4       lifestyle governs mobile choice faster better ...
                              ...                        
1486    double eviction big brother model caprice holb...
1487    dj double act revamp chart show dj duo jk joel...
1488    weak dollar hits reuters revenues media group ...
1489    apple ipod family expands market apple expande...
1490    santy worm makes unwelcome visit thousands web...
Name: text, Length: 1491, dtype: object

### Remove Emoji's

In [17]:
def remove_emoji(text):
    return re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE).sub(r'', text)

In [18]:
df['text'] = df['text'].apply(lambda x: remove_emoji(x))
df['text'].head()

0                                                 text
1    worldcom exboss launches defence lawyers defen...
2    german business confidence slides german busin...
3    bbc poll indicates economic gloom citizens maj...
4    lifestyle governs mobile choice faster better ...
Name: text, dtype: object

### Spell Correction

In [19]:
# from textblob import TextBlob
# df['text'].apply(lambda x: str(TextBlob(x).correct()))

###

### Lemmatization with pos

In [20]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

def get_wordnet_pos(word):
    treebank_tag = nltk.pos_tag([word])[0][1]

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

lemmatizer = WordNetLemmatizer()
   
def lemma_clean_text(text, cores=1):
    sample = text
    sample = sample.split()
    sample = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(word.lower())) for word in sample]
    sample = ' '.join(sample)
    return sample


In [21]:
stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
def correct_text(text, stem=False, lemma=False, spell=False):
    if lemma and stem:
        raise Exception('Either stem or lemma can be true, not both!')
        return text
    
    sample = text
    
    #removing stopwords
    sample = sample.lower()
    sample = [word for word in sample.split() if not word in stops]
    sample = ' '.join(sample)
    
    if lemma:
        sample = sample.split()
        sample = [lemmatizer.lemmatize(b) for word in sample]
        sample = ' '.join(sample)
        
    if stem:
        sample = sample.split()
        sample = [ps.stem(word) for word in sample]
        sample = ' '.join(sample)
    
    if spell:
        sample = str(TextBlob(text).correct())
    
    return sample

In [22]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Define a function to convert part of speech tags to WordNet part of speech
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

# Create a WordNetLemmatizer object
lemmatizer = WordNetLemmatizer()

# Define a sample sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize and tag the sentence
tagged_tokens = pos_tag(sentence.split())

# Iterate over the tagged tokens and lemmatize each token
lemmatized_tokens = [(lemmatizer.lemmatize(token[0], pos=get_wordnet_pos(token[1])), token[1]) for token in tagged_tokens[:-1]]

# Print the original tokens, the lemmatized tokens, the part of speech tags, and the WordNet part of speech
for token in tagged_tokens[:-1]:
    print(f"Original token: {token[0]}")
    print(f"Lemmatized token: {lemmatizer.lemmatize(token[0], pos=get_wordnet_pos(token[1]))}")
    print(f"Part of speech tag: {token[1]}")
    print(f"WordNet part of speech: {get_wordnet_pos(token[1])}")
    print()


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/home/v/nltk_data'
    - '/home/v/news-article-classification/venv/nltk_data'
    - '/home/v/news-article-classification/venv/share/nltk_data'
    - '/home/v/news-article-classification/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  # Tokenize the text
  tokens = nltk.word_tokenize(text)
  
  # Add part-of-speech tags
  tagged_tokens = nltk.pos_tag(tokens)
  
  # Lemmatize the tokens
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = [(lemmatizer.lemmatize(token[0], pos=get_wordnet_pos(token[1])), token[1]) for token in tagged_tokens]
  
  return lemmatized_tokens

def get_wordnet_pos(treebank_tag):
  if treebank_tag.startswith('J'):
    return wordnet.ADJ
  elif treebank_tag.startswith('V'):
    return wordnet.VERB
  elif treebank_tag.startswith('N'):
    return wordnet.NOUN
  elif treebank_tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN

text = "This is a sample sentence with some lemmatization."
lemmatized_tokens = preprocess(text)
print(lemmatized_tokens)


In [None]:
df['text'] = df['text'].apply(lambda x: correct_text(x, lemma=True))

In [None]:
df

In [None]:
df['category'].value_counts().plot(kind='bar')

In [None]:
plt.figure(figsize=(4,3))
df['text'].str.len().hist()
plt.xlim([0,6500])
plt.title("Histogram of Sentence Length")
plt.show()

In [None]:
df['text'].str.split().map(lambda x: len(x))

In [None]:
plt.figure(figsize=(4,4))
plt.title("Histogram of Number of Words in Each Sentence")
df['text'].str.split().map(lambda x: len(x)).hist(bins=20)
plt.xlim([0,1200])
plt.show()

In [None]:
plt.figure(figsize=(4,4))
df['text'].str.split().apply(lambda x: [len(i) for i in x]).map(lambda x: np.mean(x)).hist()
plt.title("Histogram of Average Word Length in Each Sentence")
plt.show()

From the Above figure we can conclude below
* Each Sentence length in a news article is ranging from 500 to 6000
* The number of words in each sentence is ranging from 100 to 1100
* Average Word length in each sentence is ranging from 3.5 to 5.5

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

def prior_eda(df, column):
    df['stopwords'] = df[column].apply(lambda x: len([x for x in x.split() if x in stop]))
    df['punctuations'] = df[column].apply(lambda x: count_punct(x))
    df['numerics'] = df[column].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    df['upper'] = df[column].apply(lambda x: len([x for x in x.split() if x.isupper()]))
    return df    

In [None]:
import re 
from textblob import TextBlob
nltk.download('words')

def remove_url(text):
    return re.sub(r'\S*https?:\S*','', text)

def remove_html(text):
    return re.sub(r"<.*?>","", text)

def remove_emoji(text):
    return re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE).sub(r'', text)

In [None]:
def clean_text(text):
    

In [None]:
data['cleaned_text'] = df['text'].apply(lambda x: clean_text(x))
data.head()

In [None]:
def get_wordnet_pos(word):
    treebank_tag = nltk.pos_tag([word])[0][1]
    

In [None]:
def posterior_eda(df, column):
    fig, (ax1, ax2, ax3) = plt.subplots(1,3)
    fig.suptitle("Sentence, Word, Average Word Lengths")
    ax1.hist(df[column].str.len())
    ax2.hist(df[column].str.split().map(lambda x: len(x)))
    ax3.hist(df[column].str.split().apply(lambda x: [len(i) for i in x]).map(lambda x: np.mean(x)))
    plt.show()

In [None]:
def preprocess(text):
    text = " ".join([word.lower() for word in text.split()])
    text = re.sub(r"\S*https?:\S*",'',text)
    text = re.sub(r"<.*?>",'',text)
    text = re.sub('[%s]' %re.escape(string.punctuation), '', text)
    text = re.sub(r'\n','',text)
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    return text

In [None]:
df = pd.read_csv("/config/workspace/source/train/train.csv")
df.columns = [column.lower() for column in df.columns]
df['text'] = df['text'].apply(lambda x:preprocess(x))
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
X = df['text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=19)

In [None]:
y_train.value_counts(), y_test.value_counts()

In [None]:
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, confusion_matrix

nb_model = MultinomialNB()
nb_model.fit(X_train,y_train)
y_pred_nb = model.predict(X_test)
confusion_matrix(y_test,y_pred_nb)


In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)
y_pred_rf = model.predict(X_test)
confusion_matrix(y_test,y_pred_rf)


In [None]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)
y_pred_xgb = model.predict(X_test)
confusion_matrix(y_test,y_pred_xgb)


In [None]:
print(f1_score(y_test,y_pred_nb, average='weighted'))
print(f1_score(y_test,y_pred_rf, average='weighted'))
print(f1_score(y_test, y_pred_xgb, average='weighted'))

In [None]:
pwd

In [None]:
import pickle
pickle.dump(nb_model,open('/config/workspace/models/nb_model.pkl','wb'))

In [None]:
y_pred = model.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
print(f"{accuracy_score(y_test,y_pred)}")

In [None]:
f1_score(y_test, y_pred, average='weighted')