#### **1. Import Necessary Modules**

In [3]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### **2. Read The Data**

In [4]:
train_path = '/home/v/news-article-classification/source/train/train.csv'
test_path = '/home/v/news-article-classification/source/test/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(f"Train Data --> (rows: {train_df.shape[0]}, columns: {train_df.shape[1]})")
print(f"Test Data --> (rows: {test_df.shape[0]}, columns: {test_df.shape[1]})")


Train Data --> (rows: 1490, columns: 3)
Test Data --> (rows: 735, columns: 2)


In [5]:
train_df.columns = [column.lower() for column in train_df.columns]
test_df.columns = [column.lower() for column in test_df.columns]
print(f"Train dataset Columns: {list(train_df.columns)}")
print(f"Test dataset Columns: {list(test_df.columns)}")

Train dataset Columns: ['articleid', 'text', 'category']
Test dataset Columns: ['articleid', 'text']


In [6]:
train_df.head()

Unnamed: 0,articleid,text,category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


#### **EDA**

In [7]:
import wordcloud
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords', download_dir=os.curdir)
nltk.download('wordnet', download_dir=os.curdir)
nltk.download('punkt', download_dir=os.curdir)
nltk.download('averaged_perceptron_tagger',download_dir=os.curdir)


[nltk_data] Downloading package stopwords to ....
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ....
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to ....
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to ....
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

##### **Basic Feature Extraction - 1**

###### **1. Stop Words**

In [8]:
stop = stopwords.words('english')
train_df['stopwords'] = train_df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))

##### **2. Number of Punctuations**

In [9]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

train_df['punctuations'] = train_df['text'].apply(lambda x:count_punct(x))

###### Number of HashTag Characters

In [10]:
train_df['hashtags'] = train_df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

###### **3. Number of Numerics**

In [11]:
train_df['numerics'] = train_df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

###### **4. Upper Case Words**

In [12]:
train_df['upper'] = train_df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

In [13]:
train_df.head()

Unnamed: 0,articleid,text,category,stopwords,punctuations,hashtags,numerics,upper
0,1833,worldcom ex-boss launches defence lawyers defe...,business,108,22,0,4,0
1,154,german business confidence slides german busin...,business,120,25,0,2,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,220,36,0,17,0
3,1976,lifestyle governs mobile choice faster bett...,tech,276,42,0,4,0
4,917,enron bosses in $168m payout eighteen former e...,business,142,31,0,3,0


#### **Text Cleaning**

###### **1. Text to Lower Case**

In [14]:
train_df['text'][0]



In [58]:
import re
add_words = ["mr","also","would","could","say"]
stop_words = set(stopwords.words("english"))
stop_added = stop_words.union(add_words)
def clean_text(text):
    text = " ".join([x.lower() for x in text.split()])
    text = " ".join(text.replace('uk','UnitedKingdom') for text in text.split())
    text = " ".join(text.replace('us','UnitedStates') for text in text.split())
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' %re.escape(string.punctuation),'', text)
    text = re.sub(r'[^a-zA-z]?\w*\d\w','', text)
    text = re.sub(r'\S*https?:\S*','', text)
    text = re.sub(r'<.*?>','', text)
    text = re.sub(r'\n',' ', text)
    text = re.sub("[''""...“”‘’…]", '', text)
    text = ' '.join([text for text in text.split() if text not in stop_added])
    text = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE).sub(r'', text) #emojis and symbols
    text = text.strip()
    text = ' '.join([text.strip() for text in text.split()])
    return text

In [66]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    # Map POS tag to first character used by wordnet.lemmatize()
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_with_pos(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # POS tag tokens
    tagged_tokens = nltk.pos_tag(tokens)
    # Create lemmatizer object
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each token with POS tag
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]
    # Join lemmatized tokens back into text
    lemmatized_text = " ".join([token for token in lemmatized_tokens if token not in stop_added])
    return lemmatized_text

In [68]:
import nltk

nltk.download('averaged_perceptron_tagger', download_dir=os.curdir)


[nltk_data] Downloading package averaged_perceptron_tagger to ....
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [69]:
train_df['clean_text'] = train_df['text'].apply(lambda x: clean_text(x))

In [70]:
train_df['clean_text'] = train_df['clean_text'].apply(lambda x: lemmatize_with_pos(x))

In [71]:
train_df['clean_text'][4]

'enron boss payout eighteen former enron director agree settlement deal shareholder lawsuit collapse energy firm lead plaintiff university california announce news add former director pay pocket settlement put court approval next week enron go bankrupt emerge hidden hundred million dollar debt collapse firm seventh big public UnitedStates company revenue demise send shockwaves financial market dent investor confidence corporate america settlement significant hold outside director least partially personally responsible william lerach lawyer lead class action suit enron hopefully help send message corporate boardroom importance director perform legal duty add term settlement cover insurance none former director admit wrongdoing deal fourth major settlement negotiate lawyer file class action behalf enron shareholder almost three year ago far include late deal jUnitedStatest retrieve investor however late deal include former enron chief executive ken lay jeff skilling men face criminal cha

In [72]:
pd.Series(" ".join(train_df['clean_text']).split()).value_counts()[:50]

year              1852
make              1440
new               1340
people            1323
UnitedStates      1214
one               1187
take              1150
go                1130
game              1030
get               1008
time               951
last               891
first              890
two                815
world              808
film               808
come               775
government         771
show               762
UnitedKingdom      756
play               734
work               718
company            681
firm               673
give               654
tell               643
see                643
jUnitedStatest     635
number             622
well               621
win                620
service            614
best               605
back               590
want               583
plan               579
good               576
include            572
country            562
many               561
market             559
like               555
month              553
add        

In [73]:
from textblob import TextBlob

train_df['clean_text'].apply(lambda x: str(TextBlob(x).correct()))
#taking long time


KeyboardInterrupt: 