In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('news_dataset.csv')
df.shape

(115233, 4)

In [3]:
df.label.value_counts()

economics        9308
tech             9293
business         9258
finance          9167
beauty           9130
entertainment    9028
food             8876
sports           8876
politics         8719
science          8582
travel           8107
world            6211
health           5557
environment      5121
Name: label, dtype: int64

In [7]:
def check_for_noise(df):
    # create a copy
    df = df.copy()
    df = df.fillna(value='', axis=0)
    # create a set for adding all the useless symbols(noise) 
    symbol = set()
    # parse through the data to extract all the unnecessary symbols.
    for col in list(df.columns):
        ### The regular expression states to find all the letters other than words and spaces.
        noise = df[col].str.findall(r'([^\w\s\d]+)')
        for val in noise:
            for sy in val:
                symbol.add(sy)
    # convert back to list.
    symbol = list(symbol)
    return symbol


noises = check_for_noise(df)
print(len(noises))
print(noises[:100])


2361
["ി'", '´.', ',\'"…', '###', ',-', '":"\\/\\/', ';-)', "***':", ')[', '€.', ',†,', '}$', '/€', "，'", ')\u2060', 'ۗ', '\u200c,', '💄', '💗', '‘£', '👻', '.¨', '%…"', "!.'", '….', '@.', '={"', '}},"', 'ٰ', '️', ')—', '🖥️', '∂', '👏🏻👏🏻👏🏻👏🏻👏🏻', '💙🙏🏼', '€(', '💪#', '(€)', '\u200b\u200b-', ',(', 'ി', '."—', '）、', '😘', "—'", '"£', "…,''", '"‘', '.–', '–$', '>@', '£/€', 'ൊ', "!—'", "˜'*°•.", '®(', '?!!!!', '®),', '👊#', '.\x92', '!),"', 'ैं.', '🍯', '!""', '📞', '),"', "']/", '°,', '{{#', '🇮🇹', "\\'", ']?"', ']".', '":"//', "'..", '❤️🏴\U000e0067\U000e0062\U000e0077\U000e006c\U000e0073\U000e007f@', '🇮🇹🇮🇹🇮🇹', '🙌(', '💡📸', '+}(', ':™', '🔴#', '(!!!!)', '().', '----------', '🇺🇦', '💌', '🇹🇷', ":@?'", '。', '.)-', '...............................................................................', '🤣🤣🤣🤣', '✅', '@?=:?', '۔@', '…….', '.(', '➡', "+'\\\\'+"]


In [8]:
def preprocess_inputs(df):
    
    # Make a copy of dataset for preprocessing.
    df = df.copy()

    # fill all empty values with blank.
    df = df.fillna(value='', axis=0)

    # clean the data from useless noises
    df['excerpt'] = df['excerpt'].str.replace(r'([^A-Za-z\s]+)', ' ')
    df['summary'] = df['summary'].str.replace(r'([^A-Za-z\s]+)', ' ')
    df['title'] = df['title'].str.replace(r'([^A-Za-z\s]+)', ' ')

    # remove stopwords from the columns
    f = lambda x: ' '.join([item.lower() for item in x.split(' ') if len(item) >1 and item.lower() not in stop_words])
    stop_words = set(stopwords.words('english'))
    df['title'] =   df['title'].apply(f)
    df['excerpt'] = df['excerpt'].apply(f)
    df['summary'] = df['summary'].apply(f)
    
    return df


In [9]:
X = preprocess_inputs(df)

In [13]:
X.head()

Unnamed: 0,title,excerpt,summary,label
0,veteran broadcaster dave allen joins par mar s...,longtime tri state broadcaster media personali...,press releases posted last hours last days mar...,economics
1,high performance computing center stuttgart ce...,stuttgart germany oct founded germany first na...,stuttgart germany oct founded germany first na...,environment
2,hopes new tech could finally find mh,new technology could finally solve mystery mis...,world biggest aviation mystery experts hope te...,travel
3,married first sight alexis economou looks cosy...,mafs uk star showed toned figure casual black ...,published edt october updated edt october anno...,entertainment
4,syncthink eye tracking vr receives second fda ...,eye sync uses eye tracking evaluate brain func...,eye sync virtual reality system neurotechnolog...,economics
