# DSL Project - Category News Detection

In [1]:
import pandas as pd
import numpy as np

In [None]:
news_db = pd.read_csv("/Users/riccardocervero/Desktop/news.csv")
news = news_db[news_db.section.notna()]
news = news[news.title.notna()]
news = news[news.content.notna()]
news=news.drop(['year','month','url'],axis=1)

In [7]:
news = news.reset_index()
news = news[['date','title','content','author','publication','section','category','digital']]
news.head(1)

Unnamed: 0,date,title,content,author,publication,section,category,digital
0,2015-12-31,Fire tears through Dubai luxury hotel near pla...,"[Sign in to comment!, More than a dozen people...",,Fox News,world,broadcast,3.0


In [9]:
for i in news.publication.unique():
    print(i,"has an occurrence of",(len(news[news.publication==i])/len(news))*100,"%")

Fox News has an occurrence of 6.136284053505148 %
Talking Points Memo has an occurrence of 7.454303522655648 %
Buzzfeed News has an occurrence of 14.985643252328595 %
National Review has an occurrence of 8.765319700259122 %
New York Post has an occurrence of 23.924644582953988 %
Guardian has an occurrence of 12.161916100567266 %
NPR has an occurrence of 10.908326913649415 %
Los Angeles Times has an occurrence of 0.04342040759156804 %
Washington Post has an occurrence of 15.620141466489251 %


In [14]:
for i in news.category.unique():
    print(i,"has an occurrence of",(len(news[news.category==i])/len(news))*100,"%")

broadcast has an occurrence of 6.136284053505148 %
general has an occurrence of 22.439946774984243 %
nan has an occurrence of 0.0 %
newspaper has an occurrence of 27.825477974648084 %
radio has an occurrence of 10.908326913649415 %


In [15]:
news.groupby(['publication','category'])['publication','category'].size()

publication          category 
Buzzfeed News        general      10699
Fox News             broadcast     4381
Guardian             newspaper     8683
Los Angeles Times    newspaper       31
NPR                  radio         7788
Talking Points Memo  general       5322
Washington Post      newspaper    11152
dtype: int64

As we can see, every 'publication' has a specific channel to broadcast its contents through.

In [21]:
news.groupby('section').size().head(10)

section
13.7                   214
2014                     2
2015                    39
2016                  8569
2017                  8472
ablogsupreme             1
adambvary               15
adamserwer              18
adolfoflores            83
adriancarrasquillo      78
dtype: int64

There could be a problem on some rows:

In [24]:
news[news.section=='2015'].head(1)

Unnamed: 0,date,title,content,author,publication,section,category,digital
35580,2015-12-28,Demand for these tiny NYC apartments is off th...,AP Photo AP Photo AP Photo AP Photo AP Photo V...,Lindsay Putnam,New York Post,2015,,1.0


In [43]:
pd.Series(news[news.section == '2014'] ['publication']).unique()

array(['New York Post'], dtype=object)

In [44]:
pd.Series(news[news.section == '2015'] ['publication']).unique()

array(['New York Post'], dtype=object)

In [45]:
pd.Series(news[news.section == '2016'] ['publication']).unique()

array(['Fox News', 'New York Post'], dtype=object)

In [50]:
len(news[(news.section == '2016') & (news.publication == 'Fox News')])

1

In [46]:
pd.Series(news[news.section == '2017'] ['publication']).unique()

array(['New York Post'], dtype=object)

In conclusion, the section named as a year belong to New York Post, except one observation belonging to Fox Sport. Looking at content and title of these observation, they seem to belong to a unspecified "news" category:

In [61]:
news[news.section=='2014']['title'].loc[27456]

'Hotel hermit got $17M to make way for 15 Central Park\xa0West'

In [65]:
news[news.section=='2015']['title'].loc[35796]

'Pregnant CNN anchor faints live on\xa0air'

In [67]:
news[news.section=='2016']['title'].loc[26660]

'Gunman dressed as Santa Claus kills dozens at Istanbul\xa0nightclub'

In [68]:
news[(news.section=='2016')&(news.publication=='Fox News')]['title']

2463    O'Reilly: A Lesson for Donald Trump
Name: title, dtype: object

In [71]:
news[news.section=='2017']['title'].loc[26731]

'Riders say Second Avenue Subway already has\xa0issues'

Another problem could be with '13.7':

In [73]:
pd.Series(news[news.section == '13.7'] ['publication']).unique()

array(['NPR'], dtype=object)

In [78]:
news[news.section == '13.7']['title']

52448    The Stories NPR One Listeners Loved The Most I...
52460    Adopting A Sci-Fi Way Of Thinking About The Fu...
52483             Video: Swooping Starlings In Murmuration
52563     Personal-Productivity Efforts Are Doomed To Fail
52610                                How Smart Are Horses?
52624                   Philosophy In The Octopus's Garden
52681    Why Hell Can't Freeze Over: Quantum Physics An...
52748     Fact Check: Science And The Trump Administration
52828    The Science Of Gender: No, Men Aren't From Mar...
52864    Can We Learn Something About Jet Lag From MLB ...
52884                          What Makes Science Science?
52899                Taking Stock In The Search For Aliens
52921                       Reality Is Not What We Can See
52938         A New View Into The Primate Birthing Process
52961     Why Do Many Think Human Blood Is Sometimes Blue?
52992     3 Things You Might Not Know About Charles Darwin
53036    Math To The Stars: The Ladies Of 'Hidden Figure

This section almost always talks about science, thus we create the class 'science':

In [79]:
news.section = news.section.replace('13.7','Science')

Now we can search another section talking about 'Science' with regular expression:

In [98]:
import re
science_list = []
for x in news.section.unique(): 
    if re.findall("(?i).*(scien).*",x):
        science_list.append(x)
science_list = [e for e in science_list if e not in ['Science']]

In [99]:
science_list

['science']

In [100]:
news.section = news.section.replace('Science','science')

Then, it is useful to aggregate science class with all articles talking about medicine and technologies.

In [212]:
th = []
for x in news.section.unique(): 
    if re.findall("(?i).*(tech).*",x):
        th.append(x)
th.append('science')

In [213]:
for i in th:
    news.section = news.section.replace(i,"science & tech")

Since most of section are names of an author, thus, to aggregate, we decide to visually select only the topics that do not seem a name:

In [214]:
topics = ['world', 'politics', 'us', 'markets', 'opinion', 'entertainment',
       'sports', 'science & tech', 'health', 'travel', 'tech', 'leisure',
       '2016', 'auto', 'transcript', 'weather', 'food-drink',
       'livewire', 'news', 'edblog', 'world-news', 'cafe', 'dc',
       'muckraker', 'fivepoints', 'polltracker', 'this-chart', 'skarlan',
          'mbvd','sapna', 'passantino','article',
       '2017', '2014', '2015', 'commentisfree', 'sport', 'us-news',
       'artanddesign', 'football', 'lifeandstyle', 'environment', 'film',
       'technology', 'books', 'culture', 'global', 'uk-news',
       'society', 'music', 'business', 'money',
       'how-to-solve-a-murder-a-detectives-dilemma', 'tv-and-radio',
       'cities', 'info', 'inequality', 'personal-investments', 'fashion',
       'australia-news', 'global-development', 'media', 'education',
       'stage', 'canneslions', 'sustainable-business',
       'hawaii-for-foodies', 'law', 'theguardian', 'membership',
       'global-development-professionals-network', 'health-revolution',
       'defining-moment', 'progress-personified',
       'flight-centre-journeys', 'sustainable-connections',
       'media-network', 'theobserver', 'thetwo-way',
       'health-shots', 'ed', 'thesalt', 'alltechconsidered',
       'goatsandsoda', 'deceptivecadence', 'monkeysee', 'allsongs',
       '13.7', 'parallels', 'codeswitch', 'altlatino', 
       'world-cafe', 'therecord', 'itsallpolitics', 'npr-history-dept',
       'thetorch', 'ablogsupreme', 'nation',
       'local', 'food', 'opinions', 'national', 'lifestyle', 'blogs',
       'posteverything', 'cars', 'powerpost', 'goingoutguide',
       'investigations', 'outlook', 'express', 'realestate']

We can aggregate by:

LAW & POLITICS, articles and comments about US or foreign politics, and law.

In [161]:
l_p = ['politics','opinion','livewire','article','commentisfree','edblog','cafe', 'dc',
       'muckraker', 'fivepoints', 'polltracker', 'this-chart','transcript',
       'itsallpolitics','opinions','inequality','canneslions','nation','blogs',
       'law','posteverything','membership','outlook','powerpost','theobserver','codeswitch']

In [162]:
for i in l_p:
    news.section = news.section.replace(i,"law & politics")

NEWS, every article or comment giving informations about a contemporary event, and about weather.

In [216]:
import re
news_list = []
for x in news.section.unique(): 
    if re.findall("(?i).*(new).*",x):
        news_list.append(x)
news_list = [e for e in news_list if e not in ['hazelnewlevant']]

In [217]:
news_list.extend(['passantino','mbvd','national','local','thetwo-way','parallels','world','us','weather','2016','2017', '2014', '2015'])

In [218]:
news_list

['news',
 'world-news',
 'buzzfeednews',
 'us-news',
 'uk-news',
 'australia-news',
 'passantino',
 'mbvd',
 'national',
 'local',
 'thetwo-way',
 'parallels',
 'world',
 'us',
 'weather',
 '2016',
 '2017',
 '2014',
 '2015']

In [219]:
for i in news_list:
    news.section = news.section.replace(i,"news")

FOOD

In [220]:
import re
food_list = []
for x in news.section.unique(): 
    if re.findall("(?i).*(food).*",x):
        food_list.append(x)

In [221]:
food_list.extend(['thesalt','express'])
food_list

['food-drink', 'hawaii-for-foodies', 'food', 'thesalt', 'express']

In [222]:
for i in food_list:
    news.section = news.section.replace(i,"food")

ECONOMY, BUSINESS & JOB: every comments referred to the economic world and job sector.

In [223]:
import re
e_b = []
for x in news.section.unique(): 
    if re.findall("(?i).*(econom|business).*",x):
        e_b.append(x)

In [224]:
e_b.extend(['sapna','global-development','sustainable-connections','realestate','global-development-professionals-network','personal-investments','money','markets','leisure'])
e_b

['business',
 'sustainable-business',
 'sapna',
 'global-development',
 'sustainable-connections',
 'realestate',
 'global-development-professionals-network',
 'personal-investments',
 'money',
 'markets',
 'leisure']

In [225]:
for i in e_b:
    news.section = news.section.replace(i,"economy, business & jobs")

HEALTH & LIFESTYLE: every comment referred to health, lifestyle and environment.

In [226]:
h = []
for x in news.section.unique(): 
    if re.findall("(?i).*(healt|life|envir).*",x):
        h.append(x)
h.append('goatsandsoda')
h

['health',
 'lifeandstyle',
 'environment',
 'health-revolution',
 'health-shots',
 'lifestyle',
 'goatsandsoda']

In [227]:
for i in h:
    news.section = news.section.replace(i,"health & lifestyle")

SPORTS: everything about sports and cars

In [228]:
sport = []
for x in news.section.unique(): 
    if re.findall("(?i).*(sport|auto).*",x):
        sport.append(x)
sport.extend(['football','cars','thetorch'])
sport

['sports', 'auto', 'sport', 'football', 'cars', 'thetorch']

In [229]:
for i in sport:
    news.section = news.section.replace(i,"sports")

TRAVEL

In [230]:
travel = []
for x in news.section.unique(): 
    if re.findall("(?i).*(trav|journey|cit).*",x):
        travel.append(x)
travel

['travel', 'cities', 'flight-centre-journeys']

In [231]:
for i in travel:
    news.section = news.section.replace(i,"travel")

ART & FASHION about art, design, fashion.

In [232]:
af = []
for x in news.section.unique(): 
    if re.findall("(?i).*(fash|design).*",x):
        af.append(x)
af

['artanddesign', 'fashion']

In [233]:
for i in af:
    news.section = news.section.replace(i,"art & fashion")

UNIVERSITY & SCHOOL

In [234]:
ed = ['education','ed']
for i in ed:
    news.section = news.section.replace(i,"university & school")

CRIME, about investigations and reports.

In [235]:
inv = ['how-to-solve-a-murder-a-detectives-dilemma','investigations']
for i in inv:
    news.section = news.section.replace(i,"crime")

ENTERTAINMENT, about music, movies, tv, radio.

In [236]:
ent = ['entertainment','film','global','music','ablogsupreme','therecord','altlatino',
       'allsongs','goingoutguide','monkeysee','world-cafe','stage','deceptivecadence',
       'tv-and-radio','tellshow']
for i in ent:
    news.section = news.section.replace(i,"enterteitment")

In [259]:
news.section = news.section.replace("enterteitment",'entertainment')

LGBT, comment about LGBT community.

In [237]:
for i in ['hazelnewlevant','skarlan']:
    news.section = news.section.replace(i,"LGBT")

BOOKS: 'books'

CULTURE: 

In [238]:
culture = ['culture', 'society','info','defining-moment','npr-history-dept','progress-personified','ikrd']
for i in culture:
    news.section = news.section.replace(i,"culture")

MEDIA, about mass media and social network.

In [239]:
media = ['media','theguardian','media-network']
for i in media:
    news.section = news.section.replace(i,"media")

In [260]:
cat = ['news', 'law & politics', 'economy, business & jobs',
       'entertainment', 'sports', 'science & tech', 'health & lifestyle',
       'travel','food', 'LGBT', 'books', "media", "culture",'university & school','crime','art & fashion']

In [261]:
not_cat = [e for e in news.section.unique() if e not in cat]

In [262]:
for i in not_cat:
    news.section = news.section.replace(i,np.nan)

In [263]:
news.section.unique()

array(['news', 'law & politics', 'economy, business & jobs',
       'entertainment', 'sports', 'science & tech', 'health & lifestyle',
       'travel', nan, 'food', 'LGBT', 'culture', 'art & fashion', 'books',
       'crime', 'media', 'university & school'], dtype=object)

In [300]:
n = news[news.section.notna()]

In [301]:
len(n)

61211

In [302]:
n=n[['date','title','content','author','publication','section','category','digital']]

Now, we have to remove duplicates:

In [303]:
n.drop_duplicates(keep='first',inplace=True)

In [304]:
len(n)

61211

Thus, there are no rows equal in all the columns, but, maybe, only in "content".

In [305]:
n.drop_duplicates(subset='content',keep='first',inplace=True)

In [306]:
len(n)
print("There were",61211-len(n),"identical articles in the database.")

There were 556 identical articles in the database.


Then, clean the text:

In [326]:
n1 = n
n1 = n1[n1.title.notna()]
n1 = n1[n1.content.notna()]

1) Convert text to lowercase:

In [329]:
def lowerization(x):
    return x.lower()

In [331]:
n1.title = n1.title.apply(lowerization)

In [334]:
n1.content = n1.content.apply(lowerization)

2) Remove numbers:

In [338]:
import re
def rem_num(x):
    return re.sub(r'\d+', '', x)

In [340]:
n1.title = n1.title.apply(rem_num)
n1.content = n1.content.apply(rem_num)

3) Remove escape and special characters and punctuation:

In [378]:
def rem(x):
    return re.sub(re.compile('[\W_]+', re.UNICODE)," ",x)

In [381]:
n1.title = n1.title.apply(rem)
n1.content = n1.content.apply(rem)

4) Remove stop words

In [397]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
stop_words = set(ENGLISH_STOP_WORDS)

In [406]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/riccardocervero/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [408]:
from nltk.tokenize import word_tokenize

In [423]:
def rem_stop(x):
    tokens = word_tokenize(x)
    result = [i for i in tokens if not i in stop_words]
    return re.sub(re.compile('[\W_]+', re.UNICODE)," ",str(result)).strip()

In [425]:
n1.title = n1.title.apply(rem_stop)
n1.content = n1.content.apply(rem_stop)

5) Lemmatization

In [8]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])
def lem(x):
    doc = nlp(x)
    return " ".join([token.lemma_ for token in doc]).replace("-PRON- ","")

In [22]:
n1.title = n1.title.apply(lem)
n1.content = n1.content.apply(lem)

6) Removing "Sign-comment" from content: it has been noticed that many contents from Fox News starts with the sentence "Sign in to comment!".

In [14]:
n2 = n1

In [65]:
import re
def rem_sc(x):
    if re.findall("(?i).*(^sign comment).*",x):
        return x.replace("sign comment","").strip()

In [86]:
n2.content[n2.publication=='Fox News'] = n2.content[n2.publication=='Fox News'].apply(rem_sc)

In [87]:
n2.head()

Unnamed: 0,date,title,content,author,publication,section,category,digital
0,2015-12-31,tear dubai luxury hotel near plan firework,dozen people hurt massive broke luxury hotel d...,,Fox News,news,broadcast,3.0
1,2015-12-31,carson campaign staffer resign,republican presidential candidate ben carson s...,,Fox News,law & politics,broadcast,3.0
2,2015-12-31,democrat martin o malley fail qualify ohio s p...,democratic presidential candidate martin o mal...,,Fox News,law & politics,broadcast,3.0
3,2015-12-30,brawl break gop race trump tier,nasty battle break republican presidential fie...,,Fox News,law & politics,broadcast,3.0
4,2015-12-30,donald trump blast clinton great abuser world,donald trump launch new attack hillary clinton...,Christopher Snyder,Fox News,law & politics,broadcast,3.0


In [97]:
n2.to_csv("/Users/riccardocervero/Desktop/news_preprocessed.csv")

A final descriptive analysis:

In [91]:
n2.groupby('section').size().sort_values(ascending=False)

section
news                        32684
law & politics              18293
health & lifestyle           2806
entertainment                1843
sports                       1470
science & tech               1210
food                          681
economy, business & jobs      413
university & school           345
culture                       228
books                         190
art & fashion                 175
travel                         97
media                          83
LGBT                           48
crime                          39
dtype: int64

In [92]:
n2.groupby(['publication','section']).size()

publication          section                 
Buzzfeed News        LGBT                           48
                     culture                         4
                     economy, business & jobs       15
                     entertainment                   1
                     news                          154
Fox News             economy, business & jobs       42
                     entertainment                 168
                     food                            1
                     health & lifestyle             85
                     law & politics               2372
                     news                         1384
                     science & tech                177
                     sports                        104
                     travel                         20
Guardian             art & fashion                 175
                     books                         190
                     crime                           2
                   

Sampling with same frequency for section and possibly for publication.

In [None]:
n4 = n3

In [22]:
n_s = pd.DataFrame() 
for i in n4.section.unique():
    for j in n4.publication.unique():
        n_s=n_s.append(n4[(n4.section == i)&(n4.publication == j)].sample(round(len(n4[(n4.section == i)&(n4.publication == j)])*0.3))) 

In [29]:
n_s = n_s.reset_index()
n_s = n_s[['date','title','content','author','publication','section','category','digital']]
n_s.head()

Unnamed: 0,date,title,content,author,publication,section,category,digital
0,2016-03-29,video coast guard s million cocaine bust panam...,northern california u s coast guard crew seize...,,Fox News,news,broadcast,3.0
1,2016-07-01,escape isis yazidi woman recount life sex slave,isis lay siege yazidi village dot iraq s mount...,Hollie McKay,Fox News,news,broadcast,3.0
2,2016-06-17,gay friendly beach town bar cautious wake orla...,official east coast beach town popular lgbt co...,,Fox News,news,broadcast,3.0
3,2016-06-18,dc school teach obama daughter ban redskin clo...,prestigious high school teach president obama ...,,Fox News,news,broadcast,3.0
4,2016-03-22,american wound belgium attack official say,official reveal detail tuesday american wound ...,,Fox News,news,broadcast,3.0


In [30]:
n_s.to_csv("/Users/riccardocervero/Desktop/news_sample.csv")