In [197]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

from nltk.stem import PorterStemmer
ps = PorterStemmer()

from nltk.tokenize import word_tokenize
from googletrans import Translator

import re
import time
import string
from urllib.parse import unquote_plus

In [2]:
# Import dataset
df = pd.read_excel('News Title.xls', index_col='No')

# Preview dataset
df.head()

Unnamed: 0_level_0,News Title,Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology
2,Dov Charney's Redeeming Quality,Business
3,White God adds Un Certain Regard to the Palm Dog,Entertainment
4,"Google shows off Androids for wearables, cars,...",Technology
5,China May new bank loans at 870.8 bln yuan,Business


In [3]:
# Check the distribution of target label (Category)
df['Category'].value_counts()

Entertainment    23961
Business         17707
Technology       16776
Medical           7091
Name: Category, dtype: int64

In [4]:
# Copy the original data into another variable to be processed
processed_df = df.copy()

# Preview processed dataframe
processed_df.head()

Unnamed: 0_level_0,News Title,Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology
2,Dov Charney's Redeeming Quality,Business
3,White God adds Un Certain Regard to the Palm Dog,Entertainment
4,"Google shows off Androids for wearables, cars,...",Technology
5,China May new bank loans at 870.8 bln yuan,Business


In [5]:
# Transform categorical label into numerical for analytics
label_encoder = LabelEncoder()
processed_df['num_Category'] = label_encoder.fit_transform(df['Category'])

# All categories which are encoded by label_encoder
label_encoder.classes_

array(['Business', 'Entertainment', 'Medical', 'Technology'], dtype=object)

In [6]:
# Preview transformed category in dataframe
processed_df.head()

Unnamed: 0_level_0,News Title,Category,num_Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology,3
2,Dov Charney's Redeeming Quality,Business,0
3,White God adds Un Certain Regard to the Palm Dog,Entertainment,1
4,"Google shows off Androids for wearables, cars,...",Technology,3
5,China May new bank loans at 870.8 bln yuan,Business,0


In [7]:
# Describe News Title
processed_df['News Title'].describe()

count                                                 65535
unique                                                64981
top       The article requested cannot be found! Please ...
freq                                                     21
Name: News Title, dtype: object

In [8]:
# Get 10 most frequent data based on 'News Title'
processed_df['News Title'].value_counts().head(10)

The article requested cannot be found! Please refresh your browser or go back  ...    21
Posted by Shoaib-ur-Rehman Siddiqui                                                   11
Posted by Imaduddin                                                                   11
Business Highlights                                                                   10
Posted by Parvez Jabri                                                                10
Business Wire                                                                          7
What you need to know before markets open                                              7
(click the phrases to see a list)                                                      7
Posted by Muhammad Iqbal                                                               6
PR Newswire                                                                            6
Name: News Title, dtype: int64

Based on the preview on 10 most frequent data, we can see that it is a lot of 'odd' titles which do not make any sense to categorize it into any of the category semantically.

In [9]:
# Look up the dataset with the most frequent 'News Title'
news_title_mode = processed_df['News Title'].mode()[0]
processed_df[processed_df['News Title'] == news_title_mode]

Unnamed: 0_level_0,News Title,Category,num_Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
990,The article requested cannot be found! Please ...,Entertainment,1
4321,The article requested cannot be found! Please ...,Entertainment,1
4426,The article requested cannot be found! Please ...,Entertainment,1
5100,The article requested cannot be found! Please ...,Entertainment,1
5250,The article requested cannot be found! Please ...,Entertainment,1
9012,The article requested cannot be found! Please ...,Entertainment,1
12376,The article requested cannot be found! Please ...,Entertainment,1
13256,The article requested cannot be found! Please ...,Entertainment,1
15280,The article requested cannot be found! Please ...,Entertainment,1
17183,The article requested cannot be found! Please ...,Entertainment,1


In [10]:
# Remove the data as it will be a noise for building our model
processed_df = processed_df[processed_df['News Title'] != news_title_mode]
processed_df['News Title'].value_counts().head(10)

Posted by Imaduddin                          11
Posted by Shoaib-ur-Rehman Siddiqui          11
Business Highlights                          10
Posted by Parvez Jabri                       10
(click the phrases to see a list)             7
What you need to know before markets open     7
Business Wire                                 7
10 Things to Know for Today                   6
Posted by Muhammad Iqbal                      6
PR Newswire                                   6
Name: News Title, dtype: int64

Back on to the most frequent 'News Title', we can see that there are still several 'title' with similarities and do not correlate to any of the categories. 

In [11]:
# Look up the dataset which have 'News Title' starts with 'Posted by'
processed_df[processed_df['News Title'].str.startswith('Posted')].sort_values(by='News Title')

Unnamed: 0_level_0,News Title,Category,num_Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
45334,Posted by Abdul Ahad,Business,0
15057,Posted by Imaduddin,Business,0
29925,Posted by Imaduddin,Business,0
41840,Posted by Imaduddin,Business,0
19464,Posted by Imaduddin,Business,0
18987,Posted by Imaduddin,Business,0
14872,Posted by Imaduddin,Business,0
41446,Posted by Imaduddin,Business,0
56593,Posted by Imaduddin,Business,0
8346,Posted by Imaduddin,Entertainment,1


In [12]:
# Remove those non relevant data
processed_df = processed_df[~processed_df['News Title'].str.startswith('Posted by')]

# Preview the remaining duplicate 'News Title' with the frequencies
news_title_value_counts = processed_df['News Title'].value_counts()
duplicate_news_title_counts = news_title_value_counts[news_title_value_counts > 1]
duplicate_news_title_counts

Business Highlights                                                                   10
What you need to know before markets open                                              7
Business Wire                                                                          7
(click the phrases to see a list)                                                      7
PR Newswire                                                                            6
10 Things to Know for Today                                                            6
Business briefs                                                                        5
The Daily Dish                                                                         5
Change text size for the story                                                         5
India Morning Call-Global Markets                                                      5
Today's Dow Jones Industrial Average DJIA, Nasdaq, S&P 500 stock market  ...           4
Sara Stewart         

In [13]:
# Expand all the duplicated 'News Title'
duplicated_news_title_df = processed_df[processed_df['News Title'].isin(duplicate_news_title_counts.keys())].sort_values(by='News Title')
duplicated_news_title_df

Unnamed: 0_level_0,News Title,Category,num_Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52578,'22 Jump Street' beats 'Dragon 2' at weekend b...,Entertainment,1
61114,'22 Jump Street' beats 'Dragon 2' at weekend b...,Entertainment,1
40766,'22 Jump Street': The verdict is in,Entertainment,1
31235,'22 Jump Street': The verdict is in,Entertainment,1
24921,'Candy Crush Saga' maker nears IPO that could ...,Business,0
46378,'Candy Crush Saga' maker nears IPO that could ...,Business,0
56684,'Chicken from hell' sheds new light on bird-li...,Technology,3
53278,'Chicken from hell' sheds new light on bird-li...,Technology,3
41554,'Daily Mail' apologizes to George Clooney,Entertainment,1
48582,'Daily Mail' apologizes to George Clooney,Entertainment,1


In [14]:
# Grouping all the duplicated 'News Title' and look up to all the value of the 'Category'
duplicated_news_title_with_category_df = duplicated_news_title_df.groupby('News Title')['Category'].apply(lambda x: x.unique()).reset_index()
duplicated_news_title_with_category_df

Unnamed: 0,News Title,Category
0,'22 Jump Street' beats 'Dragon 2' at weekend b...,[Entertainment]
1,'22 Jump Street': The verdict is in,[Entertainment]
2,'Candy Crush Saga' maker nears IPO that could ...,[Business]
3,'Chicken from hell' sheds new light on bird-li...,[Technology]
4,'Daily Mail' apologizes to George Clooney,[Entertainment]
5,'Game Of Thrones' Actor Is Retiring After Sund...,[Entertainment]
6,'Goonies' director: We're doing a sequel,[Entertainment]
7,'Guardians' blasts Marvel in a different direc...,[Entertainment]
8,'How I Met Your Mother' finale: Here's hoping ...,[Entertainment]
9,'Justice' is served with another helping of Su...,[Entertainment]


In [15]:
# Check if there is any duplicated 'News Title' have multiple 'Category' -> indicating label inconsistency / ambiguity
duplicated_news_title_with_multiple_category_df = duplicated_news_title_with_category_df[duplicated_news_title_with_category_df['Category'].str.len() > 1]
duplicated_news_title_with_multiple_category_df

Unnamed: 0,News Title,Category
22,(click the phrases to see a list),"[Entertainment, Business, Medical, Technology]"
24,10 Things to Know for Today,"[Entertainment, Business, Medical]"
28,5 things you need to know Monday,"[Technology, Medical]"
31,AP News in Brief at 5:58 am EDT,"[Business, Medical]"
37,Amazon escalates standoff with publisher Hachette,"[Entertainment, Business]"
39,Amazon snares classic shows in deal with HBO,"[Business, Entertainment]"
62,Breaking news,"[Business, Technology]"
64,Briefcase,"[Business, Technology]"
67,Business Highlights,"[Technology, Business]"
69,Business Wire,"[Technology, Business]"


In [16]:
# Remove those duplicated news title which have multiple categories (assuming the case for single label classification)
processed_df = processed_df[~processed_df['News Title'].isin(duplicated_news_title_with_multiple_category_df['News Title'])]
processed_df.sort_values(by='News Title')

Unnamed: 0_level_0,News Title,Category,num_Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
58784,#AmazonBasket offers UK customers the chance t...,Business,0
64931,#AskJeffTech | Windows XP questions answered,Technology,3
64304,#AskThicke: Robin Thicke Trend Gets Trolled BI...,Entertainment,1
43001,#BBCtrending: Texas teen in Facebook safari sh...,Entertainment,1
49407,#CancelColbert Trending on Twitter After Comed...,Entertainment,1
3022,#CancelColbert trending after 'Colbert Report'...,Entertainment,1
62255,#CancelColbert: Stephen Colbert under fire ove...,Entertainment,1
29870,#CancelColbert: Stephen Colbert's Contextless ...,Entertainment,1
6999,#CancelColbert? Beyond Dichotomies,Entertainment,1
9390,#CancelColbert? He's game,Entertainment,1


In [190]:
def clean_and_tokenize(sentence):
    # Convert to lowercase and split the sentence into words
    words = sentence.lower().split()
    
    # Complete the punctuations needed to be removed from words
    punctuations = string.punctuation + '“”'
    
    # Remove stopwords from words and any letter in word contains punctuations
    words = [w.translate(w.maketrans('', '', punctuations)) for w in words if (not w in stops)]
    
    # Check for stopwords again, get only alphaberts and convert into stem words
    words = [ps.stem(w) for w in words if (not w in stops) and w.isalpha()]
    
    return words

In [191]:
clean_and_tokenize('“Young blood” reverses aging in older mice')

['young', 'blood', 'revers', 'age', 'older', 'mice']

In [194]:
processed_df['tokenized_Title'] = processed_df['News Title'].apply(lambda title: clean_and_tokenize(title))
processed_df.sort_values(by='News Title')

Unnamed: 0_level_0,News Title,Category,num_Category,tokenized_Title
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
58784,#AmazonBasket offers UK customers the chance t...,Business,0,"[amazonbasket, offer, uk, custom, chanc, shop,..."
64931,#AskJeffTech | Windows XP questions answered,Technology,3,"[askjefftech, window, xp, question, answer]"
64304,#AskThicke: Robin Thicke Trend Gets Trolled BI...,Entertainment,1,"[askthick, robin, thick, trend, get, troll, bi..."
43001,#BBCtrending: Texas teen in Facebook safari sh...,Entertainment,1,"[bbctrend, texa, teen, facebook, safari, showd..."
49407,#CancelColbert Trending on Twitter After Comed...,Entertainment,1,"[cancelcolbert, trend, twitter, comedi, centra..."
3022,#CancelColbert trending after 'Colbert Report'...,Entertainment,1,"[cancelcolbert, trend, colbert, report, tweet,..."
62255,#CancelColbert: Stephen Colbert under fire ove...,Entertainment,1,"[cancelcolbert, stephen, colbert, fire, racist..."
29870,#CancelColbert: Stephen Colbert's Contextless ...,Entertainment,1,"[cancelcolbert, stephen, colbert, contextless,..."
6999,#CancelColbert? Beyond Dichotomies,Entertainment,1,"[cancelcolbert, beyond, dichotomi]"
9390,#CancelColbert? He's game,Entertainment,1,"[cancelcolbert, he, game]"


In [195]:
processed_df.sort_values(by='tokenized_Title')

Unnamed: 0_level_0,News Title,Category,num_Category,tokenized_Title
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19906,We are OVER IT!,Entertainment,1,[]
26655,Predicting+the+Palme+d%27Or+a+guessing+game,Entertainment,1,[]
23315,3/29 over coffee::1,Medical,2,[]
27477,Study%3A+US+Alzheimer%27s+rate+seems+to+be+dro...,Medical,2,[]
47799,Heads+up%21+Supermoon+is+here,Technology,3,[]
37340,AA pulls fare listings off of Orbitz sites,Business,0,"[aa, pull, fare, list, orbitz, site]"
35296,AAA: Americans ready to hit road after rough w...,Business,0,"[aaa, american, readi, hit, road, rough, winter]"
34391,AAA: Average gas price in Rhode Island up 4 ce...,Technology,3,"[aaa, averag, ga, price, rhode, island, cent, ..."
14702,AAA estimates 1.6 percent rise in statewide ho...,Business,0,"[aaa, estim, percent, rise, statewid, holiday,..."
10223,AAA expects more long-distance NJ travelers wi...,Technology,3,"[aaa, expect, longdist, nj, travel, hit, road,..."


In [None]:
# def translate(word):
#     translator = Translator()
#     return translator.translate(word)

In [None]:
# translate('복사 URL 줄이기 레이어 닫기').text

In [None]:
# translated_news_title = []
# n_processed_df = len(processed_df)
# n_batch = 1000
# for i in range(0, n_processed_df, n_batch):
#     print('Processing data: ', i+1, '-', i+n_batch)
#     batched_translated_news_title = translate(list(processed_df['News Title'])[i:i+n_batch])
#     translated_news_title = translated_news_title + [news_title.text for news_title in batched_translated_news_title]
#     print('Finish processing data: ', i+1, '-', i+n_batch)
#     time.sleep(5)
    
# print(translated_news_title)

In [None]:
# from langdetect import detect
# qqq = processed_df['News Title'].tolist()
# asdf = [detect(x) for x in qqq]
# # asdf