In [1]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from tqdm import tqdm

In [2]:
data = pd.read_csv("Articles.csv",encoding = 'ISO-8859-1')

In [3]:
data.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [4]:
data.shape

(2692, 4)

In [5]:
data['NewsType'].value_counts()

sports      1408
business    1284
Name: NewsType, dtype: int64

In [6]:
# here we are taking all the words present in "Article" section in one list named "all_words_in_Article".


all_words_in_Article  = []
for i in data['Article']:
    b=i.split(" ")
    all_words_in_Article +=b

In [7]:
# here we are converting all the non-alphabetic words or element present in "all_words_in_Article" into " "(space).

for i in range(len(all_words_in_Article)):
    if not all_words_in_Article[i].isalpha():
        all_words_in_Article[i]=""

In [8]:
# Here we are creating a counter list which will contain all the word with their number of repetitions.
counted_all_words_in_Article = Counter(all_words_in_Article)

In [9]:
len(counted_all_words_in_Article)

24159

In [10]:
# Here we are removing the " "(space) from the counter list.
del counted_all_words_in_Article[""]

In [11]:
len(counted_all_words_in_Article)

24158

In [12]:
# printing some entry in "counted_all_words_in_Article"
j = 0
for i in counted_all_words_in_Article:
    print(i," ->", counted_all_words_in_Article[i])
    j+=1
    if j==15:
        break

The  -> 1150
Sindh  -> 90
government  -> 720
has  -> 3370
decided  -> 198
to  -> 21635
bring  -> 110
down  -> 896
public  -> 181
transport  -> 38
fares  -> 12
by  -> 4446
per  -> 667
cent  -> 109
due  -> 416


In [13]:
# Here we are taking the most commonly applyed word i.e. all the words that has been used a minimum number of 3000
# in the Article section.
counted_all_words_in_Article = counted_all_words_in_Article.most_common(3000)

In [14]:
len(counted_all_words_in_Article)

3000

In [15]:
# Next comes the NLP(Natural Language Processing) part 
# Here we create a database with Columns name with all 3000 words as 3000 columns and wors as equal to the
# total number of articles present in our original database named "data".
# For that we will make a 2D numpy array and then convert them to the database.

In [16]:
features = []

for article in tqdm(data['Article']):
    blob = article.split(" ")
    
    data1 = []
    for i in counted_all_words_in_Article:
        data1.append(blob.count(i[0]))
    
    features.append(data1)


100%|██████████████████████████████████████████████████████████████████████████████| 2692/2692 [00:50<00:00, 53.58it/s]


In [17]:
len(features)


2692

In [18]:
# "features" contains a lisit of 2692 element which all contains 3000 elements.
features[0][2990:3000]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
len(features)

2692

In [20]:
data['Heading'].head()

0    sindh govt decides to cut public transport far...
1                      asia stocks up in new year trad
2             hong kong stocks open 0.66 percent lower
3               asian stocks sink euro near nine year 
4                   us oil prices slip below 50 a barr
Name: Heading, dtype: object

In [21]:
# Here we are taking all the words present in "Heading" section in one list named "all_words_in_Article".

all_words_in_heading  = []
for heading in data['Heading']:
    b=heading.split(" ")
    all_words_in_heading +=b

In [22]:
len(all_words_in_heading)

22126

In [23]:
# here we are converting all the non-alphabetic words or element present in "all_words_in_heading" into " "(space).

for i in range(len(all_words_in_heading)):
    if not all_words_in_heading[i].isalpha():
        all_words_in_heading[i]=""

In [24]:
counted_all_words_in_heading = Counter(all_words_in_heading)

In [25]:
# printing some entry in "counted_all_words_in_heading"

j = 0
for i in counted_all_words_in_heading:
    print(i," ->", counted_all_words_in_heading[i])
    j+=1
    if j==15:
        break

sindh  -> 8
govt  -> 15
decides  -> 14
to  -> 650
cut  -> 24
public  -> 3
transport  -> 3
fares  -> 3
by  -> 98
  -> 1238
kti  -> 1
rej  -> 1
asia  -> 24
stocks  -> 105
up  -> 98


In [26]:
# Here we are removing the " "(space) from the counter list.
del counted_all_words_in_heading[""]

In [27]:
counted_all_words_in_heading = counted_all_words_in_heading.most_common(1500)

In [28]:
len(counted_all_words_in_heading)

1500

In [29]:
# Next again comes the NLP(Natural Language Processing) part 
# Here we create a database with Columns name with all 1500 words as 1500 columns and wors as equal to the
# total number of articles present in our original database named "data".
# For that we will make a 2D numpy array and then convert them to the database.

In [30]:
features2 = []

for heading in tqdm(data['Heading']):
    blob = heading.split(" ")
    
    data2 = []
    for i in counted_all_words_in_heading:
        data2.append(blob.count(i[0]))
    
    features2.append(data2)


100%|████████████████████████████████████████████████████████████████████████████| 2692/2692 [00:01<00:00, 1445.20it/s]


In [31]:
len(features2)

2692

In [32]:
type(features2)

list

In [33]:
# Here we replace the entry in "NewsType" column as ('business'=0) and ('sports'=1).
data['NewsType'].replace({'business':0,'sports':1},inplace = True)

In [34]:
data.head(10)

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,0
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,0
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,0
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,0
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,0
5,New York: Oil prices tumbled Tuesday to fresh ...,1/7/2015,oil hits new 5.5 year lows as saudis defend,0
6,KARACHI: Strong bulls on Friday pulled the ben...,1/9/2015,bullish kse jumps over 33000 psychological bar...,0
7,"Singapore: Oil fell further in Asia Monday, wi...",1/12/2015,oil falls further in asian trad,0
8,KARACHI: Wholesale market rates for sugar drop...,1/13/2015,sugar prices drop to rs 49.80 in sind,0
9,SYDNEY: Oil prices fell 1 percent on Wednesday...,1/14/2015,oil extends losses as world bank cuts growth for,0


In [35]:
# Now we will add the to 2D lists with the help of hstack frature in numpy library.
article = np.array(features)
heading = np.array(features2)

In [36]:
article_and_heading_together = np.hstack((article,heading))

In [37]:
article_and_heading_together

array([[ 5,  4,  4, ...,  0,  0,  0],
       [24, 26, 26, ...,  0,  0,  0],
       [ 2,  1,  0, ...,  0,  0,  0],
       ...,
       [22,  4, 10, ...,  0,  0,  0],
       [27,  8, 11, ...,  0,  0,  0],
       [23, 12,  4, ...,  0,  0,  0]])

In [38]:
article_and_heading_together.shape

(2692, 4500)

#  And the deta is ready to be used by Machine Learning algorithmns.

In [39]:
x = article_and_heading_together

In [40]:
data.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,0
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,0
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,0
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,0
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,0


In [41]:
y = data.iloc[:,3].values

In [42]:
y.shape

(2692,)

In [43]:
x.shape

(2692, 4500)

In [44]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 7)

# MultinomialNB

In [45]:
from  sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [46]:
classifier.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
y_pred = classifier.predict(x_test)

In [48]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)              #plz run the below function and comment if your news doesnot give right answer

0.9925788497217068

In [49]:
def test_the_news():
    heading_in_f = input("Enter the news heading")
    article_in_f = input("Enter the news article")
    blob = heading_in_f.split(" ")
    
    data1 = []
    for i in counted_all_words_in_heading:
        data1.append(blob.count(i[0]))
        
    blob2 = article_in_f.split(" ")
    
    data2 = []
    for i in counted_all_words_in_Article:
        data2.append(blob2.count(i[0]))
        
    heading_in_f_np = np.array(data1)
    article_in_f_np = np.array(data2)
    return_data = np.hstack((article_in_f_np,heading_in_f_np))
    processable_data =  return_data.reshape(1,4500)
    predicted_data = int(classifier.predict(processable_data))
    if predicted_data == 0:
        return "It's a Business NEWS"
    else:
        return "It's a spotrs News"

    

In [50]:
#test_the_news()

# just call the function test_the_news() and give your heading and article to find out what is your news type.

In [51]:
from datetime import date
from datetime import time
from datetime import datetime

print(datetime.now())

2019-07-21 23:32:12.258437
