# Import necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import sklearn
import numpy as np

from tensorflow import keras
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


# Read Dataset

In [2]:
dataset = pd.read_csv('BBC News Train.csv')
test_set = pd.read_csv("BBC News Test.csv")

In [3]:
target_category = dataset['Category'].unique()
print(target_category)

['business' 'tech' 'politics' 'sport' 'entertainment']


In [4]:
dataset['categoryId'] = dataset['Category'].factorize()[0]
dataset.head()

Unnamed: 0,ArticleId,Text,Category,categoryId
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0


In [5]:
category = dataset[["Category","categoryId"]].drop_duplicates().sort_values('categoryId')
category

Unnamed: 0,Category,categoryId
0,business,0
3,tech,1
5,politics,2
6,sport,3
7,entertainment,4


In [6]:
dataset.groupby('Category').categoryId.count()

Category
business         336
entertainment    273
politics         274
sport            346
tech             261
Name: categoryId, dtype: int64

In [7]:
text = dataset["Text"] 
text.head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle  governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
Name: Text, dtype: object

In [8]:
category = dataset["Category"]
category.head()

0    business
1    business
2    business
3        tech
4    business
Name: Category, dtype: object

# Data Preprocessing

In [9]:
import preprocess
dataset['Text'] = dataset['Text'].apply(preprocess.preprocessDataset)
text = dataset['Text']
category = dataset['Category']
text.head()

0    worldcom ex bos launch defenc lawyer defend fo...
1    german busi confid slide german busi confid fe...
2    bbc poll indic econom gloom citizen major nati...
3    lifestyl govern mobil choic faster well funkie...
4    enron bos m payout eighteen former enron direc...
Name: Text, dtype: object

# Train Test Split

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(text,category, test_size = 0.3, random_state = 60,shuffle=True, stratify=category)

#print((X_train))
print(X_test)

1073    hi tech poster guid commut interact poster hel...
1062    hacker recruit pc one million comput net hijac...
122     wenger deject arsen slump arsen manag arsen we...
487     takeov offer sunderland fc bob murray chairman...
1236    jamelia return top r b star jamelia three brit...
                              ...                        
765     dove soar uk album summit manchest rock band d...
186     telegraph newspap axe job daili sunday telegra...
1292    uganda ban vagina monologu uganda author ban p...
1044    end bush denial blair tell eu toni blair urg e...
1190    rich pick hi tech thiev virus trojan malici pr...
Name: Text, Length: 447, dtype: object


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc

tfidf = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
) 

# Training Model using Naive Bayes (Tf-Idf) algorithm

In [12]:
nb = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB()),])
nb.fit(X_train,Y_train)

test_predict = nb.predict(X_test)

train_accuracy = round(nb.score(X_train,Y_train)*100)
test_accuracy =round(accuracy_score(test_predict, Y_test)*100)


print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy ))
print("Naive Bayes Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
print(classification_report(test_predict, Y_test, target_names=target_category))

Naive Bayes Train Accuracy Score : 99% 
Naive Bayes Test Accuracy Score  : 96% 

               precision    recall  f1-score   support

     business       0.98      0.94      0.96       105
         tech       0.93      1.00      0.96        76
     politics       0.93      0.94      0.93        81
        sport       1.00      0.98      0.99       106
entertainment       0.96      0.95      0.96        79

     accuracy                           0.96       447
    macro avg       0.96      0.96      0.96       447
 weighted avg       0.96      0.96      0.96       447



# Testing Model

In [13]:
test_set['Text'] = test_set['Text'].apply(preprocess.preprocessDataset)

test_id = test_set['ArticleId']
test_text = test_set['Text']
y_prdict = nb.predict(test_text)

In [14]:
y_prdict

array(['sport', 'tech', 'sport', 'business', 'sport', 'sport', 'politics',
       'politics', 'entertainment', 'business', 'business', 'tech',
       'politics', 'tech', 'entertainment', 'sport', 'politics', 'tech',
       'entertainment', 'politics', 'business', 'politics', 'sport',
       'business', 'politics', 'sport', 'business', 'sport', 'sport',
       'business', 'politics', 'tech', 'business', 'business', 'sport',
       'sport', 'sport', 'business', 'entertainment', 'politics', 'tech',
       'politics', 'entertainment', 'tech', 'sport', 'tech',
       'entertainment', 'business', 'politics', 'business', 'politics',
       'business', 'business', 'business', 'tech', 'politics', 'tech',
       'entertainment', 'sport', 'tech', 'sport', 'entertainment', 'tech',
       'politics', 'business', 'entertainment', 'sport', 'tech', 'sport',
       'sport', 'tech', 'sport', 'business', 'politics', 'tech', 'sport',
       'tech', 'tech', 'tech', 'entertainment', 'politics', 'sport',
   

In [15]:
#submission = pd.DataFrame(test_id)
submission = pd.DataFrame(list(zip(test_id, y_prdict)),
               columns =['ArticleId', 'Category'])
submission.head(20)

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
5,51,sport
6,2025,politics
7,1479,politics
8,27,entertainment
9,397,business


In [16]:
data = np.array(['movie'])
  
test_text = pd.Series(data)
#test_text = 'qpr keeper day heads for preston queens park rangers keeper chris day is set to join preston on a month s loan.  day has been displaced by the arrival of simon royce  who is in his second month on loan from charlton. qpr have also signed italian generoso rossi. r s manager ian holloway said:  some might say it s a risk as he can t be recalled during that month and simon royce can now be recalled by charlton.  but i have other irons in the fire. i have had a  yes  from a couple of others should i need them.   day s rangers contract expires in the summer. meanwhile  holloway is hoping to complete the signing of middlesbrough defender andy davies - either permanently or again on loan - before saturday s match at ipswich. davies impressed during a recent loan spell at loftus road. holloway is also chasing bristol city midfielder tom doherty.'
y_prdict = nb.predict(test_text)

In [17]:
y_prdict[0]

'sport'

# Create Pickle file

In [18]:
import pickle

file = open('nb.pkl','wb')

pickle.dump(nb,file)

In [19]:
file.close()

In [20]:
from timeit import default_timer as timer

start = timer()
data = np.array(['loss'])
  
test_text = pd.Series(data)
#test_text = 'qpr keeper day heads for preston queens park rangers keeper chris day is set to join preston on a month s loan.  day has been displaced by the arrival of simon royce  who is in his second month on loan from charlton. qpr have also signed italian generoso rossi. r s manager ian holloway said:  some might say it s a risk as he can t be recalled during that month and simon royce can now be recalled by charlton.  but i have other irons in the fire. i have had a  yes  from a couple of others should i need them.   day s rangers contract expires in the summer. meanwhile  holloway is hoping to complete the signing of middlesbrough defender andy davies - either permanently or again on loan - before saturday s match at ipswich. davies impressed during a recent loan spell at loftus road. holloway is also chasing bristol city midfielder tom doherty.'
y_prdict = nb.predict(test_text)
end = timer()
print(end - start)

0.008368600000011384


In [21]:
y_prdict[0]

'business'