In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("BBC_News_Train.csv")
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [3]:
df.describe()

Unnamed: 0,ArticleId
count,1490.0
mean,1119.696644
std,641.826283
min,2.0
25%,565.25
50%,1112.5
75%,1680.75
max,2224.0


In [4]:
df['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

### 1. Preprocessing the dataset

##### Remove any unnecessary columns.

In [5]:
df.drop(['ArticleId'], axis=1, inplace=True)
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


##### Removing Punctuations and Tokenizing

In [6]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]+")

def remove_punctuations_and_tokenize(string):
    
    return tokenizer.tokenize(string)

df["Tokenized and Punctuators Removed"] = df["Text"].apply(remove_punctuations_and_tokenize)
df.head()

Unnamed: 0,Text,Category,Tokenized and Punctuators Removed
0,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex, boss, launches, defence, lawyer..."
1,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,..."
2,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize..."
3,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b..."
4,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, 168m, payout, eighteen, fo..."


##### Removing Stopwords

In [7]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(word_list):
    
    stopwords_removed = []
    
    for word in word_list:
        if word not in stop_words:
            stopwords_removed.append(word)
            
    return stopwords_removed

df["Stopwords Removed"] = df["Tokenized and Punctuators Removed"].apply(remove_stopwords)
df.head()

Unnamed: 0,Text,Category,Tokenized and Punctuators Removed,Stopwords Removed
0,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer..."
1,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former..."


##### Lowercasing

In [8]:
def lowercase(word_list):
    
    words_lowered = []
    
    for word in word_list:
        words_lowered.append(word.lower())
        
    return words_lowered

df["Lowercased"] = df["Stopwords Removed"].apply(lowercase)
df.head()

Unnamed: 0,Text,Category,Tokenized and Punctuators Removed,Stopwords Removed,Lowercased
0,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer..."
1,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former..."


##### Lemmetizing

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmetizer(word_list):
    
    lemmetized = []
    
    for word in word_list:
        lemmetized.append(lemmatizer.lemmatize(word))
        
    return lemmetized

df["Lemmetized"] = df["Lowercased"].apply(lemmetizer)
df.head()

Unnamed: 0,Text,Category,Tokenized and Punctuators Removed,Stopwords Removed,Lowercased,Lemmetized
0,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, bos, launch, defence, lawyer, d..."
1,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slide, german, ..."
2,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former...","[enron, boss, 168m, payout, eighteen, former, ..."


In [10]:
df["Lemmetized"][1]

['german',
 'business',
 'confidence',
 'slide',
 'german',
 'business',
 'confidence',
 'fell',
 'february',
 'knocking',
 'hope',
 'speedy',
 'recovery',
 'europe',
 'largest',
 'economy',
 'munich',
 'based',
 'research',
 'institute',
 'ifo',
 'said',
 'confidence',
 'index',
 'fell',
 '95',
 '5',
 'february',
 '97',
 '5',
 'january',
 'first',
 'decline',
 'three',
 'month',
 'study',
 'found',
 'outlook',
 'manufacturing',
 'retail',
 'sector',
 'worsened',
 'observer',
 'hoping',
 'confident',
 'business',
 'sector',
 'would',
 'signal',
 'economic',
 'activity',
 'picking',
 'surprised',
 'ifo',
 'index',
 'taken',
 'knock',
 'said',
 'dz',
 'bank',
 'economist',
 'bernd',
 'weidensteiner',
 'main',
 'reason',
 'probably',
 'domestic',
 'economy',
 'still',
 'weak',
 'particularly',
 'retail',
 'trade',
 'economy',
 'labour',
 'minister',
 'wolfgang',
 'clement',
 'called',
 'dip',
 'february',
 'ifo',
 'confidence',
 'figure',
 'mild',
 'decline',
 'said',
 'despite',
 'retrea

##### Implementing TF-ICF Weighting Scheme

##### Term Frequency (TF)

In [11]:
categories = df["Category"].unique()
categories

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [12]:
df_business = df.groupby("Category").get_group("business")
df_business

Unnamed: 0,Text,Category,Tokenized and Punctuators Removed,Stopwords Removed,Lowercased,Lemmetized
0,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, boss, launches, defence, lawyer...","[worldcom, ex, bos, launch, defence, lawyer, d..."
1,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slide, german, ..."
2,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
4,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, 168m, payout, eighteen, fo...","[enron, bosses, 168m, payout, eighteen, former...","[enron, bosses, 168m, payout, eighteen, former...","[enron, boss, 168m, payout, eighteen, former, ..."
8,car giant hit by mercedes slump a slump in pro...,business,"[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, mercedes, slump, slump, prof...","[car, giant, hit, mercedes, slump, slump, prof...","[car, giant, hit, mercedes, slump, slump, prof..."
...,...,...,...,...,...,...
1474,indy buys into india paper irish publishing gr...,business,"[indy, buys, into, india, paper, irish, publis...","[indy, buys, india, paper, irish, publishing, ...","[indy, buys, india, paper, irish, publishing, ...","[indy, buy, india, paper, irish, publishing, g..."
1479,high fuel costs hit us airlines two of the lar...,business,"[high, fuel, costs, hit, us, airlines, two, of...","[high, fuel, costs, hit, us, airlines, two, la...","[high, fuel, costs, hit, us, airlines, two, la...","[high, fuel, cost, hit, u, airline, two, large..."
1480,qantas sees profits fly to record australian a...,business,"[qantas, sees, profits, fly, to, record, austr...","[qantas, sees, profits, fly, record, australia...","[qantas, sees, profits, fly, record, australia...","[qantas, see, profit, fly, record, australian,..."
1484,hyundai to build new india plant south korea s...,business,"[hyundai, to, build, new, india, plant, south,...","[hyundai, build, new, india, plant, south, kor...","[hyundai, build, new, india, plant, south, kor...","[hyundai, build, new, india, plant, south, kor..."


##### Calculating Term Frequency of each class

In [13]:
word_list_business = []

for i in df_business.index:
    
    word_list_business.extend(df_business["Lemmetized"][i])
    
word_list_business

['worldcom',
 'ex',
 'bos',
 'launch',
 'defence',
 'lawyer',
 'defending',
 'former',
 'worldcom',
 'chief',
 'bernie',
 'ebbers',
 'battery',
 'fraud',
 'charge',
 'called',
 'company',
 'whistleblower',
 'first',
 'witness',
 'cynthia',
 'cooper',
 'worldcom',
 'ex',
 'head',
 'internal',
 'accounting',
 'alerted',
 'director',
 'irregular',
 'accounting',
 'practice',
 'u',
 'telecom',
 'giant',
 '2002',
 'led',
 'collapse',
 'firm',
 'following',
 'discovery',
 '11bn',
 '5',
 '7bn',
 'accounting',
 'fraud',
 'mr',
 'ebbers',
 'pleaded',
 'guilty',
 'charge',
 'fraud',
 'conspiracy',
 'prosecution',
 'lawyer',
 'argued',
 'mr',
 'ebbers',
 'orchestrated',
 'series',
 'accounting',
 'trick',
 'worldcom',
 'ordering',
 'employee',
 'hide',
 'expense',
 'inflate',
 'revenue',
 'meet',
 'wall',
 'street',
 'earnings',
 'estimate',
 'm',
 'cooper',
 'run',
 'consulting',
 'business',
 'told',
 'jury',
 'new',
 'york',
 'wednesday',
 'external',
 'auditor',
 'arthur',
 'andersen',
 'appr

In [14]:
tf_business = {}

for word in set(word_list_business):
    tf_business[word] = word_list_business.count(word)
    
tf_business

{'chunk': 4,
 'onward': 1,
 'hugues': 1,
 'malta': 1,
 'convenient': 1,
 'uneven': 1,
 'relaxed': 1,
 'heating': 4,
 'bourse': 1,
 'august': 19,
 'fargo': 6,
 'marc': 3,
 'jon': 1,
 '177': 1,
 'example': 11,
 'broadly': 3,
 'addressed': 2,
 'entertainment': 2,
 'busy': 3,
 '2871': 1,
 'sound': 5,
 'conjunction': 1,
 'act': 14,
 'lerach': 1,
 'suspending': 1,
 'permanently': 1,
 'claimant': 1,
 'sits': 1,
 'hydraulic': 1,
 'assumes': 1,
 'dell': 2,
 'erase': 1,
 'abroad': 13,
 'prevent': 7,
 'anyone': 3,
 'strange': 1,
 '15': 45,
 'miracle': 1,
 'bos': 25,
 'suspended': 2,
 'threefold': 1,
 'buy': 62,
 'bigger': 16,
 'section': 2,
 'tackle': 5,
 '187': 1,
 'paper': 11,
 'optimistic': 16,
 'rich': 11,
 'ohio': 1,
 'exile': 2,
 'shaheed': 2,
 'failed': 22,
 'insists': 2,
 'jubilee': 1,
 'pickup': 2,
 '26bn': 1,
 'elected': 6,
 'digit': 4,
 'approach': 15,
 'zero': 2,
 'chernin': 3,
 'bank': 250,
 'yuganskneftegaz': 7,
 'rough': 1,
 'spinner': 1,
 'sahara': 1,
 'succeed': 5,
 'switzerland'

In [15]:
df_tech = df.groupby("Category").get_group("tech")

word_list_tech = []
for i in df_tech.index:
    word_list_tech.extend(df_tech["Lemmetized"][i])

tf_tech = {}
for word in set(word_list_tech):
    tf_tech[word] = word_list_tech.count(word)
    
tf_tech

{'subway': 1,
 'chunk': 4,
 'plundering': 1,
 'subscribes': 1,
 'convenient': 6,
 'scouring': 1,
 'august': 15,
 'marc': 2,
 'reconditioned': 3,
 'remembrance': 1,
 'xboxes': 3,
 'example': 24,
 'broadly': 1,
 'entertainment': 58,
 'addressed': 2,
 'busy': 5,
 'insane': 2,
 'jonas': 1,
 'bandwagon': 1,
 'hewland': 1,
 'sound': 39,
 'conjunction': 1,
 'act': 21,
 'garden': 2,
 'antenna': 2,
 'oasis': 1,
 'restitution': 1,
 'visuals': 2,
 'pivotal': 1,
 'carfax': 1,
 'sits': 4,
 'hydraulic': 1,
 'revolution': 20,
 'dell': 5,
 'erase': 2,
 'abroad': 2,
 'prevent': 12,
 'anyone': 24,
 'transferring': 1,
 'fearsomely': 1,
 'skype': 19,
 'strange': 3,
 '15': 32,
 'clint': 1,
 'suspended': 6,
 'bos': 4,
 'section': 2,
 'buy': 39,
 'bigger': 13,
 'housewife': 2,
 'tackle': 4,
 'coleman': 2,
 'paper': 12,
 '40mbps': 2,
 'optimistic': 7,
 'rich': 6,
 'itunes': 19,
 'outline': 1,
 'failed': 11,
 'ethical': 1,
 'digit': 2,
 'approach': 14,
 '320': 1,
 'zero': 2,
 'amputee': 2,
 'gwen': 2,
 'mate':

In [16]:
df_politics = df.groupby("Category").get_group("politics")

word_list_politics = []
for i in df_politics.index:
    word_list_politics.extend(df_politics["Lemmetized"][i])

tf_politics = {}
for word in set(word_list_politics):
    tf_politics[word] = word_list_politics.count(word)
    
tf_politics

{'chunk': 1,
 'whitehall': 6,
 'insurrection': 1,
 'doomed': 2,
 'august': 1,
 'example': 27,
 '177': 1,
 'entertainment': 2,
 'insane': 6,
 'stricter': 1,
 'sound': 10,
 'act': 61,
 'appalled': 1,
 'leant': 2,
 'permanently': 5,
 'pivotal': 3,
 'ibrox': 1,
 'slum': 1,
 '60p': 1,
 'sits': 2,
 'cathy': 2,
 'assumes': 1,
 'revolution': 1,
 'abroad': 11,
 'anyone': 17,
 'prevent': 9,
 'ridicule': 1,
 '15': 15,
 'suspended': 15,
 'blight': 2,
 'bos': 4,
 'section': 10,
 'buy': 15,
 'bigger': 8,
 'clarified': 2,
 'sheppey': 2,
 'tackle': 30,
 'expletive': 2,
 'paper': 52,
 'optimistic': 3,
 'uganda': 1,
 'confederation': 1,
 'rich': 7,
 'outline': 6,
 'failed': 35,
 'insists': 4,
 'regulating': 1,
 'elected': 30,
 'ethical': 1,
 'digit': 1,
 'approach': 20,
 'zero': 3,
 'uncollected': 1,
 'mate': 7,
 'bank': 7,
 'rough': 2,
 'amritsar': 2,
 'druid': 1,
 'lucy': 6,
 'hung': 2,
 'disagrees': 1,
 'succeed': 6,
 'compatible': 1,
 'outlined': 11,
 'switzerland': 4,
 '83': 2,
 'elizabeth': 3,
 '1

In [17]:
df_entertainment = df.groupby("Category").get_group("entertainment")

word_list_entertainment = []
for i in df_entertainment.index:
    word_list_entertainment.extend(df_entertainment["Lemmetized"][i])

tf_entertainment = {}
for word in set(word_list_entertainment):
    tf_entertainment[word] = word_list_entertainment.count(word)
    
tf_entertainment

{'plundering': 1,
 'pap': 1,
 'parted': 1,
 'improvised': 1,
 'rundown': 3,
 'august': 7,
 'marc': 6,
 'painstaking': 1,
 'jon': 3,
 'example': 3,
 'monologue': 3,
 'pondered': 1,
 'entertainment': 27,
 'busy': 2,
 'clancy': 1,
 'marmalade': 1,
 'sound': 25,
 'christine': 1,
 'conjunction': 1,
 'act': 64,
 'cursed': 1,
 'garden': 5,
 'antenna': 1,
 'leant': 1,
 'stripy': 1,
 'oasis': 3,
 'expat': 1,
 'permanently': 1,
 'sits': 1,
 'batman': 1,
 'revolution': 3,
 'anyone': 11,
 'bonnie': 1,
 'prevent': 3,
 'audition': 4,
 'strange': 3,
 'hoffman': 11,
 '15': 17,
 'clint': 20,
 'suspended': 2,
 'bos': 2,
 'beacon': 1,
 'section': 6,
 'buy': 15,
 'bigger': 6,
 'housewife': 10,
 'noble': 1,
 'mantle': 2,
 'tackle': 2,
 'expletive': 1,
 'paper': 4,
 'optimistic': 1,
 'uganda': 5,
 'outsider': 2,
 'rich': 1,
 'topper': 5,
 'itunes': 7,
 'ohio': 1,
 'failed': 16,
 'jolin': 5,
 'broccoli': 1,
 'mutant': 4,
 'approach': 1,
 'zero': 1,
 'gwen': 2,
 'mate': 1,
 'lurking': 1,
 'bank': 13,
 'rough'

In [18]:
df_sport = df.groupby("Category").get_group("sport")

word_list_sport = []
for i in df_sport.index:
    word_list_sport.extend(df_sport["Lemmetized"][i])

tf_sport = {}
for word in set(word_list_sport):
    tf_sport[word] = word_list_sport.count(word)
    
tf_sport

{'chunk': 1,
 'papadoyiannakis': 2,
 'pap': 1,
 'ensuing': 1,
 'makelele': 5,
 'convenient': 1,
 'relaxed': 3,
 'unruffled': 1,
 'kotoko': 1,
 'batistuta': 1,
 'august': 27,
 'jon': 6,
 'example': 13,
 'addressed': 1,
 'uninformed': 1,
 'busy': 2,
 'jonas': 1,
 'kezman': 12,
 'sound': 1,
 'conjunction': 1,
 'baros': 7,
 'act': 6,
 'garden': 1,
 'ibrox': 4,
 'sits': 1,
 'pragmatist': 1,
 'desailly': 6,
 'kostas': 10,
 'neri': 1,
 'caretaker': 3,
 '1815': 1,
 'effortlessly': 1,
 'erase': 3,
 'abroad': 3,
 'prevent': 8,
 'anyone': 10,
 'ace': 8,
 'strange': 4,
 '15': 31,
 'suspended': 22,
 'bos': 75,
 'miracle': 3,
 'section': 1,
 'buy': 6,
 'bigger': 8,
 'krajicek': 1,
 'tackle': 25,
 'mantle': 1,
 '187': 2,
 'fowler': 12,
 'southwell': 5,
 'paper': 5,
 'modification': 1,
 'optimistic': 5,
 'confederation': 1,
 'bruise': 1,
 'nia': 1,
 'rich': 1,
 'outline': 1,
 'failed': 41,
 'insists': 16,
 'teak': 1,
 'jauregi': 1,
 'ignominy': 1,
 'elected': 1,
 'ethical': 1,
 'approach': 10,
 'zero'

##### Class Frequency (CF)

In [19]:
class_frequency = {}

for word in set(word_list_business+word_list_politics+word_list_entertainment+word_list_sport+word_list_tech):
    
    count = 0
    
    if word in word_list_business:
        count+=1
    if word in word_list_tech:
        count+=1
    if word in word_list_entertainment:
        count+=1
    if word in word_list_sport:
        count+=1
    if word in word_list_politics:
        count+=1
        
    class_frequency[word] = count
    
class_frequency

{'chunk': 4,
 'papadoyiannakis': 1,
 'subscribes': 1,
 'parted': 1,
 'improvised': 1,
 'relaxed': 2,
 'fargo': 1,
 'entertainment': 4,
 'busy': 4,
 'clancy': 1,
 '2871': 1,
 'kezman': 1,
 'hewland': 1,
 'christine': 1,
 'garden': 3,
 'leant': 2,
 'stripy': 1,
 'oasis': 2,
 'claimant': 1,
 '60p': 1,
 'caretaker': 1,
 'ace': 1,
 'audition': 1,
 'miracle': 2,
 'suspended': 5,
 'blight': 1,
 'beacon': 1,
 'buy': 5,
 'housewife': 2,
 'krajicek': 1,
 'tackle': 5,
 'optimistic': 5,
 '40mbps': 1,
 'confederation': 2,
 'topper': 1,
 'ohio': 2,
 'outline': 3,
 'failed': 5,
 'insists': 3,
 'pickup': 1,
 'ignominy': 1,
 '26bn': 1,
 'elected': 3,
 'yuganskneftegaz': 1,
 'amritsar': 1,
 'spinner': 2,
 'unplug': 1,
 'finsihed': 1,
 'pleaded': 4,
 'slowdown': 2,
 'admonished': 1,
 'signature': 3,
 'harassed': 1,
 'paradise': 2,
 'provocation': 1,
 'elizabeth': 4,
 'wolstenholme': 1,
 'frid': 1,
 'spall': 1,
 'bethnal': 1,
 'ravenous': 1,
 'nervelessly': 1,
 'unblighted': 1,
 'dogged': 2,
 'advisory': 

##### Inverse Class Frequency

In [20]:
inverse_class_frequency = {}

for word in class_frequency:
    
    inverse_class_frequency[word] = np.log10(5/class_frequency[word])
    
inverse_class_frequency

{'chunk': 0.09691001300805642,
 'papadoyiannakis': 0.6989700043360189,
 'subscribes': 0.6989700043360189,
 'parted': 0.6989700043360189,
 'improvised': 0.6989700043360189,
 'relaxed': 0.3979400086720376,
 'fargo': 0.6989700043360189,
 'entertainment': 0.09691001300805642,
 'busy': 0.09691001300805642,
 'clancy': 0.6989700043360189,
 '2871': 0.6989700043360189,
 'kezman': 0.6989700043360189,
 'hewland': 0.6989700043360189,
 'christine': 0.6989700043360189,
 'garden': 0.2218487496163564,
 'leant': 0.3979400086720376,
 'stripy': 0.6989700043360189,
 'oasis': 0.3979400086720376,
 'claimant': 0.6989700043360189,
 '60p': 0.6989700043360189,
 'caretaker': 0.6989700043360189,
 'ace': 0.6989700043360189,
 'audition': 0.6989700043360189,
 'miracle': 0.3979400086720376,
 'suspended': 0.0,
 'blight': 0.6989700043360189,
 'beacon': 0.6989700043360189,
 'buy': 0.0,
 'housewife': 0.3979400086720376,
 'krajicek': 0.6989700043360189,
 'tackle': 0.0,
 'optimistic': 0.0,
 '40mbps': 0.6989700043360189,
 '

##### TF-ICF for each class

In [21]:
tf_icf_business = {}

for word in tf_business:
    
    tf_icf_business[word] = tf_business[word]*inverse_class_frequency[word] 
    
tf_icf_business

{'chunk': 0.3876400520322257,
 'onward': 0.6989700043360189,
 'hugues': 0.6989700043360189,
 'malta': 0.6989700043360189,
 'convenient': 0.2218487496163564,
 'uneven': 0.6989700043360189,
 'relaxed': 0.3979400086720376,
 'heating': 2.7958800173440754,
 'bourse': 0.6989700043360189,
 'august': 0.0,
 'fargo': 4.193820026016113,
 'marc': 0.6655462488490692,
 'jon': 0.2218487496163564,
 '177': 0.3979400086720376,
 'example': 0.0,
 'broadly': 1.193820026016113,
 'addressed': 0.4436974992327128,
 'entertainment': 0.19382002601611284,
 'busy': 0.2907300390241693,
 '2871': 0.6989700043360189,
 'sound': 0.0,
 'conjunction': 0.09691001300805642,
 'act': 0.0,
 'lerach': 0.6989700043360189,
 'suspending': 0.6989700043360189,
 'permanently': 0.2218487496163564,
 'claimant': 0.6989700043360189,
 'sits': 0.0,
 'hydraulic': 0.3979400086720376,
 'assumes': 0.3979400086720376,
 'dell': 0.7958800173440752,
 'erase': 0.2218487496163564,
 'abroad': 1.2598301691047336,
 'prevent': 0.0,
 'anyone': 0.0,
 'str

In [22]:
tf_icf_sport = {}
for word in tf_sport:
    tf_icf_sport[word] = tf_sport[word]*inverse_class_frequency[word] 
    
tf_icf_entertainment = {}
for word in tf_entertainment:
    tf_icf_entertainment[word] = tf_entertainment[word]*inverse_class_frequency[word] 
    
tf_icf_politics = {}
for word in tf_politics:
    tf_icf_politics[word] = tf_politics[word]*inverse_class_frequency[word] 
    
tf_icf_tech = {}
for word in tf_tech:
    tf_icf_tech[word] = tf_tech[word]*inverse_class_frequency[word] 

##### Calculating word vector for each sentence and creating the corresponding DataFrame

In [23]:
un_words = set(word_list_business+word_list_politics+word_list_entertainment+word_list_sport+word_list_tech)
zero_data = np.zeros(shape=(1490,len(un_words)))
vector_matrix_df = pd.DataFrame(zero_data, columns=un_words)
vector_matrix_df["Target"] = [0 for i in df.index]
vector_matrix_df

Unnamed: 0,chunk,papadoyiannakis,subscribes,parted,improvised,relaxed,fargo,entertainment,busy,clancy,...,jonatan,82,387m,wipro,rediscovered,folded,goya,sponsored,soldier,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [24]:
count = 0
    
for i in df_business["Lemmetized"]:
    vector_matrix_df.at[count, "Target"] = "business"
    for j in i:
        vector_matrix_df.at[count, j] = tf_icf_business[j]
    count+=1

In [25]:
for i in df_sport["Lemmetized"]:
    vector_matrix_df.at[count, "Target"] = "sport"
    for j in i:
        vector_matrix_df.at[count, j] = tf_icf_sport[j]
    count+=1

In [26]:
for i in df_tech["Lemmetized"]:
    vector_matrix_df.at[count, "Target"] = "tech"
    for j in i:
        vector_matrix_df.at[count, j] = tf_icf_tech[j]
    count+=1
for i in df_entertainment["Lemmetized"]:
    vector_matrix_df.at[count, "Target"] = "entertainment"
    for j in i:
        vector_matrix_df.at[count, j] = tf_icf_entertainment[j]
    count+=1
for i in df_politics["Lemmetized"]:
    vector_matrix_df.at[count, "Target"] = "politics"
    for j in i:
        vector_matrix_df.at[count, j] = tf_icf_politics[j]
    count+=1

In [27]:
vector_matrix_df

Unnamed: 0,chunk,papadoyiannakis,subscribes,parted,improvised,relaxed,fargo,entertainment,busy,clancy,...,jonatan,82,387m,wipro,rediscovered,folded,goya,sponsored,soldier,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,business
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,business
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,business
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,business
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,business
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,politics
1486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,politics
1487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,politics
1488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,politics


In [28]:
vector_matrix_df["tech"].value_counts()

0.0000     1459
15.9176      27
1.9897        4
Name: tech, dtype: int64

In [29]:
vector_matrix_df["inconsistency"].value_counts()

0.00000    1489
0.69897       1
Name: inconsistency, dtype: int64

In [30]:
vector_matrix_df["propensity"].value_counts()

0.00000    1489
0.69897       1
Name: propensity, dtype: int64

##### LabelEncoding the target column

In [31]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
vector_matrix_df["Target"]= le.fit_transform(vector_matrix_df["Target"])

vector_matrix_df = vector_matrix_df.sample(frac = 1).reset_index(drop=True)
vector_matrix_df

Unnamed: 0,chunk,papadoyiannakis,subscribes,parted,improvised,relaxed,fargo,entertainment,busy,clancy,...,jonatan,82,387m,wipro,rediscovered,folded,goya,sponsored,soldier,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


### 2. The Dataset

##### Split the BBC train dataset into training and testing sets. Use a 70:30 split for the training and testing sets, respectively.

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vector_matrix_df.drop("Target", axis=1), vector_matrix_df["Target"],
                                   random_state=104, 
                                   test_size=0.3, 
                                   shuffle=True)

In [33]:
vector_matrix_df.shape

(1490, 22070)

In [34]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1043, 22069)
(447, 22069)
(1043,)
(447,)


In [35]:
len_df = df.shape[0]

len_business = df_business.shape[0]
len_sports = df_sport.shape[0]
len_tech = df_tech.shape[0]
len_entertainment = df_entertainment.shape[0]
len_politics = df_politics.shape[0]

### 3. Training the Naive Bayes classifier with TF-ICF:

##### Implement the Naive Bayes classifier with the TF-ICF weighting scheme.

In [41]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

##### Calculate the probability of each category based on the frequency of documents in the training set that belong to that category. 

In [36]:
print("Probability of Class Business =", len_business/len_df)

Probability of Class Business = 0.22550335570469798


In [37]:
print("Probability of Class Entertainment =", len_entertainment/len_df)

Probability of Class Entertainment = 0.18322147651006712


In [38]:
print("Probability of Class Sports =", len_sports/len_df)

Probability of Class Sports = 0.23221476510067113


In [39]:
print("Probability of Class Tech =", len_tech/len_df)

Probability of Class Tech = 0.17516778523489934


In [40]:
print("Probability of Class Politics =", len_politics/len_df)

Probability of Class Politics = 0.18389261744966443


### 4. Testing the Naive Bayes classifier with TF-ICF:

In [42]:
from sklearn import metrics

y_pred = naive_bayes_classifier.predict(X_test)

score = metrics.accuracy_score(y_test, y_pred)
print(score)

1.0


In [43]:
metrics.confusion_matrix(y_test, y_pred)

array([[103,   0,   0,   0,   0],
       [  0,  75,   0,   0,   0],
       [  0,   0,  88,   0,   0],
       [  0,   0,   0,  99,   0],
       [  0,   0,   0,   0,  82]], dtype=int64)

In [44]:
pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
support,103.0,75.0,88.0,99.0,82.0,1.0,447.0,447.0


### 5. Improving the classifier

##### Experiment with different preprocessing techniques and parameters to improve the performance of the classifier (including different splits like 60-40, 80-20, 50-50).

In [45]:
for i in [50,40,20,10]:
    
    X_train, X_test, y_train, y_test = train_test_split(vector_matrix_df.drop("Target", axis=1), vector_matrix_df["Target"],
                                   random_state=104, 
                                   test_size=i, 
                                   shuffle=True)
    
    naive_bayes_classifier.fit(X_train, y_train)
    y_pred = naive_bayes_classifier.predict(X_test)
    
    print("For the Split ", 100-i, ":", i, ", results are")
    
    print("Accuracy", metrics.accuracy_score(y_test, y_pred), "\n")
    print("Confusion Matrix", metrics.confusion_matrix(y_test, y_pred), "\n")
    print("Classification Report", pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)), "\n")

For the Split  50 : 50 , results are
Accuracy 1.0 

Confusion Matrix [[ 9  0  0  0  0]
 [ 0  9  0  0  0]
 [ 0  0 10  0  0]
 [ 0  0  0  9  0]
 [ 0  0  0  0 13]] 

Classification Report              0    1     2    3     4  accuracy  macro avg  weighted avg
precision  1.0  1.0   1.0  1.0   1.0       1.0        1.0           1.0
recall     1.0  1.0   1.0  1.0   1.0       1.0        1.0           1.0
f1-score   1.0  1.0   1.0  1.0   1.0       1.0        1.0           1.0
support    9.0  9.0  10.0  9.0  13.0       1.0       50.0          50.0 

For the Split  60 : 40 , results are
Accuracy 1.0 

Confusion Matrix [[ 6  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0  9  0  0]
 [ 0  0  0  7  0]
 [ 0  0  0  0 11]] 

Classification Report              0    1    2    3     4  accuracy  macro avg  weighted avg
precision  1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
recall     1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
f1-score   1.0  1.0  1.0  1.0   1.0       1.0        1.0 

##### TF-IDF Implementation

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

X = df["Text"]
y = df["Category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=104, test_size=30, shuffle=True)

tf_vectorizer = CountVectorizer()
X_train_tf = tf_vectorizer.fit_transform(X_train)
X_test_tf = tf_vectorizer.transform(X_test)

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)
y_pred = naive_bayes_classifier.predict(X_test_tf)
print("accuracy: ", metrics.accuracy_score(y_test, y_pred))

accuracy:  0.9333333333333333


In [47]:
metrics.confusion_matrix(y_test, y_pred)

array([[5, 0, 1, 0, 0],
       [0, 5, 1, 0, 0],
       [0, 0, 5, 0, 0],
       [0, 0, 0, 9, 0],
       [0, 0, 0, 0, 4]], dtype=int64)

In [48]:
pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,business,entertainment,politics,sport,tech,accuracy,macro avg,weighted avg
precision,1.0,1.0,0.714286,1.0,1.0,0.933333,0.942857,0.952381
recall,0.833333,0.833333,1.0,1.0,1.0,0.933333,0.933333,0.933333
f1-score,0.909091,0.909091,0.833333,1.0,1.0,0.933333,0.930303,0.935859
support,6.0,6.0,5.0,9.0,4.0,0.933333,30.0,30.0


### 6. Conclusion

##### Write a brief report summarizing your findings

Findings - TF-ICF vectorization is a better method than TF-IDF to quantify the importance/relevance of documents. In my findings, TF-IDF is giving the accuracy 93.33% where as Tf-ICF is giving the accuracy of 100%. The precision, recall, f1-score all are 1 for the case.

##### Discuss the performance of the classifier and the impact of different preprocessing techniques, features, and weighting schemes on the results.

Even upon trying the same for different splits, i.e. 50:50, 60:40, 70:30, 80:20, 90:10, we still get the accuracy of 100% for TF-ICF. Applying TF-IDF, we get less accuracy of 93.33%