In [32]:
# import packages
# standard library
import re
import warnings

# third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

# make sure the stopword corpus read by the preprocessing cells is available locally
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the dataset
#Loading the training data

In [33]:
# Read the labelled headlines and preview the first ten rows
TRAIN_CSV = "train.csv"
news = pd.read_csv(TRAIN_CSV)
news.head(n=10)

Unnamed: 0,Id,TITLE,CATEGORY
0,50846,Ukraine to get $18 billion rescue from IMF,b
1,234375,McDonald's Abandons Headquarters to Avoid Prot...,b
2,63422,New study finds evidence that Autism begins in...,m
3,353942,Prime Minister Modi Says Meeting With Facebook...,t
4,311586,New robot guides at Tokyo museum almost outper...,t
5,116556,RCMP asked for delay of CRA's stolen data anno...,b
6,408119,"'Sharknado 2' Sucks In 3.9 Million Viewers, Sp...",e
7,384804,How Tim Cook Is Taking Steve Jobs' Advice to H...,t
8,16978,WATCH: Huge aquarium breaks at Disney World re...,b
9,275714,CURRENCIES: Dollar Falls Vs. Rivals; Euro Rebo...,b


In [34]:
# Number of headlines per category label
dist = news["CATEGORY"].value_counts()
dist

e    122013
b     92679
t     86846
m     36397
Name: CATEGORY, dtype: int64

# Visualize and Preprocess the data
#Retaining only alphabetic characters (using regular expressions)
#Removing stopwords (using the nltk library)

In [35]:
# English stopwords as a set; the stopword-removal cells test membership against it
stop = set(stopwords.words('english'))


def _letters_only(title):
    """Replace every character that is not an ASCII letter with a single space."""
    return re.sub("[^a-zA-Z]", " ", title)


# Digits, punctuation and other symbols become spaces; letters keep their case here
news['TITLE'] = news['TITLE'].apply(_letters_only)
news['TITLE'].head(10)

0           Ukraine to get     billion rescue from IMF
1    McDonald s Abandons Headquarters to Avoid Prot...
2    New study finds evidence that Autism begins in...
3    Prime Minister Modi Says Meeting With Facebook...
4    New robot guides at Tokyo museum almost outper...
5    RCMP asked for delay of CRA s stolen data anno...
6     Sharknado    Sucks In     Million Viewers  Sp...
7    How Tim Cook Is Taking Steve Jobs  Advice to H...
8    WATCH  Huge aquarium breaks at Disney World re...
9    CURRENCIES  Dollar Falls Vs  Rivals  Euro Rebo...
Name: TITLE, dtype: object

In [36]:
def _tokenize(title):
    """Lowercase a title and split it on whitespace into a list of tokens."""
    return title.lower().split()


# Each TITLE entry becomes a list of lowercase tokens
news['TITLE'] = news['TITLE'].apply(_tokenize)
news['TITLE'].head(10)

0       [ukraine, to, get, billion, rescue, from, imf]
1    [mcdonald, s, abandons, headquarters, to, avoi...
2    [new, study, finds, evidence, that, autism, be...
3    [prime, minister, modi, says, meeting, with, f...
4    [new, robot, guides, at, tokyo, museum, almost...
5    [rcmp, asked, for, delay, of, cra, s, stolen, ...
6    [sharknado, sucks, in, million, viewers, spits...
7    [how, tim, cook, is, taking, steve, jobs, advi...
8    [watch, huge, aquarium, breaks, at, disney, wo...
9    [currencies, dollar, falls, vs, rivals, euro, ...
Name: TITLE, dtype: object

In [37]:
# remove stopwords
def _drop_stopwords(title):
    """Return the title's tokens with English stopwords removed.

    Normally receives the token list produced by the tokenize cell. If the
    column already holds joined strings (this cell re-run after the join
    cell), the string is split on whitespace first; iterating a str directly
    would filter individual characters and silently corrupt the column.
    """
    tokens = title.split() if isinstance(title, str) else title
    return [token for token in tokens if token not in stop]


news['TITLE'] = news['TITLE'].apply(_drop_stopwords)
news['TITLE'].head(10)

0                 [ukraine, get, billion, rescue, imf]
1    [mcdonald, abandons, headquarters, avoid, prot...
2    [new, study, finds, evidence, autism, begins, ...
3    [prime, minister, modi, says, meeting, faceboo...
4    [new, robot, guides, tokyo, museum, almost, ou...
5    [rcmp, asked, delay, cra, stolen, data, announ...
6    [sharknado, sucks, million, viewers, spits, on...
7    [tim, cook, taking, steve, jobs, advice, heart...
8    [watch, huge, aquarium, breaks, disney, world,...
9    [currencies, dollar, falls, vs, rivals, euro, ...
Name: TITLE, dtype: object

In [38]:
# join list elements
def _join_tokens(tokens):
    """Join a token list into one space-separated string.

    A value that is already a string is returned unchanged: ' '.join on a str
    iterates its characters, so re-running this cell would otherwise turn
    "ukraine get" into "u k r a i n e   g e t" without raising any error.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)


news['TITLE'] = news['TITLE'].apply(_join_tokens)
news['TITLE'].head(10)

0                       ukraine get billion rescue imf
1      mcdonald abandons headquarters avoid protesters
2          new study finds evidence autism begins womb
3    prime minister modi says meeting facebook coo ...
4    new robot guides tokyo museum almost outperfor...
5        rcmp asked delay cra stolen data announcement
6    sharknado sucks million viewers spits one bill...
7    tim cook taking steve jobs advice heart apple ceo
8    watch huge aquarium breaks disney world restau...
9      currencies dollar falls vs rivals euro rebounds
Name: TITLE, dtype: object

# split into training and test sets

In [39]:
# Hold out 20% of the cleaned headlines for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    news["TITLE"],
    news["CATEGORY"],
    test_size=0.2,
    random_state=3,
)

In [40]:
# Bag-of-words counts: the vocabulary is learned from the training titles only
# and then reused unchanged for the held-out titles
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# tf-idf weights over unigrams, bigrams and trigrams, fitted the same way
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Row counts must match the label counts. Note the first line reports the
# count-vectorized training matrix and the second the tf-idf test matrix.
print(X_train_count.shape, y_train.shape)
print(X_test_tfidf.shape, y_test.shape)

(270348, 42616) (270348,)
(67587, 1665439) (67587,)


In [41]:
# Sizes of the raw text splits and their label vectors (train first, then test)
for titles, labels in ((X_train, y_train), (X_test, y_test)):
    print(titles.shape, labels.shape)

(270348,) (270348,)
(67587,) (67587,)


# Model Building

In [42]:
# Multinomial naive Bayes, one model per feature representation
nb_1 = MultinomialNB()  # trained on raw token counts
nb_2 = MultinomialNB()  # trained on tf-idf weights

# fit on count vectorizer training data
nb_1.fit(X_train_count, y_train)

# fit on tfidf vectorizer training data
nb_2.fit(X_train_tfidf, y_train)

# Hold-out accuracy. accuracy_score is documented as (y_true, y_pred), so the
# ground-truth labels go first; the value is the same either way for accuracy,
# but the order matters for asymmetric metrics such as confusion_matrix.
acc_count_nb = accuracy_score(y_test, nb_1.predict(X_test_count))
acc_tfidf_nb = accuracy_score(y_test, nb_2.predict(X_test_tfidf))

# display accuracies (count features first, tf-idf second)
print(acc_count_nb, acc_tfidf_nb)

0.9272641188394218 0.9295870507642002


In [43]:
# NOTE(review): this silences every warning for the rest of the kernel session,
# including any that the fits below may raise. `warnings` is imported in the
# notebook's import cell.
warnings.filterwarnings('ignore')

# One-vs-rest logistic regression, one model per feature representation
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))  # token counts
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))  # tf-idf weights

# fit on count vectorizer training data
logreg_1.fit(X_train_count, y_train)

# fit on tfidf vectorizer training data
logreg_2.fit(X_train_tfidf, y_train)

# Hold-out accuracy. accuracy_score is documented as (y_true, y_pred), so the
# ground-truth labels go first; the value is the same either way for accuracy.
acc_count_logreg = accuracy_score(y_test, logreg_1.predict(X_test_count))
acc_tfidf_logreg = accuracy_score(y_test, logreg_2.predict(X_test_tfidf))

# display accuracies (count features first, tf-idf second)
print(acc_count_logreg, acc_tfidf_logreg)



0.946099101898294 0.9420450678384896


# Prediction on the test data and creating the sample submission file

In [45]:
# Prediction on test data

# Read the unlabelled headlines to be classified and preview the first ten rows
TEST_CSV = 'test.csv'
test = pd.read_csv(TEST_CSV)
test.head(n=10)

Unnamed: 0,Id,TITLE
0,86998,A simple blood test to detect 'solid' cancers?
1,112926,Mozilla appoints veteran Chris Beard as interi...
2,280943,FDA Abruptly Reverses Stance on Wooden Aging B...
3,37154,Cancer stats confirm value of colonoscopy
4,152800,"Apple, Samsung playing games citing big number..."
5,412956,Amazon to Buy Video Gaming Site Twitch for Mor...
6,197094,Google Shopping Launches Same-Day Delivery In ...
7,117620,Spacecraft survives 'blood moon' eclipse seen ...
8,14854,Madonna dresses up as Daenerys Targaryen from ...
9,342032,Pippa Middleton Bum Style Maintains 'Normal' R...


In [47]:
# Storing the id from the test file
id_ = test['Id']


def _clean_title(title):
    """Normalise one headline the same way the training titles were cleaned.

    Steps, in order: replace every non-letter with a space, lowercase, split
    on whitespace, drop English stopwords (the notebook-level `stop` set) and
    re-join the remaining tokens with single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", title)
    kept = [word for word in letters_only.lower().split() if word not in stop]
    return ' '.join(kept)


# Apply the transformations on test in a single pass over the column
test['TITLE'] = test['TITLE'].apply(_clean_title)

# Project the cleaned titles onto the vocabularies learned from the training split
test_count = count_vectorizer.transform(test['TITLE'])
test_tfidf = tfidf_vectorizer.transform(test['TITLE'])  # not consumed in this cell

# Predict on the test data with logreg_1 (count features), which has the
# highest hold-out accuracy among the four models in the recorded outputs
y_pred_test = logreg_1.predict(test_count)
print(test_count.shape)
y_pred_test = y_pred_test.flatten()

# Create a sample submission file: one predicted category per test Id
sample_submission = pd.DataFrame({'Id':id_, 'CATEGORY':y_pred_test})
print(sample_submission.head(20))

# Convert the sample submission file into a csv file
# NOTE(review): the file name is spelled 'submisson'; kept as-is in case
# something downstream expects exactly this name.
sample_submission.to_csv('sample_submisson.csv',index=False)



(84484, 42616)
        Id CATEGORY
0    86998        m
1   112926        t
2   280943        m
3    37154        m
4   152800        t
5   412956        t
6   197094        t
7   117620        t
8    14854        e
9   342032        e
10    3986        e
11  383521        b
12   85462        e
13   68984        t
14  218666        m
15   63219        e
16   22598        e
17  324981        e
18  347188        e
19  129828        e
