# News Classification using SVM and Naive Bayes


In [186]:
import pandas as pd
news = pd.read_csv("../Datasets/AllNews.csv")
news

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,subject,date,tokenize_text,fake/true
0,0,5230,Ex-President Bush says hopeful despite 'pretty...,(Reuters) - Former U.S. president George W. Bu...,politicsNews,"February 28, 2017",president bush climate trump presidency optimi...,true
1,1,5012,Trump's revised travel ban dealt first court s...,(Reuters) - A federal judge in Wisconsin dealt...,politicsNews,"March 11, 2017",judge blow president ban enforcement policy en...,true
2,2,7126,"Trump outlines plans for first day in office, ...",NEW YORK/WASHINGTON (Reuters) - U.S. President...,politicsNews,"November 21, 2016",york president trump day office trade accord w...,true
3,3,6470,Trump's EPA pick resigns from Rule of Law Defe...,(Reuters) - U.S. President-elect Donald Trump’...,politicsNews,"January 7, 2017",president trump pick protection agency chairma...,true
4,4,13529,Bosnian Croat war crimes convict dies after ta...,THE HAGUE (Reuters) - A former Bosnian Croat m...,worldnews,"November 29, 2017",commander poison war courtroom appeal year pri...,true
...,...,...,...,...,...,...,...,...
1962,962,15911,FORMER FBI ASST DIRECTOR LETS IT RIP! Comey’s ...,James Kallstrom is the former Assistant Direct...,Government News,"Nov 9, 2017",assistant director fan director times intellig...,fake
1963,963,19785,ELECTION WHISTLEBLOWER: DOJ In Cahoots With De...,J. Christian Adams: Dead people are voting and...,left-news,"Oct 18, 2016",people something administration anything voter...,fake
1964,964,15379,THE FIX IS IN: JUDGE QUICKLY BLOCKS ABORTION V...,"Judge William H. Orrick, III joins Obama in hi...",politics,"Aug 1, 2015",judge desire parenthood abortion order release...,fake
1965,965,20445,SCREAMING LEFTISTS Interrupt Trump Speech…Crow...,Screaming leftists interrupted Donald Trump s ...,left-news,"Jun 10, 2016",trump speech today faith freedom conference tr...,fake


## Classify by subjects

### Prepare Train and Test Data sets

Split dataset into two, training and test. 
The training data set will be used to fit the model and the predictions will be performed on the test data set.This can be done through the train_test_split from the sklearn library. The Training Data will have 70% of the corpus and Test data will have the remaining 30% as we have set the parameter test_size=0.3 .

In [187]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(news['tokenize_text'], news['subject'],test_size=0.3)

### Encoding
Label encode the target variable — This is done to transform Categorical data of string type in the data set into numerical values which the model can understand.

In [188]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)  

### Feature Extraction with TF-IDF

In [189]:
from sklearn.feature_extraction.text import TfidfVectorizer
#text = news['tokenize_text'].head(10).values.astype('U')

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(news['tokenize_text'].values.astype('U'))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [190]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


Naive Bayes Accuracy Score ->  56.683587140439926


In [191]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  67.68189509306261


## Classify to fake/true news

### Prepare Train and Test datasets and Encoding

In [192]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(news['tokenize_text'], news['fake/true'],test_size=0.3)

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

### Feature Extraction with TF-IDF

In [193]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(news['tokenize_text'].values.astype('U'))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


### Naive Bayes

In [194]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


Naive Bayes Accuracy Score ->  87.64805414551607


### SVM

In [195]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  90.18612521150592
