# Import statements

In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the headline dataset (expected next to the notebook) and preview
# the first few rows.
DATASET_PATH = 'newdatasetfinal.csv'
news = pd.read_csv(DATASET_PATH)
news.head()

Unnamed: 0,title,NewCategory
0,Bitcoin is down 60 percent this year. Here's w...,Business & Finance
1,6 health problems marijuana could treat better...,Crime
2,9 charts that explain the history of global we...,Business & Finance
3,Remember when legal marijuana was going to sen...,Crime
4,Obamacare succeeded for one simple reason: it'...,Technology & Health


In [3]:
# List the distinct category labels present in the dataset.
category_labels = news['NewCategory']
category_labels.unique()

array(['Business & Finance', 'Crime', 'Technology & Health', 'Politics',
       'Entertainment'], dtype=object)

In [4]:
# Normalise both text columns: drop every character that is neither a word
# character nor whitespace, then lower-case the result.
#
# - The pattern is a raw string so that `\w` / `\s` reach the regex engine
#   instead of being parsed as (invalid) Python string escapes.
# - `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
#   treats the pattern as a literal string by default, which would silently
#   leave all punctuation in place.
#
# Note: removing '&' from labels such as 'Business & Finance' leaves two
# consecutive spaces in the normalised label ('business  finance').
PUNCTUATION_PATTERN = r'[^\w\s]'

news['title'] = (
    news['title']
    .str.replace(PUNCTUATION_PATTERN, '', regex=True)
    .str.lower()
)
news['NewCategory'] = (
    news['NewCategory']
    .str.replace(PUNCTUATION_PATTERN, '', regex=True)
    .str.lower()
)

# Vectorization

In [5]:
# Target: the (normalised) category label of each headline.
y = news['NewCategory']

# Features: bag-of-words token counts of the titles, with English stop words
# removed.
# NOTE(review): the vocabulary is fitted on all titles before the train/test
# split below, so words that only occur in the test split still shape the
# feature space.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(news['title'])

# Splitting the Data into Training and Testing

In [6]:
# Hold out 30% of the samples for testing.
# A fixed `random_state` makes the shuffle deterministic, so the split (and
# every metric computed from it below) is reproducible across
# "Restart Kernel -> Run All" executions.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED
)

# Training SVM Model

In [13]:
from sklearn import svm

# Baseline model: a linear SVM with default hyperparameters, fitted on the
# training split (fit() returns the estimator itself, so it can be chained).
clf_svm = svm.LinearSVC().fit(X_train, y_train)

# Baseline predictions for the held-out test split.
predicted = clf_svm.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of test headlines the baseline LinearSVC labelled correctly.
accuracy_score(y_true=y_test, y_pred=predicted)

0.7924603174603174

# Hyperparameter Tuning

In [15]:
# Hyperparameter search space for the SVC grid search.
# C spans 1 .. 10000 in powers of ten; gamma (used by the RBF kernel only)
# spans 0.1 .. 0.00001 in powers of ten.
c_grid = [10 ** exponent for exponent in range(5)]
gamma_grid = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]

parameter_candidates = [
    # Linear kernel: only C is tuned.
    {'C': list(c_grid), 'kernel': ['linear']},
    # RBF kernel: every (C, gamma) combination is tried.
    {'C': list(c_grid), 'gamma': list(gamma_grid), 'kernel': ['rbf']},
]

In [16]:
# Exhaustive grid search over `parameter_candidates` with an SVC base
# estimator; n_jobs=-1 runs the candidate fits in parallel.
clf = GridSearchCV(svm.SVC(), parameter_candidates, n_jobs=-1)

# Run the search on the training split only. Kept as the cell's final
# expression so the notebook displays the fitted search object.
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'C': [1, 10, 100, 1000, 10000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000, 10000], 'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
# Cross-validated score of the best parameter combination found by the search.
best_cv_score = clf.best_score_
print('Best score for data1:', best_cv_score)

Best score for data1: 0.8088110222827012


In [22]:
# Predict labels for the held-out test split with the fitted grid search;
# `y_pred` is the input to the F1, confusion-matrix and classification-report
# cells below.
y_pred = clf.predict(X_test)

In [18]:
# Report the hyperparameters of the winning estimator from the grid search.
best_svc = clf.best_estimator_
print('Best C:', best_svc.C)
print('Best Kernel:', best_svc.kernel)
print('Best Gamma:', best_svc.gamma)

Best C: 10000
Best Kernel: rbf
Best Gamma: 1e-05


# Performance Metric

1. Accuracy

In [23]:
# Score the already-fitted grid search on the held-out test split and report
# the accuracy as a percentage. No new model is trained here.
svm_accuracy_pct = clf.score(X_test, y_test) * 100
print("Accuracy of SVM model:", svm_accuracy_pct)

Accuracy of SVM model: 81.26984126984127


2. F1 Score

In [24]:
from sklearn.metrics import f1_score

print("F1 Score for all the categories:")
# average=None returns one F1 value per class instead of a single aggregate.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

F1 Score for all the categories:


array([0.80994898, 0.71288889, 0.80487805, 0.84466835, 0.82089552])

3. Confusion Matrix

In [25]:
from sklearn.metrics import confusion_matrix

# Counts of (true label, predicted label) pairs on the test split:
# rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 635,   11,   29,   84,   53],
       [  17,  401,   36,  120,   20],
       [  30,   26,  726,   90,   60],
       [  35,   76,   30, 1509,   44],
       [  39,   17,   51,   76,  825]], dtype=int64)

4. Precision-Recall

In [26]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the tuned model's
# test-set predictions, followed by the averaged rows.
report_text = classification_report(y_test, y_pred)
print(report_text)

                    precision    recall  f1-score   support

 business  finance       0.84      0.78      0.81       812
             crime       0.76      0.68      0.71       594
     entertainment       0.83      0.78      0.80       932
          politics       0.80      0.89      0.84      1694
technology  health       0.82      0.82      0.82      1008

         micro avg       0.81      0.81      0.81      5040
         macro avg       0.81      0.79      0.80      5040
      weighted avg       0.81      0.81      0.81      5040



References
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f