# Import statements

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
news = pd.read_csv('newdatasetfinal.csv')
news.head()

Unnamed: 0,title,NewCategory
0,Bitcoin is down 60 percent this year. Here's w...,Business & Finance
1,6 health problems marijuana could treat better...,Crime
2,9 charts that explain the history of global we...,Business & Finance
3,Remember when legal marijuana was going to sen...,Crime
4,Obamacare succeeded for one simple reason: it'...,Technology & Health


In [3]:
news['NewCategory'].unique() # unique category labels

array(['Business & Finance', 'Crime', 'Technology & Health', 'Politics',
       'Entertainment'], dtype=object)

In [4]:
news['title'] = news['title'].str.replace('[^\w\s]','').str.lower() # unpunctuate and lower case
news['NewCategory'] = news['NewCategory'].str.replace('[^\w\s]','').str.lower() # unpunctuate and lower case

# Vectorization

In [5]:
# convert data to vectors
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(news['title'])

y = news['NewCategory']

# Spiltting the Data into Training and Testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 30% split

# Training SGD Classifier

In [7]:
sgd = SGDClassifier(n_jobs=-1, penalty='l2', n_iter=500, random_state=1234)

# hyperparameters for tuning
sgd_grid = [{'loss': ['hinge', 'log', 'squared_hinge'],
             'alpha': [0.0001, 0.0001, 0.00001]}]

# grid search with cross validation
sgd_search = GridSearchCV(estimator=sgd, param_grid=sgd_grid, cv=10, refit=True)
sgd_search.fit(X_train, y_train)







GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=500, n_iter_no_change=5, n_jobs=-1, penalty='l2',
       power_t=0.5, random_state=1234, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'loss': ['hinge', 'log', 'squared_hinge'], 'alpha': [0.0001, 0.0001, 1e-05]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
y_pred = sgd_search.predict(X_test)


In [9]:
sgd_search.best_estimator_.score(X_test, y_test)

0.8248015873015873

# Hyperparameter Tuning

In [20]:
sgd = SGDClassifier(n_jobs=-1, penalty='elasticnet', n_iter=500, random_state=1234 )

# hyperparameters for tuning
sgd_grid = [{'loss': ['hinge', 'log', 'squared_hinge'],
             'alpha': [0.0001, 0.0001, 0.00001]}]

# grid search with cross validation
sgd_search = GridSearchCV(estimator=sgd, param_grid=sgd_grid, cv=50, refit=True)
sgd_search.fit(X_train, y_train)





























GridSearchCV(cv=50, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=500, n_iter_no_change=5, n_jobs=-1, penalty='elasticnet',
       power_t=0.5, random_state=1234, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'loss': ['hinge', 'log', 'squared_hinge'], 'alpha': [0.0001, 0.0001, 1e-05]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
y_pred = sgd_search.predict(X_test)

# Performance Metric

1. Accuracy

In [27]:
# Train a new classifier using the best parameters found by the grid search
print("Accuracy of SGD Classifier model:", sgd_search.best_estimator_.score(X_test, y_test)*100)

Accuracy of SGD Classifier model: 82.34126984126983


2. F1 Score

In [25]:
from sklearn.metrics import f1_score
print("F1 Score for all the categories:")
f1_score(y_test, y_pred, average=None)

F1 Score for all the categories:


array([0.82772021, 0.70049751, 0.82015168, 0.85980254, 0.82056075])

3. Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 639,    9,   36,   58,   43],
       [  15,  352,   44,   97,   33],
       [  27,   17,  757,   67,   59],
       [  42,   64,   30, 1524,   64],
       [  36,   22,   52,   75,  878]], dtype=int64)

4. Precision-Recall

In [29]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

 business  finance       0.84      0.81      0.83       785
             crime       0.76      0.65      0.70       541
     entertainment       0.82      0.82      0.82       927
          politics       0.84      0.88      0.86      1724
technology  health       0.82      0.83      0.82      1063

         micro avg       0.82      0.82      0.82      5040
         macro avg       0.82      0.80      0.81      5040
      weighted avg       0.82      0.82      0.82      5040



References
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
