In [62]:
#import libraries
!pip install shap
import shap
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.ensemble
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
from nltk.tokenize import word_tokenize
import time

plot_shap=True
shap.initjs()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Importing Data

###20 newsgroups

In [63]:
#category selection
categories = ['alt.atheism', 'soc.religion.christian']

#fetching data
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)


In [64]:
#putting data in dataframe
newsgroups_train_df = pd.DataFrame({'Text' : newsgroups_train.data})
newsgroups_train_df['Target'] = newsgroups_train.target

newsgroups_test_df = pd.DataFrame({'Text' : newsgroups_test.data})
newsgroups_test_df['Target'] = newsgroups_test.target

## 1 - atheist, 0 - christian
newsgroups_train_df.head()

Unnamed: 0,Text,Target
0,From: nigel.allen@canrem.com (Nigel Allen)\nSu...,1
1,From: marshall@csugrad.cs.vt.edu (Kevin Marsha...,0
2,From: tedr@athena.cs.uga.edu (Ted Kalivoda)\nS...,1
3,From: keith@cco.caltech.edu (Keith Allan Schne...,0
4,From: mayne@ds3.scri.fsu.edu (Bill Mayne)\nSub...,1


###IMDB

In [65]:
imdb_df = pd.read_csv("/content/IMDB Dataset.csv", usecols=["review", "sentiment"], encoding='latin-1')
## 1 - positive, 0 - negative
imdb_df.sentiment = (imdb_df.sentiment == "positive").astype("int")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [66]:
#traintest split
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(imdb_df["review"], imdb_df["sentiment"], test_size=0.2)

#Data Cleaning/Transformation

In [67]:
#depricated
def replace_unwanted_chars(text):
    #replacing any unwanted chars
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

In [68]:
from string import punctuation
import re

#cleaning data method
def clean(text_list):
  clean_data = []
  for text in text_list:
    #lowercase
    text=text.lower()
    #remove non-alphanumeric
    text = re.sub('\W+',' ', text )
    clean_data.append(text)

  return clean_data

In [93]:
news_vectorizer = TfidfVectorizer(min_df=0.001)
imdb_vectorizer = TfidfVectorizer(min_df=0.001)

###20 newsgroups

In [100]:
newsgroups_train_df["Clean_Text"] = clean(newsgroups_train_df["Text"])
newsgroups_test_df["Clean_Text"] = clean(newsgroups_test_df["Text"])

start = time.time()
X_train_news_vector = news_vectorizer.fit_transform(newsgroups_train_df["Clean_Text"])
X_test_news_vector = news_vectorizer.transform(newsgroups_test_df["Clean_Text"])
stop = time.time()

news_vectorization_time = (stop-start)

X_train_news_vector[0]

<1x10709 sparse matrix of type '<class 'numpy.float64'>'
	with 250 stored elements in Compressed Sparse Row format>

###IMDB

In [101]:
clean_X_train_imdb = clean(X_train_imdb)
clean_X_text_imdb = clean(X_test_imdb)

start = time.time()
X_train_imdb_vector = imdb_vectorizer.fit_transform(clean_X_train_imdb)
X_test_imdb_vector = imdb_vectorizer.transform(clean_X_text_imdb)
stop = time.time()

imdb_vectorization_time = (stop-start)

X_train_imdb_vector[0]

<1x10206 sparse matrix of type '<class 'numpy.float64'>'
	with 87 stored elements in Compressed Sparse Row format>

#Training 

###20 newsgroups

In [102]:
news_model = SGDClassifier(loss="log", penalty="l2")
start = time.time()
news_model.fit(X_train_news_vector, newsgroups_train_df["Target"])
stop = time.time()

news_train_time = (stop-start)

###IMDB

In [103]:
imdb_model = SGDClassifier(loss="log", penalty="l2")
start = time.time()
imdb_model.fit(X_train_imdb_vector, y_train_imdb)
stop = time.time()

imdb_train_time = (stop-start)

#Results and Visualization

In [104]:
import random
def get_rand(limit):
  return random.randint(0, limit)

###20 newsgroups

In [116]:
#time taken for vectorization and training
print(news_vectorization_time + news_train_time)

0.49720215797424316


In [106]:
#accuracy
y_pred_news = news_model.predict(X_test_news_vector)
news_acc = sklearn.metrics.accuracy_score(y_pred_news, newsgroups_test_df["Target"])
mis_num = round(len(newsgroups_test_df["Target"])*(1-news_acc))
print("The accuracy of a linear model using stochiastic gradient descent on the 20newsgroups dataset is: ", news_acc)
print("The number of misclassified instances is: ", mis_num)

The accuracy of a linear model using stochiastic gradient descent on the 20newsgroups dataset is:  0.9316596931659693
The number of misclassified instances is:  49


In [107]:
#visualization preparation
news_test_array = X_test_news_vector.toarray()

news_explainer = shap.LinearExplainer(news_model, X_train_news_vector, feature_perturbation="interventional")
news_shap_values = news_explainer.shap_values(X_test_news_vector)

The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


In [108]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(newsgroups_test_df["Target"]))

print("True Value: ", "athiest" if newsgroups_test_df["Target"][vis_index]==0 else "christian")
print("Predicted Value: ", "athiest" if y_pred_news[vis_index]==0 else "christian")
shap.initjs()
shap.force_plot(
    news_explainer.expected_value, news_shap_values[vis_index,:], news_test_array[vis_index,:],
    feature_names=news_vectorizer.get_feature_names_out()
)

True Value:  christian
Predicted Value:  christian


In [109]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(newsgroups_test_df["Target"]))

print("True Value: ", "athiest" if newsgroups_test_df["Target"][vis_index]==0 else "christian")
print("Predicted Value: ", "athiest" if y_pred_news[vis_index]==0 else "christian")
shap.initjs()
shap.force_plot(
    news_explainer.expected_value, news_shap_values[vis_index,:], news_test_array[vis_index,:],
    feature_names=news_vectorizer.get_feature_names_out()
)

True Value:  christian
Predicted Value:  christian


###IMDB

In [117]:
#time taken for vectorization and training
print(imdb_vectorization_time + imdb_train_time)

8.95344591140747


In [111]:
#accuracy
y_pred_imdb = imdb_model.predict(X_test_imdb_vector)
news_acc = sklearn.metrics.accuracy_score(y_pred_imdb, y_test_imdb)
mis_num = round(len(y_test_imdb)*(1-news_acc))
print("The accuracy of a linear model using stochiastic gradient descent on the IMDB dataset is: ", news_acc)
print("The number of misclassified instances is: ", mis_num)

The accuracy of a linear model using stochiastic gradient descent on the IMDB dataset is:  0.8796
The number of misclassified instances is:  1204


In [None]:
#resetting index for y_test_imdb
new_index = range(len(y_test_imdb))

y_test_imdb.index = new_index
y_test_imdb

In [113]:
#visualization preparation
imdb_test_array = X_test_imdb_vector.toarray()

imdb_explainer = shap.LinearExplainer(imdb_model, X_train_imdb_vector, feature_perturbation="interventional")
imdb_shap_values = imdb_explainer.shap_values(X_test_imdb_vector)

The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


In [114]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(y_test_imdb))
print(vis_index)
print("True Value: ", "Negative" if y_test_imdb[vis_index]==0 else "Positive")
print("Predicted Value: ", "Negative" if y_pred_imdb[vis_index]==0 else "Positive")
shap.initjs()
shap.force_plot(
    imdb_explainer.expected_value, imdb_shap_values[vis_index,:], imdb_test_array[vis_index,:],
    feature_names=imdb_vectorizer.get_feature_names_out()
)

3252
True Value:  Negative
Predicted Value:  Negative


In [115]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(y_test_imdb))
print(vis_index)
print("True Value: ", "Negative" if y_test_imdb[vis_index]==0 else "Positive")
print("Predicted Value: ", "Negative" if y_pred_imdb[vis_index]==0 else "Positive")
shap.initjs()
shap.force_plot(
    imdb_explainer.expected_value, imdb_shap_values[vis_index,:], imdb_test_array[vis_index,:],
    feature_names=imdb_vectorizer.get_feature_names_out()
)

2579
True Value:  Positive
Predicted Value:  Negative


#Recap

The vectorization and training time on the 20newsgroups dataset is 0.497 seconds.

The accuracy of a SGDClassifier on the 20newsgroups dataset is 93.31%.

The vectorization and training time on the IMDb dataset is 8.953 seconds.

The accuracy of a SGDClassifier on the IMDb dataset is 88.48%.

