In [5]:
#import libraries
!pip install shap
import shap
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.ensemble
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
from nltk.tokenize import word_tokenize

plot_shap=True
shap.initjs()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Importing Data

##20 newsgroups

In [6]:
#category selection
categories = ['alt.atheism', 'soc.religion.christian']

#fetching data
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)


In [7]:
#putting data in dataframe
newsgroups_train_df = pd.DataFrame({'Text' : newsgroups_train.data})
newsgroups_train_df['Target'] = newsgroups_train.target

newsgroups_test_df = pd.DataFrame({'Text' : newsgroups_test.data})
newsgroups_test_df['Target'] = newsgroups_test.target

## 1 - atheist, 0 - christian
newsgroups_train_df.head()

Unnamed: 0,Text,Target
0,From: nigel.allen@canrem.com (Nigel Allen)\nSu...,1
1,From: marshall@csugrad.cs.vt.edu (Kevin Marsha...,0
2,From: tedr@athena.cs.uga.edu (Ted Kalivoda)\nS...,1
3,From: keith@cco.caltech.edu (Keith Allan Schne...,0
4,From: mayne@ds3.scri.fsu.edu (Bill Mayne)\nSub...,1


###IMDB

In [8]:
imdb_df = pd.read_csv("/content/IMDB Dataset.csv", usecols=["review", "sentiment"], encoding='latin-1')
## 1 - positive, 0 - negative
imdb_df.sentiment = (imdb_df.sentiment == "positive").astype("int")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
#traintest split
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(imdb_df["review"], imdb_df["sentiment"], test_size=0.2)

#Data Cleaning/Transformation

In [10]:
def replace_unwanted_chars(text):
    #replacing any unwanted chars
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

In [11]:
news_vectorizer = TfidfVectorizer(min_df=0.001)
imdb_vectorizer = TfidfVectorizer(min_df=0.001)

###20 newsgroups

In [12]:
newsgroups_train_df["Clean_Text"] = newsgroups_train_df["Text"].apply(replace_unwanted_chars)
newsgroups_test_df["Clean_Text"] = newsgroups_test_df["Text"].apply(replace_unwanted_chars)

X_train_news_vector = news_vectorizer.fit_transform(newsgroups_train_df["Clean_Text"])
X_test_news_vector = news_vectorizer.transform(newsgroups_test_df["Clean_Text"])

X_train_news_vector[0]

<1x10723 sparse matrix of type '<class 'numpy.float64'>'
	with 249 stored elements in Compressed Sparse Row format>

###IMDB

In [14]:
clean_X_train_imdb = X_train_imdb.apply(replace_unwanted_chars)
clean_X_text_imdb = X_test_imdb.apply(replace_unwanted_chars)

X_train_imdb_vector = imdb_vectorizer.fit_transform(clean_X_train_imdb)
X_test_imdb_vector = imdb_vectorizer.transform(clean_X_text_imdb)

X_train_imdb_vector[0]

<1x10125 sparse matrix of type '<class 'numpy.float64'>'
	with 161 stored elements in Compressed Sparse Row format>

#Training 

###20 newsgroups

In [15]:
news_model = SGDClassifier(loss="log", penalty="l2")
news_model.fit(X_train_news_vector, newsgroups_train_df["Target"])


SGDClassifier(loss='log')

###IMDB

In [16]:
imdb_model = SGDClassifier(loss="log", penalty="l2")
imdb_model.fit(X_train_imdb_vector, y_train_imdb)


SGDClassifier(loss='log')

#Results and Visualization

In [17]:
import random
def get_rand(limit):
  return random.randint(0, limit)

###20 newsgroups

In [18]:
#accuracy
y_pred_news = news_model.predict(X_test_news_vector)
news_acc = sklearn.metrics.accuracy_score(y_pred_news, newsgroups_test_df["Target"])
mis_num = round(len(newsgroups_test_df["Target"])*(1-news_acc))
print("The accuracy of a linear model using stochiastic gradient descent on the 20newsgroups dataset is: ", news_acc)
print("The number of misclassified instances is: ", mis_num)

The accuracy of a linear model using stochiastic gradient descent on the 20newsgroups dataset is:  0.9149232914923291
The number of misclassified instances is:  61


In [19]:
#visualization preparation
news_test_array = X_test_news_vector.toarray()

news_explainer = shap.LinearExplainer(news_model, X_train_news_vector, feature_perturbation="interventional")
news_shap_values = news_explainer.shap_values(X_test_news_vector)

The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


In [20]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(newsgroups_test_df["Target"]))

print("True Value: ", "athiest" if newsgroups_test_df["Target"][vis_index]==0 else "christian")
print("Predicted Value: ", "athiest" if y_pred_news[vis_index]==0 else "christian")
shap.initjs()
shap.force_plot(
    news_explainer.expected_value, news_shap_values[vis_index,:], news_test_array[vis_index,:],
    feature_names=news_vectorizer.get_feature_names_out()
)

True Value:  athiest
Predicted Value:  athiest


In [21]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(newsgroups_test_df["Target"]))

print("True Value: ", "athiest" if newsgroups_test_df["Target"][vis_index]==0 else "christian")
print("Predicted Value: ", "athiest" if y_pred_news[vis_index]==0 else "christian")
shap.initjs()
shap.force_plot(
    news_explainer.expected_value, news_shap_values[vis_index,:], news_test_array[vis_index,:],
    feature_names=news_vectorizer.get_feature_names_out()
)

True Value:  christian
Predicted Value:  christian


###IMDB

In [22]:
#accuracy
y_pred_imdb = imdb_model.predict(X_test_imdb_vector)
news_acc = sklearn.metrics.accuracy_score(y_pred_imdb, y_test_imdb)
mis_num = round(len(y_test_imdb)*(1-news_acc))
print("The accuracy of a linear model using stochiastic gradient descent on the IMDB dataset is: ", news_acc)
print("The number of misclassified instances is: ", mis_num)

The accuracy of a linear model using stochiastic gradient descent on the IMDB dataset is:  0.8815
The number of misclassified instances is:  1185


In [23]:
#resetting index for y_test_imdb
new_index = range(len(y_test_imdb))

y_test_imdb.index = new_index
y_test_imdb

0       1
1       0
2       0
3       0
4       1
       ..
9995    0
9996    0
9997    0
9998    1
9999    1
Name: sentiment, Length: 10000, dtype: int64

In [24]:
#visualization preparation
imdb_test_array = X_test_imdb_vector.toarray()

imdb_explainer = shap.LinearExplainer(imdb_model, X_train_imdb_vector, feature_perturbation="interventional")
imdb_shap_values = imdb_explainer.shap_values(X_test_imdb_vector)

The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


In [25]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(y_test_imdb))
print(vis_index)
print("True Value: ", "Negative" if y_test_imdb[vis_index]==0 else "Positive")
print("Predicted Value: ", "Negative" if y_pred_imdb[vis_index]==0 else "Positive")
shap.initjs()
shap.force_plot(
    imdb_explainer.expected_value, imdb_shap_values[vis_index,:], imdb_test_array[vis_index,:],
    feature_names=imdb_vectorizer.get_feature_names_out()
)

5726
True Value:  Negative
Predicted Value:  Negative


In [26]:
#copy paste this section to get multiple visualizations
vis_index = get_rand(len(y_test_imdb))
print(vis_index)
print("True Value: ", "Negative" if y_test_imdb[vis_index]==0 else "Positive")
print("Predicted Value: ", "Negative" if y_pred_imdb[vis_index]==0 else "Positive")
shap.initjs()
shap.force_plot(
    imdb_explainer.expected_value, imdb_shap_values[vis_index,:], imdb_test_array[vis_index,:],
    feature_names=imdb_vectorizer.get_feature_names_out()
)

6680
True Value:  Positive
Predicted Value:  Positive
