SVM

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset path used when
# loading the CSV lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Load the Dataset

In [None]:
import pandas as pd

# CSV with the Kindle reviews, stored on the mounted Google Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/PreProcessing/all_kindle_review.csv'

# Parse the comma-separated file, then remove exact duplicate rows in place.
data = pd.read_csv(file_path, sep=',', encoding='utf-8')
data.drop_duplicates(inplace=True)

# Lift pandas' column cap so the preview shows every column.
pd.options.display.max_columns = None
print(data.head(5))

In [None]:
# Sanity check: compare the rows held in `data` with the number of records
# that are physically present in the CSV file.
import csv
import subprocess  # no longer used by this cell; kept in case other cells rely on it

num_rows_read = data.shape[0]
print(f"Number of rows read: {num_rows_read}")

# Count CSV *records* rather than physical lines: a quoted review that
# contains line breaks is a single record but several lines, so `wc -l`
# over-counts (and is unavailable outside Unix-like systems). Blank lines are
# skipped, and the header record is subtracted.
with open(file_path, newline='', encoding='utf-8') as csv_file:
    num_lines_file = sum(1 for record in csv.reader(csv_file) if record) - 1
print(f"Number of data records in file (excluding header): {num_lines_file}")

if num_rows_read == num_lines_file:
    print("All rows appear to have been read successfully.")
else:
    # `data` was de-duplicated right after loading, so a positive difference
    # is expected whenever the file contains duplicate rows.
    print("Row count differs from the file: duplicates were dropped after loading, or some rows were not read.")
    print(f"Difference: {num_lines_file - num_rows_read}")

Preprocessing the text data

In [None]:
# Convert star ratings into three sentiment categories
def classify_sentiment(rating, default="Neutral"):
    """Map a star rating onto a sentiment label.

    Args:
        rating: The rating as a number or numeric string (e.g. 4, 4.0, "4",
            "4.0"). Fractional ratings are truncated toward zero before
            classification.
        default: Label returned when the rating is missing or cannot be
            parsed (None, NaN, infinity, non-numeric text). Defaults to
            "Neutral" so every fallback uses one of the three regular labels.

    Returns:
        "Negative" for ratings <= 2, "Neutral" for 3, "Positive" for anything
        higher, or `default` when the rating is unusable.
    """
    try:
        # float() first so strings such as "4.0" parse; int() then truncates.
        rating = int(float(rating))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: NaN or non-numeric
        # text; OverflowError: infinity.
        return default
    if rating <= 2:
        return "Negative"
    if rating == 3:
        return "Neutral"
    return "Positive"
# Preliminary label vector; `y` is assigned again from the 'sentiment' column
# once the features have been built.
y = data['rating'].map(classify_sentiment)

In [None]:
# Install spaCy into the environment of the running kernel.
%pip install -q spacy

In [None]:
import spacy
import re
# Load spaCy's "en_core_web_sm" pipeline once at module level;
# clean_and_tokenize reuses this object for every review.
nlp = spacy.load("en_core_web_sm")

def clean_and_tokenize(text):
    """Normalise a review and split it into tokens.

    Args:
        text: Raw review text. Any non-string value (None, NaN, numbers) is
            treated as "no text".

    Returns:
        A list of token strings. The list is empty for non-string input or
        for text that contains nothing but punctuation/whitespace, so the
        return type is always a list.
    """
    if not isinstance(text, str):
        return []
    # Convert text to lowercase
    text = text.lower()
    # Replace every non-word character (anything other than letters, digits
    # and underscore) with a space
    text = re.sub(r'\W', ' ', text)
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Only the token texts are used, so run just the tokenizer instead of the
    # full pipeline (tagger, parser, NER, ...), which is far slower.
    doc = nlp.make_doc(text)
    return [token.text for token in doc]

# Drop rows that lack a rating or review text *before* tokenizing. Doing it
# afterwards has no effect: str(NaN) becomes the literal token "nan", and a
# token list is never NaN, so no row would ever be removed.
data.dropna(subset=['rating', 'reviewText'], inplace=True)

# Tokenize every review; astype(str) covers stray non-string cells
# (e.g. a review that was parsed as a number).
data['tokens'] = data['reviewText'].astype(str).apply(clean_and_tokenize)

# Map the star rating onto the sentiment classes returned by classify_sentiment
data['sentiment'] = data['rating'].apply(classify_sentiment)

# Display the preprocessed data with tokens
print(data[['reviewText', 'tokens', 'sentiment']].head())

In [None]:
import nltk
# Download NLTK's 'punkt_tab' resource.
# NOTE(review): presumably required by word_tokenize, which the
# feature-extraction cell imports — confirm.
nltk.download('punkt_tab')

Feature Extraction using Word2Vec

In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every row holds a list of token strings. Rows that are already
# token lists are used as they are: casting a list to str and re-tokenizing it
# would turn the brackets, quotes and commas of its repr into "words".
# Anything that is not a list/tuple is tokenized from its text instead.
data['tokens'] = data['tokens'].apply(
    lambda tokens: list(tokens) if isinstance(tokens, (list, tuple)) else word_tokenize(str(tokens))
)

# Train a Word2Vec model on the tokenized text (a plain list of token lists,
# which can be iterated repeatedly during training)
word2vec_model = Word2Vec(sentences=data['tokens'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average word2vec vector for a review
def get_avg_word2vec(tokens, model, vector_size):
    """Average the embeddings of all in-vocabulary tokens of one review.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing `wv`, a mapping that supports `word in wv` and
            `wv[word]` (e.g. a trained gensim Word2Vec model).
        vector_size: Length of the zero vector returned when no token is in
            the vocabulary.

    Returns:
        A 1-D numpy array: the element-wise mean of the token vectors, or a
        float32 zero vector of length `vector_size` when none of the tokens
        is known. Both cases return an ndarray, so stacked rows share one type.
    """
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Turn every review into its averaged embedding and stack the results
import numpy as np
review_vectors = [
    get_avg_word2vec(review_tokens, word2vec_model, 100)
    for review_tokens in data['tokens']
]
X = np.array(review_vectors)
# Sentiment labels, in the same row order as X
y = data['sentiment'].values

Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Train the SVM Model

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Two-step pipeline: standardize each feature, then classify with a linear-kernel SVM
feature_scaler = StandardScaler()
linear_svc = SVC(kernel='linear')
svm_model = make_pipeline(feature_scaler, linear_svc)

# Fit scaler and classifier together on the training split
svm_model.fit(X_train, y_train)

Evaluate the Model

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Label the held-out reviews with the trained pipeline
y_pred = svm_model.predict(X_test)

# Per-class precision / recall / F1 table
report = classification_report(y_test, y_pred)

# Print overall accuracy first, then the detailed report
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(report)

In [None]:
# Download NLTK's 'stopwords' corpus so stopwords.words(...) can be read.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stop words, held in a set.
# NOTE(review): stop_words is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
stop_words = set(stopwords.words('english'))

In [None]:
# Install SHAP into the environment of the running kernel.
%pip install -q shap

In [None]:
import numpy as np
import shap

# Define a custom explainer
class CustomSVMExplainer(shap.Explainer):
    """Mean-ablation feature attribution for a fitted classifier.

    For every sample and feature, the feature is replaced by its mean over the
    explained data and the resulting change in the model's numeric score is
    recorded. This is a simple sensitivity measure laid out like SHAP values
    (one value per sample and feature); it is not an exact Shapley estimate.

    Args:
        model: Fitted estimator exposing `predict` and, preferably,
            `decision_function` or `predict_proba`.
        feature_names: Names of the input features, forwarded to
            `shap.Explainer`.
    """

    def __init__(self, model, feature_names):
        # Call the parent class's constructor
        super().__init__(model.predict, feature_names=feature_names)
        # Keep the estimator itself so its scoring methods can be called.
        self.model = model

    def _scores(self, X):
        """Return numeric model outputs as a float array of shape (n_samples, n_outputs)."""
        if hasattr(self.model, "decision_function"):
            raw = self.model.decision_function(X)
        elif hasattr(self.model, "predict_proba"):
            raw = self.model.predict_proba(X)
        else:
            # Only meaningful for numeric predictions; string class labels
            # make the float conversion below raise ValueError.
            raw = self.model.predict(X)
        scores = np.asarray(raw, dtype=float)
        if scores.ndim == 1:
            # Binary decision functions / regressors return one score per row.
            scores = scores[:, np.newaxis]
        return scores

    def shap_values(self, X):
        """Compute mean-ablation attributions for every row of X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Float array of shape (n_samples, n_features). Entry [j, i] is the
            score of sample j minus its score after feature i was replaced by
            the column mean, measured on the output with the highest original
            score for that sample. A positive value means the feature's actual
            value raises that score.
        """
        X = np.asarray(X, dtype=float)
        n_samples, n_features = X.shape
        attributions = np.zeros((n_samples, n_features))
        if n_samples == 0:
            return attributions

        # Class labels are strings and cannot be subtracted, so work on the
        # model's numeric scores instead of `predict` output.
        rows = np.arange(n_samples)
        base_scores = self._scores(X)
        target = base_scores.argmax(axis=1)  # output column tracked per sample
        base = base_scores[rows, target]
        column_means = X.mean(axis=0)

        # One batched scoring call per feature: each row's score depends only
        # on that row, so this equals perturbing the samples one at a time.
        for i in range(n_features):
            X_ablated = X.copy()
            X_ablated[:, i] = column_means[i]
            attributions[:, i] = base - self._scores(X_ablated)[rows, target]

        return attributions

# Instantiate the explainer for the trained pipeline.
# One placeholder name per column of the training matrix: "Feature 1", "Feature 2", ...
n_feature_columns = X_train.shape[1]
feature_names = [f'Feature {i + 1}' for i in range(n_feature_columns)]
custom_explainer = CustomSVMExplainer(model=svm_model, feature_names=feature_names)

In [None]:
# Dimensions of the training portion of the 80:20 split
num_train_rows = X_train.shape[0]
num_train_columns = X_train.shape[1]
print(f"Training dataset - Number of rows: {num_train_rows}, Number of columns: {num_train_columns}")

# Dimensions of the held-out test portion
num_test_rows = X_test.shape[0]
num_test_columns = X_test.shape[1]
print(f"Testing dataset - Number of rows: {num_test_rows}, Number of columns: {num_test_columns}")


In [None]:
# Attribute the test-set predictions to the individual features
shap_values = custom_explainer.shap_values(X_test)

# Summary plot of the attributions against the feature values
shap.summary_plot(shap_values, features=X_test)

In [None]:
# Explain a single prediction with SHAP's model-agnostic KernelExplainer.

# KernelExplainer is given predict_proba, which SVC only offers when it is
# fitted with probability=True, so switch it on and refit the pipeline.
svm_model.set_params(svc__probability=True)
svm_model.fit(X_train, y_train)

# Use a small random sample of the (untransformed) training data as the
# background set. KernelExplainer evaluates the model on every background row
# for each perturbation, so passing all of X_train is impractically slow.
background = shap.sample(X_train, 100, random_state=42)
explainer = shap.KernelExplainer(svm_model.predict_proba, background)

# The sample to explain must have the same features as X_train. No
# `custom_input` is defined elsewhere in this notebook, so default to the
# first test review; replace it with any (1, n_features) array built the same way.
custom_input = X_test[:1]

# Calculate SHAP values for the custom input
shap_values = explainer.shap_values(custom_input)

# Position of the explained class in the model's class ordering.
class_index = 1
# Depending on the SHAP version the result is either a list with one
# (n_samples, n_features) array per class, or a single array with the classes
# on the last axis; handle both.
if isinstance(shap_values, list):
    class_shap_values = shap_values[class_index]
else:
    class_shap_values = np.asarray(shap_values)[..., class_index]

# Visualize SHAP values
shap.initjs()
shap.force_plot(explainer.expected_value[class_index], class_shap_values, custom_input)