In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Import the Support Vector Classification
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    if pd.isnull(text):
        return ''
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_tokens)

def load_data(category_url, chunk_size, total_samples):
    chunks = pd.read_json(category_url, lines=True, chunksize=chunk_size)
    data = pd.concat([chunk for chunk in chunks], ignore_index=True)
    data = data.sample(min(total_samples, len(data)))
    return data

def preprocess_data(data):
    data = data.dropna(subset=['reviewText'])
    data['reviewText'] = data['reviewText'].apply(preprocess_text)
    return data

In [4]:
def train_svm(X_train_tfidf, y_train, param_grid):
    svm_model = SVC()  # Use the default parameters for now
    grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train_tfidf, y_train)
    best_params = grid_search.best_params_
    best_model = SVC(**best_params)
    best_model.fit(X_train_tfidf, y_train)
    return best_model

def evaluate_model(model, X, y, set_name):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(f'{set_name} Accuracy: {accuracy:.2f}')
    print(f'Classification Report ({set_name}):')
    print(classification_report(y, y_pred))


In [5]:
def main():
    categories = {
        'Luxury_Beauty': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Luxury_Beauty.json.gz',
        'AMAZON_FASHION': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/AMAZON_FASHION.json.gz',
        'All_Beauty': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/All_Beauty.json.gz'
    }
    total_samples = 1000
    chunk_size = 10000
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

    # Load and preprocess training data for AMAZON_FASHION and All_Beauty
    train_data_amazon_fashion = preprocess_data(load_data(categories['AMAZON_FASHION'], chunk_size, total_samples))
    train_data_all_beauty = preprocess_data(load_data(categories['All_Beauty'], chunk_size, total_samples))

    # Combine training data
    train_data = pd.concat([train_data_amazon_fashion, train_data_all_beauty], ignore_index=True)

    # Separate features (reviews) and target variable (ratings)
    X_train = train_data['reviewText']
    y_train = train_data['overall']

    # Convert text data to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # Split training data into training and validation sets
    X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

    # Train SVM model
    best_svm_model = train_svm(X_train_split, y_train_split, param_grid)

    # Print information about the best parameters
    print(f'Best Parameters: {best_svm_model.get_params()}')

    # Evaluate the SVM model on the validation set
    evaluate_model(best_svm_model, X_validation_split, y_validation_split, 'Validation Set')

    # Load and preprocess testing data for Luxury_Beauty
    test_data = preprocess_data(load_data(categories['Luxury_Beauty'], chunk_size, total_samples))

    # Separate features and target variable for testing data
    X_test = test_data['reviewText']
    y_test = test_data['overall']
    X_test_tfidf = vectorizer.transform(X_test)

    # Evaluate the SVM model on the test set
    evaluate_model(best_svm_model, X_test_tfidf, y_test, 'Test Set')

if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviewText'] = data['reviewText'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviewText'] = data['reviewText'].apply(preprocess_text)


Best Parameters: {'C': 1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Validation Set Accuracy: 0.61
Classification Report (Validation Set):
              precision    recall  f1-score   support

           1       0.75      0.32      0.44        57
           2       0.00      0.00      0.00        25
           3       0.31      0.14      0.19        36
           4       0.55      0.09      0.16        64
           5       0.62      0.99      0.76       218

    accuracy                           0.61       400
   macro avg       0.45      0.31      0.31       400
weighted avg       0.56      0.61      0.52       400

Test Set Accuracy: 0.69
Classification Report (Test Set):
              precision    recall  f1-score   support

           1       0.61      0