# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Review category name -> URL of its gzipped JSON-lines archive
categories = dict(
    Luxury_Beauty='https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Luxury_Beauty.json.gz',
    AMAZON_FASHION='https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/AMAZON_FASHION.json.gz',
    All_Beauty='https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/All_Beauty.json.gz',
)

# Rows parsed per chunk while reading, and the cap on rows sampled per category
chunk_size, total_samples = 10_000, 50_000

def load_data(category_url, chunk_size, total_samples, random_state=None):
    """Read a JSON-lines dataset and return a random sample of its rows.

    The file is parsed ``chunk_size`` rows at a time, but every chunk is
    kept and concatenated, so the complete dataset is held in memory
    before sampling.

    Args:
        category_url: Path or URL of the JSON-lines file, passed straight
            to ``pd.read_json``.
        chunk_size: Number of rows parsed per chunk.
        total_samples: Maximum number of rows to return; if the dataset
            has fewer rows, all of them are returned (in shuffled order).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so
            the sample can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A DataFrame of at most ``total_samples`` rows. Row labels are
        positions in the full concatenated dataset (not reset after
        sampling). An empty DataFrame is returned if no chunks were read.
    """
    # With chunksize set, read_json returns a JsonReader that owns an open
    # handle; the context manager closes it once all chunks are consumed.
    with pd.read_json(category_url, lines=True, chunksize=chunk_size) as reader:
        frames = list(reader)

    # pd.concat raises on an empty list, so handle "no chunks" explicitly.
    if not frames:
        return pd.DataFrame()

    data = pd.concat(frames, ignore_index=True)

    # Cap the sample size at the number of rows actually available.
    sample_size = min(total_samples, len(data))
    return data.sample(sample_size, random_state=random_state)

# Training data comes from two categories: AMAZON_FASHION, then All_Beauty
train_category_1, train_category_2 = 'AMAZON_FASHION', 'All_Beauty'
train_data_1, train_data_2 = [
    load_data(categories[name], chunk_size, total_samples)
    for name in (train_category_1, train_category_2)
]

# Stack both categories into one frame and discard rows without review text
train_data = pd.concat(
    [train_data_1, train_data_2], ignore_index=True
).dropna(subset=['reviewText'])

# Hold out 20% of the combined rows for validation (fixed seed for the split)
train_data, validation_data = train_test_split(
    train_data, random_state=42, test_size=0.2
)

# The test set is a separate category (Luxury_Beauty)
test_category = 'Luxury_Beauty'
test_data = load_data(
    category_url=categories[test_category],
    chunk_size=chunk_size,
    total_samples=total_samples,
)

# Split each dataset into features (review text) and target (ratings).
# Missing review text is replaced with an empty string before vectorizing;
# for the train/validation splits this is only a safeguard, because rows
# without 'reviewText' were already dropped from the training frame.
X_train, y_train = train_data['reviewText'].fillna(''), train_data['overall']
X_validation, y_validation = (
    validation_data['reviewText'].fillna(''),
    validation_data['overall'],
)
X_test, y_test = test_data['reviewText'].fillna(''), test_data['overall']

# TF-IDF over at most 5000 terms with English stop words removed.
# The vocabulary is fitted on the training text only and then reused
# unchanged for the validation and test text.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_validation_tfidf, X_test_tfidf = (
    vectorizer.transform(X_validation),
    vectorizer.transform(X_test),
)

# Train Decision Tree model.
# The tree is seeded so repeated fits on the same training matrix give the
# same model: scikit-learn permutes candidate features at each split, so an
# unseeded tree can break ties differently from run to run. The seed matches
# the one used for the train/validation split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_tfidf, y_train)

# Validation set: held-out rows from the same categories as the training data
y_val_pred = model.predict(X_validation_tfidf)
accuracy_val = accuracy_score(y_validation, y_val_pred)
print(f'Validation Accuracy: {accuracy_val:.2f}')

# Test set: rows from a category the model was not trained on
y_test_pred = model.predict(X_test_tfidf)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.2f}')

# Per-class precision / recall / F1 on the test set
print('\nClassification Report (Test Set):')
print(classification_report(y_test, y_test_pred))