# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np


# NLTK data needed by the text preprocessing in this script: the Punkt
# tokenizer models (used by word_tokenize) and the English stop-word list.
# 'punkt_tab' is the Punkt data package that newer NLTK releases look up;
# it is requested alongside the legacy 'punkt' package so the script works
# on either side of that change. NOTE(review): on releases that do not know
# 'punkt_tab' the download is expected to just report the miss - confirm.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def load_random_sample(file_path, chunk_size, total_samples, random_state=42):
    """Load up to ``total_samples`` shuffled rows from a JSON Lines file.

    The file is read in chunks of ``chunk_size`` rows. From every chunk,
    ``min(len(chunk), rows still needed)`` rows are drawn with
    ``DataFrame.sample``, and reading stops as soon as enough rows have been
    collected. The returned rows therefore always come from the leading
    chunks of the file; this is not a uniform sample over the whole file.

    Args:
        file_path: Path of the JSON Lines file (one JSON object per line).
        chunk_size: Number of lines read per chunk.
        total_samples: Maximum number of rows to return.
        random_state: Seed passed to ``DataFrame.sample`` for every chunk.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index holding the collected rows.
        It is empty when nothing could be read. Errors are printed instead of
        raised, and whatever was collected before the error is still returned.
    """
    sampled_chunks = []
    total_rows = 0

    try:
        # The context manager closes the underlying file even when the loop
        # stops early, which a bare iteration over the reader does not do.
        with pd.read_json(file_path, lines=True, chunksize=chunk_size) as reader:
            for chunk in reader:
                print(f"Processing chunk: Total rows processed so far: {total_rows}")
                remaining = total_samples - total_rows
                sample_chunk = chunk.sample(n=min(len(chunk), remaining), random_state=random_state)
                sampled_chunks.append(sample_chunk)
                total_rows += len(sample_chunk)
                if total_rows >= total_samples:
                    break
    except Exception as e:
        # Deliberately best-effort: report the problem and keep the rows
        # gathered so far rather than aborting the caller.
        print(f"Error occurred: {e}")
        print(f"Error occurred after processing {total_rows} rows.")

    # Concatenate once at the end instead of growing a frame inside the loop.
    if sampled_chunks:
        sample = pd.concat(sampled_chunks, ignore_index=True)
    else:
        sample = pd.DataFrame()

    if sample.empty:
        print(f"No data loaded from file: {file_path}")
    return sample

# Sampling configuration
total_samples = 1000  # Maximum number of reviews loaded from each category file
chunk_size = 1000     # Rows read per chunk; may be the same as total_samples

# Directory that holds one review file per category. Keeping it in a single
# place means moving the data only requires changing this line.
DATA_DIR = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2'

# Categories to load. Each name is also the stem of its file inside DATA_DIR.
category_names = [
    # 'Luxury_Beauty',  # currently excluded from the run
    'AMAZON_FASHION',
    'Clothing_Shoes_and_Jewelry',
    'All_Beauty',
    'Appliances',
    'Toys_and_Games',
    'Arts_Crafts_and_Sewing',
    'Grocery_and_Gourmet_Food',
    'Tools_and_Home_Improvement',
]

# Mapping of category name -> path of its review file
categories = {name: f'{DATA_DIR}/{name}.json' for name in category_names}

# Load a sample from every category, then combine them in one concat call.
# Empty results (e.g. an unreadable file) are left out so they cannot
# influence the combined frame.
category_frames = []
for category, path in categories.items():
    df = load_random_sample(path, chunk_size, total_samples)
    if not df.empty:
        category_frames.append(df)

if category_frames:
    all_samples = pd.concat(category_frames, ignore_index=True)
else:
    all_samples = pd.DataFrame()



# Preprocess the data as needed
# Matches an HTML-style tag: '<', then as little as possible, then '>'.
_HTML_TAG_RE = re.compile(r'<.*?>')
# Matches any character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
# Filled on first use so the stop-word list is read and the stemmer is built
# once, instead of once per document.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, creating it once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean one review text and return it as a string of stemmed tokens.

    Steps, in order: strip HTML-style tags, strip punctuation, tokenize with
    NLTK's ``word_tokenize``, lowercase, drop English stop words, and apply
    Porter stemming.

    Args:
        text: The raw text of a single document (a ``str``).

    Returns:
        The remaining stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    stop_words, stemmer = _text_resources()

    # Cleaning: remove tags first, then every remaining punctuation character
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)

    # Tokenize and normalize case before the stop-word lookup
    tokens = (token.lower() for token in word_tokenize(text))

    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


# 'reviewText' is assumed to hold the review text and 'overall' the rating.
# Rows missing either value are dropped first: the vectorizer rejects NaN
# documents, and a missing rating cannot serve as a label.
labelled_samples = all_samples.dropna(subset=['reviewText', 'overall'])
X = labelled_samples['reviewText']
y = labelled_samples['overall']

# Hold out 20% of the rows for testing and train on the remaining 80%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features (using the custom text preprocessor) feeding a logistic
# regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess_text)),
    ('classifier', LogisticRegression(random_state=42))
])

# Parameter grid.
# LogisticRegression only accepts certain solver/penalty/multi_class
# combinations, so the grid is a list of sub-grids that each contain valid
# combinations only. A single flat grid would cross every value with every
# other one, and each invalid candidate would still pay for the full TF-IDF
# step on every fold before its fit fails.
shared_grid = {
    'classifier__class_weight': [None, 'balanced'],
    'classifier__max_iter': [100, 200, 500],
    'classifier__tol': [1e-4, 1e-3, 1e-2],
}
c_values = [0.01, 0.1, 1, 10]
# NOTE(review): 'multi_class' is deprecated in recent scikit-learn releases;
# drop these entries once the installed version no longer accepts it.
multi_class_modes = ['ovr', 'multinomial']

param_grid = [
    # liblinear: L1 or L2 penalty, one-vs-rest only
    {
        **shared_grid,
        'classifier__solver': ['liblinear'],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': c_values,
        'classifier__multi_class': ['ovr'],
    },
    # newton-cg, lbfgs and sag: L2 penalty
    {
        **shared_grid,
        'classifier__solver': ['newton-cg', 'lbfgs', 'sag'],
        'classifier__penalty': ['l2'],
        'classifier__C': c_values,
        'classifier__multi_class': multi_class_modes,
    },
    # saga: L1 or L2 penalty
    {
        **shared_grid,
        'classifier__solver': ['saga'],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': c_values,
        'classifier__multi_class': multi_class_modes,
    },
    # saga with elastic net: needs an explicit l1_ratio, otherwise the fit fails
    {
        **shared_grid,
        'classifier__solver': ['saga'],
        'classifier__penalty': ['elasticnet'],
        'classifier__l1_ratio': [0.5],
        'classifier__C': c_values,
        'classifier__multi_class': multi_class_modes,
    },
    # No regularization: C has no effect, so it is not varied here. Current
    # scikit-learn spells this penalty=None (the string 'none' was removed);
    # on versions that reject None these candidates simply score NaN.
    {
        **shared_grid,
        'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'classifier__penalty': [None],
        'classifier__multi_class': multi_class_modes,
    },
]

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Now that the search is fitted, the best pipeline is available
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Best Parameters:", grid_search.best_params_)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw the training and cross-validation learning curves of an estimator.

    A new figure is opened, ``learning_curve`` is run for the requested
    training-set sizes, and for both the training and the cross-validation
    scores the mean is plotted as a line with a shaded band of one standard
    deviation around it.

    Args:
        estimator: Estimator passed on to ``learning_curve``.
        title: Title of the figure.
        X: Training data passed on to ``learning_curve``.
        y: Targets passed on to ``learning_curve``.
        ylim: Optional ``(ymin, ymax)`` pair for the y-axis limits.
        cv: Cross-validation setting passed on to ``learning_curve``.
        n_jobs: Number of parallel jobs passed on to ``learning_curve``.
        train_sizes: Training-set sizes passed on to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, standard deviation, colour, legend label)
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# The tuned pipeline selected by the grid search is the estimator to plot
estimator = grid_search.best_estimator_

# Learning curve of that pipeline on the training split
# (5-fold cross-validation, single job)
plot_learning_curve(
    estimator=estimator,
    title='Learning Curve',
    X=X_train,
    y=y_train,
    cv=5,
    n_jobs=1,
)

# Render the figure
plt.show()