<a href="https://colab.research.google.com/github/ShovalBenjer/Natural_Language_Proccessing_NLP_Projects/blob/main/MultiOneHotEncoder_Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# -*- coding: utf-8 -*-
"""Amazon_Reviews_Sentiment_Analysis.ipynb (Corrected Again - Single Class Error Debug) - Corrected Version

Automatically generated by Colaboratory.
"""

import re
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import math
import nltk
from nltk.corpus import opinion_lexicon
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import csv
import os

# Download required NLTK resources (if not already downloaded)
nltk.download('opinion_lexicon')
nltk.download('punkt')

# Load the dataset
file_path = 'amazon_books_Data.csv'

# Fail early with a clear message when the CSV is missing.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist. Please make sure it's in the current directory.")

# Read the first 1000 rows, retrying with alternative encodings when decoding fails.
# Every attempt uses the same nrows so the sample size does not silently shrink
# when the default encoding cannot decode the file.
# NOTE(review): 'latin-1' maps every byte to a character, so the 'cp1252' retry is
# not expected to be reached; it is kept as a last-resort fallback.
try:
    df = pd.read_csv(file_path, nrows=1000, quoting=csv.QUOTE_ALL, escapechar='\\')
except UnicodeDecodeError:
    try:
        df = pd.read_csv(file_path, nrows=1000, encoding='latin-1', quoting=csv.QUOTE_ALL, escapechar='\\')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, nrows=1000, encoding='cp1252', quoting=csv.QUOTE_ALL, escapechar='\\')

# Quick look at the raw data: sample rows, column names and missing-value counts.
print(df.head())
print(df.columns)
print(df.isnull().sum())

# Discard rows that lack a review text or a star rating, then copy the rating
# into the 'overall' column that the rest of the script works with.
df.dropna(subset=['review_body', 'star_rating'], inplace=True)
df['overall'] = df['star_rating']

def safe_int_conversion(value):
    """
    Converts a raw rating value to an integer, truncating any fractional part.

    Args:
        value: The value to convert (e.g. 5, 4.0, "3", "2.0").

    Returns:
        The integer value, or -1 when the value cannot be converted
        (non-numeric text, None, NaN or infinity). Note that an input that
        truncates to -1 is indistinguishable from the failure sentinel.
    """
    try:
        return int(float(value))
    # int(float('nan')) raises ValueError, but int(float('inf')) raises
    # OverflowError, which must be caught as well to honour the -1 contract.
    except (ValueError, TypeError, OverflowError):
        return -1

# Coerce ratings to integers; values that cannot be parsed come back as -1.
df['overall'] = df['overall'].apply(safe_int_conversion)
# Keep only successfully parsed ratings. The explicit .copy() makes the filtered
# frame independent, so adding the 'sentiment' column below does not write into
# a slice of the previous frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['overall'] != -1].copy()

print("\nDistribution of 'overall' ratings BEFORE sentiment conversion:")
print(df['overall'].value_counts())

# Binary label: ratings of 4 and above are positive (1), everything else negative (0).
df['sentiment'] = df['overall'].apply(lambda x: 1 if x >= 4 else 0)

print("\nDistribution of 'sentiment' AFTER sentiment conversion:")
print(df['sentiment'].value_counts())
print("\nClass distribution BEFORE train_test_split:")
print(df['sentiment'].value_counts())

"""## Part 1: Regular Expressions for Tokenization"""

# Compiled once at module level so that every tokenize() call reuses the same pattern object.
TOKEN_PATTERN = re.compile(r"[\w]+(?:[-'\w]{0,3}[\w]+)?|[.!?,\";]")

def tokenize(text):
    """
    Lower-cases the input and splits it into word and punctuation tokens.

    A token is either a run of word characters (optionally continued by up to
    three hyphens/apostrophes/word characters followed by more word characters)
    or a single one of the punctuation marks . ! ? , " ;

    Args:
        text: The input text; non-string values are converted with str().

    Returns:
        A list of tokens, or an empty list when the input is missing (None/NaN).
    """
    # Missing values (None or NaN cells from pandas) produce no tokens.
    if text is None or pd.isna(text):
        return []
    return TOKEN_PATTERN.findall(str(text).lower())

print("Tokenization of the first 3 reviews:")
# Show the tokens of up to three reviews, numbered from 1.
for position, review in enumerate(df['review_body'].head(3), start=1):
    tokens = tokenize(review)
    print(f"Review {position}: {tokens}")

"""## Part 2: Applying a Lexicon for Sentiment Classification"""

def lexicaScore(lexicon_pos, lexicon_neg, tokens):
    """
    Computes the share of tokens found in the positive and negative lexicons.

    A token that appears in both lexicons is counted as positive only.

    Args:
        lexicon_pos: A set of positive words.
        lexicon_neg: A set of negative words.
        tokens: A list of tokens.

    Returns:
        A dictionary with 'pos' and 'neg' keys holding the relative frequencies
        (both 0.0 when there are no tokens).
    """
    n_tokens = len(tokens)
    if not n_tokens:
        return {'pos': 0.0, 'neg': 0.0}

    hits_pos = sum(1 for tok in tokens if tok in lexicon_pos)
    # Negative hits exclude tokens already counted as positive.
    hits_neg = sum(1 for tok in tokens if tok not in lexicon_pos and tok in lexicon_neg)

    return {'pos': hits_pos / n_tokens, 'neg': hits_neg / n_tokens}

# Materialize the NLTK opinion lexicon word lists as sets; the lexicon scoring
# code tests every token for membership, which is fast on a set.
positive_lexicon = set(opinion_lexicon.positive())
negative_lexicon = set(opinion_lexicon.negative())

def predict_sentiment(lexicon_pos, lexicon_neg, tokens, thresh_pos=0.05, thresh_neg=0.05, default_sentiment=None):
    """
    Predicts sentiment based on lexicon scores and thresholds.

    Args:
        lexicon_pos: A set of positive words.
        lexicon_neg: A set of negative words.
        tokens: A list of tokens for one review.
        thresh_pos: Minimum relative frequency of positive words (exclusive)
            required to predict positive.
        thresh_neg: Minimum relative frequency of negative words (exclusive)
            required to predict negative.
        default_sentiment: Label (0 or 1) returned for neutral cases. When None
            (the default), the majority class of the global ``df['sentiment']``
            column is used, as before. Passing a precomputed value avoids both
            the dependency on the global frame and recomputing its mean for
            every neutral review.

    Returns:
        1 for positive, 0 for negative, or the default label for neutral cases.
    """
    scores = lexicaScore(lexicon_pos, lexicon_neg, tokens)
    pos_share = scores['pos']
    neg_share = scores['neg']

    if pos_share > thresh_pos and pos_share > neg_share:
        return 1  # Positive
    if neg_share > thresh_neg and neg_share > pos_share:
        return 0  # Negative

    # Neutral case: neither side is both above its threshold and dominant.
    if default_sentiment is not None:
        return default_sentiment
    # Backward-compatible fallback: majority class of the global data frame
    # (positive only if more than half of the labels are 1).
    return 1 if df['sentiment'].mean() > 0.5 else 0

print("\nLexicon based sentiment analysis evaluation:")
correct_predictions = 0
num_examples = 0

# Evaluate the lexicon classifier on at most the first 50 reviews.
evaluation_rows = df.head(50)
for review, true_sentiment in zip(evaluation_rows['review_body'], evaluation_rows['sentiment']):
    tokens = tokenize(review)
    predicted_sentiment = predict_sentiment(positive_lexicon, negative_lexicon, tokens)

    if predicted_sentiment == true_sentiment:
        correct_predictions += 1
    num_examples += 1

    # Progress report every 10 examples; the 50th is covered by the final line below.
    if num_examples % 10 == 0 and num_examples < 50:
        accuracy = (correct_predictions / num_examples) * 100
        print(f"  After {num_examples} examples, accuracy: {accuracy:.2f}%")

# Guard against an empty data frame, which would otherwise divide by zero.
if num_examples > 0:
    print(f"Final accuracy over {num_examples} examples: {(correct_predictions / num_examples) * 100:.2f}%")
else:
    print("No examples available for lexicon evaluation.")


print("\nFinding minimal examples for >70% accuracy (Lexicon method):")
correct_predictions = 0
num_examples = 0
# Walk through the reviews in order and stop at the first point where the
# running accuracy is above 70%; the for/else reports when that never happens.
for review, true_sentiment in zip(df['review_body'], df['sentiment']):
    predicted_sentiment = predict_sentiment(
        positive_lexicon,
        negative_lexicon,
        tokenize(review),
        thresh_pos=0.02,
        thresh_neg=0.04,
    )  # Experiment with thresholds

    correct_predictions += int(predicted_sentiment == true_sentiment)
    num_examples += 1

    accuracy = (correct_predictions / num_examples) * 100
    if accuracy > 70:
        print(f"  Accuracy exceeded 70% after {num_examples} examples.")
        break
else:
    print(f"  Accuracy did not exceed 70% within the first {num_examples} examples.")

"""## Part 3: Logistic Regression for Sentiment Classification"""

def extractMultiHot(tokens, vocab):
    """
    Builds a multi-hot vector marking which vocabulary words occur in the tokens.

    Args:
        tokens: A list of tokens.
        vocab: A dictionary mapping words to their indices in the vocabulary.

    Returns:
        A list of 0/1 values with one entry per vocabulary word. Tokens that are
        not in the vocabulary, or whose index falls outside the vector, are ignored.
    """
    width = len(vocab)
    encoding = [0] * width
    for tok in tokens:
        slot = vocab.get(tok)
        # Skip unknown tokens and indices that do not fit into the vector.
        if slot is None or not (0 <= slot < width):
            continue
        encoding[slot] = 1
    return encoding

def negative_log_likelihood(ypred, ytrue):
    """
    Computes the negative log likelihood of one prediction (not averaged over
    any set of observations).

    Args:
        ypred: Predicted probability of the positive class (1).
        ytrue: True label (0 or 1).

    Returns:
        The negative log likelihood for this instance, or None when ypred is
        outside [0, 1] or ytrue is neither 0 nor 1.
    """
    # Reject out-of-range probabilities and labels other than 0/1.
    if not 0 <= ypred <= 1:
        return None
    if ytrue != 0 and ytrue != 1:
        return None

    # Keep the probability strictly inside (0, 1) so that log() never sees 0.
    epsilon = 1e-15
    upper_bound = 1 - epsilon
    if ypred < epsilon:
        clipped = epsilon
    elif ypred > upper_bound:
        clipped = upper_bound
    else:
        clipped = ypred

    positive_term = ytrue * math.log(clipped)
    negative_term = (1 - ytrue) * math.log(1 - clipped)
    return -(positive_term + negative_term)


# Count how often each token occurs across all reviews.
word_counts = Counter()
for review in df['review_body']:
    tokens = tokenize(review)
    word_counts.update(tokens)

# Keep words seen more than 5 times and number them 0..len(vocabulary)-1.
# The filter must run BEFORE enumerate(): enumerating the unfiltered counts
# leaves gaps in the indices, so some exceed the vocabulary size and the
# corresponding words are silently dropped from the multi-hot vectors.
vocabulary = {
    word: index
    for index, word in enumerate(w for w, count in word_counts.items() if count > 5)
}
print(f"\nVocabulary size: {len(vocabulary)}")

# One multi-hot row per review, in data-frame order.
X = np.array([
    extractMultiHot(tokenize(review), vocabulary)
    for review in df['review_body']
])
# Matching labels, one per row of X.
y = np.array(list(df['sentiment'].to_numpy()))

print("\nClass distribution BEFORE train_test_split:")
print(pd.Series(y).value_counts())

# A classifier needs at least two classes; otherwise only report the problem.
if len(np.unique(y)) >= 2:
    # Hold out 20% of the data for testing, keeping the class ratio (stratify).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    print("\nClass distribution in training set:")
    print(pd.Series(y_train).value_counts())

    model = LogisticRegression(solver='liblinear', random_state=42)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nLogistic Regression Accuracy: {accuracy:.4f}")

    # Average the per-instance negative log likelihood over the test set,
    # skipping instances for which the helper reports invalid input (None).
    total_log_loss = 0
    num_valid_losses = 0
    for predicted_probability, true_label in zip(y_pred_proba, y_test):
        log_loss_val = negative_log_likelihood(predicted_probability, true_label)  # Using negative_log_likelihood function
        if log_loss_val is None:
            continue
        total_log_loss += log_loss_val
        num_valid_losses += 1

    if num_valid_losses > 0:
        average_log_loss = total_log_loss / num_valid_losses
        print(f"Average Negative Log Loss: {average_log_loss:.4f}")  # Corrected name in output
    else:
        print("No valid log loss values were calculated.")
else:
    print("ERROR: Only one class found in the data. Cannot perform classification.")

"""## Conclusion
This concludes the notebook. The last cell reports the accuracy of the logistic regression model on the test set; this model is more accurate than the lexicon method used.
"""

   Unnamed: 0 market_place customer_id         review_id    product_id  \
0           0         "US"  "25933450"   "RJOVP071AVAJO"  "0439873800"   
1           1         "US"   "1801372"  "R1ORGBETCDW3AI"  "1623953553"   
2           2         "US"   "5782091"   "R7TNRFQAOUTX5"  "142151981X"   
3           3         "US"  "32715830"  "R2GANXKDIFZ6OI"  "014241543X"   
4           4         "US"  "14005703"  "R2NYB6C3R8LVN6"  "1604600527"   

  product_parent                                   product_title  \
0     "84656342"  "There Was an Old Lady Who Swallowed a Shell!"   
1    "729938122"                                "I Saw a Friend"   
2    "678139048"                          "Black Lagoon, Vol. 6"   
3    "712432151"                                     "If I Stay"   
4    "800572372"                       "Stars 'N Strips Forever"   

  product_category  star_rating  helpful_votes  total_votes     vine  \
0          "Books"            1              0            0  0 \t(N)   
1 

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'## Conclusion\nThis concludes the notebook. The last cell shows the accuracy of the logistic Regression model on the test set, this model is more accurate than the lexicon method used.\n'

In [7]:
# -*- coding: utf-8 -*-
"""Amazon_Reviews_Sentiment_Analysis.ipynb (Corrected Again - Single Class Error Debug V2)"""

import re
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import math
import nltk
from nltk.corpus import opinion_lexicon
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import csv
import os

# Fetch the NLTK resources used by the notebook (a no-op when already present).
nltk.download('opinion_lexicon')
nltk.download('punkt')

file_path = 'amazon_books_Data.csv'

# Stop immediately when the CSV is not available.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Options shared by every read attempt: first 1000 rows, fully quoted fields,
# backslash as the escape character.
read_options = dict(nrows=1000, quoting=csv.QUOTE_ALL, escapechar='\\')

# Try the default encoding first, then fall back to latin-1 and finally cp1252.
try:
    df = pd.read_csv(file_path, **read_options)
except UnicodeDecodeError:
    try:
        df = pd.read_csv(file_path, encoding='latin-1', **read_options)
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='cp1252', **read_options)

# Quick look at the raw data: sample rows, column names and missing-value counts.
print(df.head())
print(df.columns)
print(df.isnull().sum())

# Discard rows that lack a review text or a star rating, then copy the rating
# into the 'overall' column.
df.dropna(subset=['review_body', 'star_rating'], inplace=True)
df['overall'] = df['star_rating']

def safe_int_conversion(value):
    """
    Converts a raw rating value to an integer, truncating any fractional part.

    Args:
        value: The value to convert (e.g. 5, 4.0, "3", "2.0").

    Returns:
        The integer value, or -1 when the value cannot be converted
        (non-numeric text, None, NaN or infinity). Note that an input that
        truncates to -1 is indistinguishable from the failure sentinel.
    """
    try:
        return int(float(value))
    # int(float('nan')) raises ValueError, but int(float('inf')) raises
    # OverflowError, which must be caught as well to honour the -1 contract.
    except (ValueError, TypeError, OverflowError):
        return -1

# Coerce ratings to integers; values that cannot be parsed come back as -1.
df['overall'] = df['overall'].apply(safe_int_conversion)
# Keep only successfully parsed ratings. The explicit .copy() makes the filtered
# frame independent, so adding the 'sentiment' column below does not write into
# a slice of the previous frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['overall'] != -1].copy()

# *** NEW DEBUGGING OUTPUT: Distribution of original star_rating ***
print("\nDistribution of original 'star_rating' column:")
print(df['star_rating'].value_counts())

# *** DEBUGGING OUTPUT: Distribution of 'overall' ratings BEFORE sentiment conversion ***
print("\nDistribution of 'overall' ratings BEFORE sentiment conversion (after safe_int):")
print(df['overall'].value_counts())

# Sentiment conversion: ratings of 4 and above are positive (1), everything else negative (0).
df['sentiment'] = df['overall'].apply(lambda x: 1 if x >= 4 else 0)

# *** DEBUGGING OUTPUT: Distribution of 'sentiment' AFTER sentiment conversion ***
print("\nDistribution of 'sentiment' AFTER sentiment conversion:")
print(df['sentiment'].value_counts())

# Check class distribution before split
print("\nClass distribution BEFORE train_test_split:")
print(df['sentiment'].value_counts())


# Part 1, 2, 3 code (rest of your notebook) remains the same from the last corrected version.
# ... (rest of your code from the last corrected version) ...

   Unnamed: 0 market_place customer_id         review_id    product_id  \
0           0         "US"  "25933450"   "RJOVP071AVAJO"  "0439873800"   
1           1         "US"   "1801372"  "R1ORGBETCDW3AI"  "1623953553"   
2           2         "US"   "5782091"   "R7TNRFQAOUTX5"  "142151981X"   
3           3         "US"  "32715830"  "R2GANXKDIFZ6OI"  "014241543X"   
4           4         "US"  "14005703"  "R2NYB6C3R8LVN6"  "1604600527"   

  product_parent                                   product_title  \
0     "84656342"  "There Was an Old Lady Who Swallowed a Shell!"   
1    "729938122"                                "I Saw a Friend"   
2    "678139048"                          "Black Lagoon, Vol. 6"   
3    "712432151"                                     "If I Stay"   
4    "800572372"                       "Stars 'N Strips Forever"   

  product_category  star_rating  helpful_votes  total_votes     vine  \
0          "Books"            1              0            0  0 \t(N)   
1 

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
