In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# READING THE FILE and creating base dataframe for  Software and Arts_Crafts_and_Sewing revoew comments

# PART1
# Source file path for software reviews
json_file_path = '/Users/anirudhyadav/Documents/PlayArea/EDA/Software.json'

# Read the JSON file and load into the dataframe
df_sowftware = pd.read_json(json_file_path, lines=True)

# Creating the base dataframe 
base_df_software = df_sowftware
# print(base_df_software)
pruned_df_software =base_df_software[['overall','reviewerID','reviewText']]
pruned_df_software = pruned_df_software.dropna()
print(pruned_df_software.head(2))

# *****************************************************#
# Part2
# Source file path for Arts_Crafts_and_Sewing 
json_file_path = '/Users/anirudhyadav/Documents/PlayArea/EDA/Arts_Crafts_and_Sewing.json'

# Read the JSON file and load into the dataframe
df_Arts_Crafts_and_Sewing = pd.read_json(json_file_path, lines=True)

# Creating the base dataframe 
base_df_Arts_Crafts_and_Sewing = df_Arts_Crafts_and_Sewing
pruned_df_Arts_Crafts_and_Sewing = base_df_Arts_Crafts_and_Sewing[['overall','reviewerID','reviewText']]
pruned_df_Arts_Crafts_and_Sewing.dropna()
print(base_df_Arts_Crafts_and_Sewing.head(2))

   overall      reviewerID                                         reviewText
0        4  A240ORQ2LF9LUI  The materials arrived early and were in excell...
1        4  A1YCCU0YRLS0FE  I am really enjoying this book with the worksh...
   overall vote  verified   reviewTime      reviewerID        asin  \
0        5    4      True   11 6, 2013  A3U4E9PIZ8OWH1  0449819906   
1        5  NaN      True  02 28, 2017  A3945D2TJ0PI86  0449819906   

                            style    reviewerName  \
0  {'Format:': ' Kindle Edition'}  Shirley Curtis   
1  {'Format:': ' Kindle Edition'}        M. Smith   

                                          reviewText            summary  \
0  I've read this book already and I've got plans...  A  WONDERFUL BOOK   
1                         Nicely written directions.               Nice   

   unixReviewTime image  
0      1383696000   NaN  
1      1488240000   NaN  


*****************************
Software dataset
*****************************

In [3]:

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(pruned_df_software, test_size=0.2, stratify=pruned_df_software['overall'], random_state=42)

# Define binary classes: class 1 with 'Overall' rating 5 and class 0 for other ratings
train_df_binary = train_df.copy()
test_df_binary = test_df.copy()


In [4]:
train_df_binary['BinaryClass'] = train_df_binary['overall'].apply(lambda x: 1 if x == 5 else 0)
test_df_binary['BinaryClass'] = test_df_binary['overall'].apply(lambda x: 1 if x == 5 else 0)

In [5]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_binary = tfidf_vectorizer.fit_transform(train_df_binary['reviewText'])
X_test_binary = tfidf_vectorizer.transform(test_df_binary['reviewText'])

In [6]:
# Train a multinomial Naïve Bayes model for binary classification
nb_model_binary = MultinomialNB()
nb_model_binary.fit(X_train_binary, train_df_binary['BinaryClass'])

In [7]:
# Predictions for binary classification
predictions_binary = nb_model_binary.predict(X_test_binary)

In [8]:
# Report metrics for binary classification
accuracy_binary = accuracy_score(test_df_binary['BinaryClass'], predictions_binary)
classification_report_binary = classification_report(test_df_binary['BinaryClass'], predictions_binary, target_names=['Class 0', 'Class 1'])

print("Binary Classification Metrics:")
print(f"Accuracy: {accuracy_binary:.4f}")
print("Classification Report:\n", classification_report_binary)

Binary Classification Metrics:
Accuracy: 0.7859
Classification Report:
               precision    recall  f1-score   support

     Class 0       0.77      0.85      0.81     49394
     Class 1       0.81      0.71      0.75     42480

    accuracy                           0.79     91874
   macro avg       0.79      0.78      0.78     91874
weighted avg       0.79      0.79      0.78     91874



In [9]:
# For five-class classification, use 'Overall' ratings as classes
X_train_five = tfidf_vectorizer.fit_transform(train_df['reviewText'])
X_test_five = tfidf_vectorizer.transform(test_df['reviewText'])

# Train a multinomial Naïve Bayes model for five-class classification
nb_model_five = MultinomialNB()
nb_model_five.fit(X_train_five, train_df['overall'])

In [10]:
 # Predictions for five-class classification
predictions_five = nb_model_five.predict(X_test_five)

# Report metrics for five-class classification
accuracy_five = accuracy_score(test_df['overall'], predictions_five)
classification_report_five = classification_report(test_df['overall'], predictions_five)

print("\nFive-Class Classification Metrics:")
print(f"Accuracy: {accuracy_five:.4f}")
print("Classification Report:\n", classification_report_five)


Five-Class Classification Metrics:
Accuracy: 0.6187
Classification Report:
               precision    recall  f1-score   support

           1       0.68      0.74      0.71     20508
           2       0.00      0.00      0.00      6289
           3       0.60      0.00      0.01      7879
           4       0.40      0.01      0.01     14718
           5       0.60      0.98      0.74     42480

    accuracy                           0.62     91874
   macro avg       0.46      0.35      0.29     91874
weighted avg       0.54      0.62      0.50     91874



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*****************************
Arts_Crafts_and_Sewing dataset
*****************************

In [11]:

# Split the dataset into training and testing sets
train_df_acs, test_df_acs = train_test_split(base_df_Arts_Crafts_and_Sewing, test_size=0.2, stratify=base_df_Arts_Crafts_and_Sewing['overall'], random_state=42)

# Define binary classes: class 1 with 'Overall' rating 5 and class 0 for other ratings
train_df_binary_acs = train_df_acs.copy()
test_df_binary_acs = test_df_acs.copy()

In [12]:
train_df_binary_acs['BinaryClass'] = train_df_binary_acs['overall'].apply(lambda x: 1 if x == 5 else 0)
train_df_binary_acs =train_df_binary_acs.dropna()
test_df_binary_acs['BinaryClass'] = test_df_binary_acs['overall'].apply(lambda x: 1 if x == 5 else 0)
test_df_binary_acs =test_df_binary_acs.dropna()

In [13]:
# TF-IDF vectorization
tfidf_vectorizer_acs = TfidfVectorizer()
X_train_binary_acs = tfidf_vectorizer_acs.fit_transform(train_df_binary_acs['reviewText'])
X_test_binary_acs = tfidf_vectorizer_acs.transform(test_df_binary_acs['reviewText'])

In [14]:
# Train a multinomial Naïve Bayes model for binary classification
nb_model_binary_acs = MultinomialNB()
nb_model_binary_acs.fit(X_train_binary_acs, train_df_binary_acs['BinaryClass'])

In [15]:
# Predictions for binary classification
predictions_binary_acs = nb_model_binary_acs.predict(X_test_binary_acs)
# Report metrics for binary classification
accuracy_binary_acs = accuracy_score(test_df_binary_acs['BinaryClass'], predictions_binary_acs)
classification_report_binary_acs = classification_report(test_df_binary_acs['BinaryClass'], predictions_binary_acs, target_names=['Class 0', 'Class 1'])

print("Binary Classification Metrics:")
print(f"Accuracy: {accuracy_binary_acs:.4f}")
print("Classification Report:\n", classification_report_binary_acs)

Binary Classification Metrics:
Accuracy: 0.7629
Classification Report:
               precision    recall  f1-score   support

     Class 0       0.96      0.38      0.54      1398
     Class 1       0.73      0.99      0.84      2372

    accuracy                           0.76      3770
   macro avg       0.84      0.68      0.69      3770
weighted avg       0.81      0.76      0.73      3770



In [16]:
# For five-class classification, use 'Overall' ratings as classes
X_train_five = tfidf_vectorizer.fit_transform(train_df['reviewText'])
X_test_five = tfidf_vectorizer.transform(test_df['reviewText'])

# Train a multinomial Naïve Bayes model for five-class classification
nb_model_five = MultinomialNB()
nb_model_five.fit(X_train_five, train_df['overall'])

In [17]:
 # Predictions for five-class classification
predictions_five = nb_model_five.predict(X_test_five)

# Report metrics for five-class classification
accuracy_five = accuracy_score(test_df['overall'], predictions_five)
classification_report_five = classification_report(test_df['overall'], predictions_five)

print("\nFive-Class Classification Metrics:")
print(f"Accuracy: {accuracy_five:.4f}")
print("Classification Report:\n", classification_report_five)


Five-Class Classification Metrics:
Accuracy: 0.6187
Classification Report:
               precision    recall  f1-score   support

           1       0.68      0.74      0.71     20508
           2       0.00      0.00      0.00      6289
           3       0.60      0.00      0.01      7879
           4       0.40      0.01      0.01     14718
           5       0.60      0.98      0.74     42480

    accuracy                           0.62     91874
   macro avg       0.46      0.35      0.29     91874
weighted avg       0.54      0.62      0.50     91874



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
