Preprocessing: Tokenize, Lemmatize, remove stopwords

Let's first extract the data we need

In [2]:
import json

with open('reviews.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the desired information
extracted_data = []

for item in data:
    for review in item['reviews']:
        extracted_review = {
            'review_title': review['review_title'],
            'review_text': review['review_text'],
            'star_ratings': review['star_ratings']
        }
        extracted_data.append(extracted_review)

# Show the first few extracted reviews to verify the process
extracted_data[:3]

[{'review_title': 'Super cute but too small',
  'review_text': 'The item was in great condition and was super cute. An XL didn’t fit me though.',
  'star_ratings': 4},
 {'review_title': 'Second try',
  'review_text': 'I rented these in my usual size last time and they were just too snug. The large was better but the fit was still off for me. Bummed because the material is nice.',
  'star_ratings': 3},
 {'review_title': 'Fun shorts',
  'review_text': 'Loved the color and style. Size up, I’m usually a size 4/small but I wore a medium and they were still a little tight around the waist.',
  'star_ratings': 5}]

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize NLTK tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Lowercase and remove punctuation
    tokens = [token.lower() for token in tokens if token.isalpha()]
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Apply preprocessing to the review texts
for review in extracted_data:
    review['processed_review_text'] = preprocess_text(review['review_text'])

# Show the processed texts for verification
processed_texts = [review['processed_review_text'] for review in extracted_data]
processed_texts[:3]

[['item', 'great', 'condition', 'super', 'cute', 'xl', 'fit', 'though'],
 ['rented',
  'usual',
  'size',
  'last',
  'time',
  'snug',
  'large',
  'better',
  'fit',
  'still',
  'bummed',
  'material',
  'nice'],
 ['loved',
  'color',
  'style',
  'size',
  'usually',
  'size',
  'wore',
  'medium',
  'still',
  'little',
  'tight',
  'around',
  'waist']]

Now we have tokenized the text, let's store it to a file:

In [4]:
len(extracted_data)

86682

In [5]:
for item in extracted_data:
    item.pop("review_title", None)  
    item.pop("review_text", None)   

with open("processed_reviews.json", 'w', encoding='utf-8') as file:
    json.dump(extracted_data, file, indent=4)

Bag of Word:

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
reviews_sentences = [" ".join(review["processed_review_text"]) for review in extracted_data]

# Initialize the CountVectorizer
vectorizer = CountVectorizer(min_df=1000)

# Fit and transform the sentences to Bag of Words
X_bow = vectorizer.fit_transform(reviews_sentences)

# Convert the result to an array
bow_array = X_bow.toarray()

# Display the Bag of Words array
import pandas as pd

# Convert the BoW array to a DataFrame for better visualization
bow_df = pd.DataFrame(bow_array, columns=vectorizer.get_feature_names_out())

# Show the DataFrame for a more understandable representation
bow_df

Unnamed: 0,able,absolutely,almost,also,amazing,area,arm,around,back,baggy,...,wide,winter,wish,without,wore,work,worked,worn,would,zipper
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
86679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86680,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split

# Assuming `bow_array` is your BoW representation from the previous steps
# Extract star ratings from your data
star_ratings = [review['star_ratings'] for review in extracted_data]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bow_array, star_ratings, test_size=0.2, random_state=42)

In [8]:
import numpy as np

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [10]:
predictions = model.predict(X_test)

integer_predictions = np.clip(predictions, 1, 5).astype(int)

# Evaluate the model
mse = mean_squared_error(y_test, integer_predictions)
print(mse)

1.191901713099152


In [11]:
user_input = "The material felt really cheap and it broke after a week."

user_input_processed = preprocess_text(user_input)
print(user_input_processed)
user_input_processed = [" ".join(preprocess_text(user_input))]
user_input_vectorized = vectorizer.transform(user_input_processed)
user_input_prediction = model.predict(user_input_vectorized)
predicted_rating = np.clip(np.round(user_input_prediction), 1, 5).astype(int)
print(f"Predicted Rating: {predicted_rating[0]}")

['material', 'felt', 'really', 'cheap', 'broke', 'week']
Predicted Rating: 3
