In [1]:
import pandas as pd
import re
import json
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\U-5058\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U-5058\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df =pd.read_csv('../data/amazon_reviews.csv')

In [3]:
df.columns

Index(['userName', 'verified', 'itemName', 'description', 'image', 'brand',
       'feature', 'category', 'price', 'rating', 'reviewTime', 'summary',
       'reviewText', 'vote'],
      dtype='object')

In [4]:
def filter_by_high_frequ(data,obj_col_name):
    obj_count = pd.DataFrame(data[obj_col_name].value_counts()).reset_index()
    mean_per_obj = obj_count[obj_col_name].mean()
    #Filter items with more than the mean amount of reviews#
    popular_obj = obj_count[obj_count[obj_col_name]>mean_per_obj]
    filtered = data[data[obj_col_name].isin(popular_obj['index'])].reset_index(drop=True)
    return filtered

# Define a function to check for valid prices
def is_valid_price(price):
    pattern = r'\$\d+'  # Regular expression pattern for valid prices
    return bool(re.match(pattern, price))

# Function to clean text
def clean_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # Remove stop words
    cleaned_tokens = [word for word in tokens if (word.lower() not in stop_words and word.isalnum())]
    # Join cleaned tokens back into a sentence
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text


At this part I have done the following cleaning process
*   Filtering only verified users in order to make the modeling more accurate
*   Filtering low frequent categories for the simplicity of the mission and aiming to improve the model accuracy
*   Filtering only valid prices items.
*   Droping duplicated samples.
*   Using nltk for cleaning textual columns that i will be using for content_based model.
*   Add user_id and item_id

In [5]:
#Get only verifed useres
filtered_df = df[df['verified']==True]
#Drop verified column
filtered_df.drop(columns=['verified'],inplace=True)

#Filter most frqu categories from each featuer#
filtered_df = filter_by_high_frequ(filtered_df,'itemName')
filtered_df = filter_by_high_frequ(filtered_df,'brand')
filtered_df = filter_by_high_frequ(filtered_df,'category')
filtered_df = filter_by_high_frequ(filtered_df,'userName')

#Droping all samples that contain nan values#
#It could be more efficient by fill the nan values with most common one , but for the simplicity i have skipped them #
filtered_df = filtered_df.dropna().reset_index(drop=True)

#Filter sample where the price is not in the correct format
filtered_df = filtered_df[filtered_df['price'].apply(is_valid_price)]

#Droping duplicated samples
filtered_df = filtered_df.drop_duplicates().reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
# Replace underscores with spaces in the category column
filtered_df['category'] = filtered_df['category'].str.replace('_', ' ')

# Clean the specified columns - only those will be used for content based model
columns_to_clean = ['itemName', 'brand','category']
for column in columns_to_clean:
    print(f'Cleaning textual column:{column}')
    filtered_df[column] = filtered_df[column].apply(clean_text)
    
#Add user_id and item_id to each user and item#
filtered_df['user_id'] = filtered_df['userName'].astype('category').cat.codes
filtered_df['item_id'] = filtered_df['itemName'].astype('category').cat.codes


#Drop reviews and images columns, cause i will focuse on items textual properties#
#In addition I have dropped description and featue for simplicity , even though they can add information about the item#
#For now i will focus on the itemName brand and category#
columns_to_drop = ['description','feature','reviewTime','image', 'reviewText','summary']
filtered_df.drop(columns=columns_to_drop,inplace=True)
filtered_df.reset_index(drop=True,inplace=True)

filtered_df.to_csv('../data/preprocessed.csv')

Cleaning textual column:itemName
Cleaning textual column:brand
Cleaning textual column:category


Save file of the rating-item-user for applying CF and MF

In [7]:
#For the usage of collaborative_filter SVD#
user_item_rating = filtered_df.loc[:,['user_id','item_id','rating']]
user_item_rating.to_csv('../data/user_item_rating.csv')

Saving dictionaries of the usernames and items ids

In [8]:
#Get all pairs of usernames-ids#
usernames_ids = filtered_df.drop_duplicates(subset=['userName','user_id']).loc[:,['userName','user_id']]
usernames_ids_dict = dict(zip(usernames_ids['userName'],usernames_ids['user_id']))
ids_usernames_dict = dict(zip(usernames_ids['user_id'],usernames_ids['userName']))
#Get all pairs of items-ids#
items_ids = filtered_df.drop_duplicates(subset=['itemName','item_id']).loc[:,['itemName','item_id']]
items_ids_dict = dict(zip(items_ids['itemName'],items_ids['item_id']))
ids_items_dict = dict(zip(items_ids['item_id'],items_ids['itemName']))

# Save dictionary as a JSON file
with open('../outputs/IDs/usernames_ids_dict.json', 'w') as file:
    json.dump(usernames_ids_dict, file)
with open('../outputs/IDs/ids_usernames_dict.json', 'w') as file:
    json.dump(ids_usernames_dict, file)
with open('../outputs/IDs/items_ids_dict.json', 'w') as file:
    json.dump(items_ids_dict, file)
with open('../outputs/IDs/ids_items_dict.json', 'w') as file:
    json.dump(ids_items_dict, file)

items_ids.to_json('../outputs/IDs/items_ids.json')    