In [176]:
import pandas as pd
import re
import json
import nltk
import numpy as np
from nltk.corpus import stopwords
# Fetch the NLTK resources used further down (already-installed packages are
# reported as up-to-date): the stop-word corpus read through stopwords.words(),
# and the 'punkt' tokenizer data -- presumably what nltk.word_tokenize needs;
# confirm the resource name for the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\U-5058\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U-5058\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [177]:
# Load the raw reviews table and show the dtype pandas inferred for each column.
reviews_path = '../data/amazon_reviews.csv'
df = pd.read_csv(reviews_path)
df.dtypes

userName        object
verified          bool
itemName        object
description     object
image           object
brand           object
feature         object
category        object
price           object
rating         float64
reviewTime      object
summary         object
reviewText      object
vote             int64
dtype: object

*   Feature 'price' is the only column with a significant share of NaN values: about 13% of the whole dataset.

In [178]:
# Percentage of missing values in every column.
missing_counts = df.isna().sum()
missing_counts / len(df) * 100

userName        0.011068
verified        0.000000
itemName        0.000181
description     0.001451
image           0.000000
brand           0.701975
feature         0.000000
category        0.000000
price          13.486852
rating          0.000000
reviewTime      0.000000
summary         0.042456
reviewText      0.071667
vote            0.000000
dtype: float64

The raw dataset contains more than 100K unique users and more than 100K unique items.

In [179]:
# Report how many distinct user names and item names the raw data holds.
for label, column in (('users', 'userName'), ('items', 'itemName')):
    print(f'Count of unique {label} {df[column].nunique()}')

Count of unique users 131183
Count of unique items 109009


About 95% of the reviews are verified (the figure below is rounded to the nearest percent)

In [180]:
# Share of rows flagged as verified, rounded to the nearest whole percent.
verified_mask = df['verified'] == True
verified_rows = df[verified_mask]
verified_users_percentage = round(len(verified_rows) / len(df) * 100)
print(f'Percentage of verified users is {verified_users_percentage}%')

Percentage of verified users is 95%


In [181]:
# Define a function to convert the price to numeric or NaN
def convert_price_to_numeric(price):
    """Convert a raw price value such as ``'$1,299.00'`` to a float.

    Parameters
    ----------
    price : any
        Raw cell value. It is converted to ``str`` first, so floats, NaN and
        None are accepted as well as strings.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the remaining text is not a
        single number (for example a price range such as ``'$10 - $20'``).
    """
    # Stringify so that non-string cells follow the same parsing path.
    text = str(price)
    # Remove the currency symbol and the thousands separators: float() rejects
    # '1,299.00', which would otherwise turn a valid price into NaN.
    text = text.replace('$', '').replace(',', '')
    try:
        return float(text)
    except ValueError:
        # Anything that still is not a plain number is treated as missing.
        return np.nan

#Impute feature by mean
def impute_by_mean(df,col_name):
    """Replace the missing values of one column with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. The column is reassigned on this same object.
    col_name : str
        Name of the numeric column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Series.mean() skips NaN values by default.
    mean = df[col_name].mean()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col_name]: that chained in-place form acts on an intermediate object
    # and does not reliably update ``df`` (it warns on recent pandas and is a
    # no-op under Copy-on-Write).
    df[col_name] = df[col_name].fillna(mean)
    return df

#Filtering most frequent of specific category feature
def filter_by_high_frequ(data,obj_col_name):
    """Keep only the rows whose value in ``obj_col_name`` is frequent.

    A value counts as frequent when it occurs more often than the mean number
    of occurrences over all distinct values of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    obj_col_name : str
        Name of the categorical column to filter on.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index. Rows whose value is NaN
        are never kept, because value_counts() ignores NaN.
    """
    # Work on the counts Series directly. The column names produced by
    # value_counts().reset_index() differ between pandas versions, so they
    # are deliberately not relied upon.
    counts = data[obj_col_name].value_counts()
    popular_values = counts[counts > counts.mean()].index
    keep = data[obj_col_name].isin(popular_values)
    return data[keep].reset_index(drop=True)

# Define a function to check for valid prices
def is_valid_price(price):
    """Tell whether ``price`` is a string that starts with ``$`` and a digit.

    Parameters
    ----------
    price : any
        Raw cell value.

    Returns
    -------
    bool
        True when the value is a string beginning with a dollar sign followed
        by at least one digit; False otherwise, including for non-string
        values such as NaN or None.
    """
    # Missing cells are floats (NaN) or None; re.match would raise TypeError.
    if not isinstance(price, str):
        return False
    # re.match anchors at the start only, so trailing text is not checked.
    return re.match(r'\$\d+', price) is not None

# Function to clean text
def clean_text(text):
    """Tokenize ``text`` and drop stop words and non-alphanumeric tokens.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The remaining tokens, in their original casing and order, joined by
        single spaces.
    """
    # Build the English stop-word set once and keep it on the function object,
    # so applying this function to a whole column does not rebuild the set
    # for every row.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        clean_text._stop_words = stop_words

    tokens = nltk.word_tokenize(text)
    # Stop words are compared case-insensitively; isalnum() removes
    # punctuation-only tokens and tokens containing symbols.
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

In this part I carried out the following cleaning steps:
*   Converting and imputing price values
*   Keeping only verified users in order to make the modeling more accurate
*   Filtering out low-frequency categories to simplify the task and to improve model accuracy
*   Dropping duplicated samples - I did that by mistake, so I removed that step and ran the process again.
*   Using nltk to clean the textual columns that I will use for the content-based model.
*   Adding user_id and item_id

Price cleaning and imputing

In [182]:
# Parse every raw price into a float (NaN when it cannot be parsed), then
# replace the NaN entries with the mean of the parsed prices.
numeric_price = df['price'].apply(convert_price_to_numeric)
df['price'] = numeric_price
df = impute_by_mean(df, 'price')

Keeping verified and high-frequency samples, and dropping samples that contain NaN values

In [168]:
# Keep only the verified rows and remove the now-constant 'verified' column.
# Selecting and dropping in one expression produces a new frame, so nothing is
# modified in place on a slice of `df` (this avoids SettingWithCopyWarning).
filtered_df = df[df['verified'].eq(True)].drop(columns=['verified'])

# Keep only the above-average-frequency values of each feature. The filters
# run one after another, so each one sees the rows left by the previous one.
for column in ('itemName', 'brand', 'category', 'userName'):
    filtered_df = filter_by_high_frequ(filtered_df, column)

# Drop every row that still contains a NaN value. Filling them with the most
# common value could keep more data, but that is skipped here for simplicity.
filtered_df = filtered_df.dropna().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Cleaning text

In [169]:
# Turn the underscores in category names into spaces before tokenizing.
category_with_spaces = filtered_df['category'].str.replace('_', ' ')
filtered_df['category'] = category_with_spaces

# Only the columns used by the content-based model are cleaned.
columns_to_clean = ['itemName', 'brand', 'category']
for column in columns_to_clean:
    print(f'Cleaning textual column:{column}')
    cleaned_column = filtered_df[column].apply(clean_text)
    filtered_df[column] = cleaned_column

Cleaning textual column:itemName
Cleaning textual column:brand
Cleaning textual column:category


Add user_id and item_id

In [None]:
# Give every user and every item an integer id: the category code of its name.
# NOTE(review): the ids come from the names as they are at this point; if the
# text-cleaning cell ran first, items whose cleaned names coincide share an id.
for id_column, name_column in (('user_id', 'userName'), ('item_id', 'itemName')):
    filtered_df[id_column] = filtered_df[name_column].astype('category').cat.codes

Keeping only the features that I will focus on in the assignment

In [None]:
# For now the focus is on itemName, brand and category, so the remaining
# descriptive columns are removed and the index is renumbered.
columns_to_drop = ['description','feature','reviewTime','image', 'reviewText','summary']
filtered_df = filtered_df.drop(columns=columns_to_drop).reset_index(drop=True)

Save the user-item-rating file used for applying CF and MF

In [170]:
# Full preprocessed table.
filtered_df.to_csv('../data/preprocessed.csv')

# Ratings triplets for the collaborative-filtering SVD model.
rating_columns = ['user_id', 'item_id', 'rating']
user_item_rating = filtered_df[rating_columns]
user_item_rating.to_csv('../data/user_item_rating.csv')

Saving dictionaries that map usernames and item names to their ids (and back)

In [171]:
# Unique (name, id) pairs for users and for items.
usernames_ids = filtered_df[['userName', 'user_id']].drop_duplicates()
items_ids = filtered_df[['itemName', 'item_id']].drop_duplicates()

# Lookup tables in both directions.
usernames_ids_dict = dict(zip(usernames_ids['userName'], usernames_ids['user_id']))
ids_usernames_dict = dict(zip(usernames_ids['user_id'], usernames_ids['userName']))
items_ids_dict = dict(zip(items_ids['itemName'], items_ids['item_id']))
ids_items_dict = dict(zip(items_ids['item_id'], items_ids['itemName']))

# Write each lookup table to its own JSON file. JSON object keys are always
# strings, so the id -> name tables come back with string keys when reloaded.
json_targets = {
    '../outputs/IDs/usernames_ids_dict.json': usernames_ids_dict,
    '../outputs/IDs/ids_usernames_dict.json': ids_usernames_dict,
    '../outputs/IDs/items_ids_dict.json': items_ids_dict,
    '../outputs/IDs/ids_items_dict.json': ids_items_dict,
}
for json_path, mapping in json_targets.items():
    with open(json_path, 'w') as file:
        json.dump(mapping, file)

# The item name/id pairs are also stored as a DataFrame-style JSON file.
items_ids.to_json('../outputs/IDs/items_ids.json')