In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
def show_unique_values(df):
    """Print how many distinct values the 'user' and 'item' columns contain.

    Args:
        df: DataFrame that has 'user' and 'item' columns.

    Returns:
        None; the two counts are written to stdout.
    """
    # Both counts are computed before anything is printed, so a missing
    # column fails before any output is produced.
    counts = {column: len(df[column].unique()) for column in ('user', 'item')}
    print("Unique users:", counts['user'])
    print("Unique items:", counts['item'])

In [3]:
def user_item_to_numeric_val(df):
    """Replace the 'user' and 'item' values with integer ids starting at 1.

    Ids are handed out in the order Series.unique() returns the values
    (order of first appearance), separately for each column.

    Args:
        df: DataFrame with 'user' and 'item' columns. Both columns are
            overwritten on the passed frame.

    Returns:
        The same DataFrame object, with 'user' and 'item' mapped to ids.
    """
    # Collect the distinct values of both columns before touching either one.
    distinct = {column: df[column].unique() for column in ('user', 'item')}

    for column, values in distinct.items():
        id_by_value = dict(zip(values, range(1, len(values) + 1)))
        df[column] = df[column].map(id_by_value)
    return df

In [4]:
def decontract(phrase):
    """Expand English "n't" contractions in ``phrase``.

    Special cases are handled first: can't -> "can not", won't -> "will not",
    ain't -> "am not" (a capitalised first letter is not preserved), plus the
    all-caps forms CAN'T / WON'T / AIN'T. Every remaining "n't" becomes
    " not" and every remaining "N'T" becomes " NOT".

    Args:
        phrase: Text to expand.

    Returns:
        The text with the contractions expanded.
    """
    substitutions = (
        # Character classes are written as [cC], not [c|C]: inside a class the
        # pipe is a literal character, so "|an't" used to be rewritten as well.
        (r"[cC]an\'t", "can not"),
        (r"[wW]on\'t", "will not"),
        (r"[aA]in\'t", "am not"),
        # All-caps irregular forms must be expanded before the generic N'T
        # rule, which would otherwise yield "CA NOT", "WO NOT" and "AI NOT".
        (r"CAN\'T", "CAN NOT"),
        (r"WON\'T", "WILL NOT"),
        (r"AIN\'T", "AM NOT"),
        (r"n\'t", " not"),
        (r"N\'T", " NOT"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [5]:
def remove_non_english_rows(df):
    """Find the reviews that contain the marker character 'Â'.

    Despite the name, nothing is removed here: the function prints each
    matching review followed by its row position and returns the positions.

    Args:
        df: DataFrame with a 'reviews' column.

    Returns:
        List of 0-based row positions whose review text contains 'Â'.
    """
    non_eng_list = []
    for position, text in enumerate(df['reviews']):
        # A missing review (NaN/None) is not a string; testing membership on
        # it would raise TypeError, so such rows are simply not flagged.
        if isinstance(text, str) and 'Â' in text:
            print(text)
            print(position)
            non_eng_list.append(position)
    return non_eng_list

In [6]:
def preprocess(df, non_eng_list):
    """Strip ASCII digits from every review and drop the flagged rows.

    Args:
        df: DataFrame with a 'reviews' column of strings. The column is
            overwritten on the passed frame.
        non_eng_list: 0-based row positions to drop, as returned by
            remove_non_english_rows.

    Returns:
        ``df`` itself when ``non_eng_list`` is empty; otherwise a new frame
        without the flagged rows and with the index renumbered from 0.
    """
    # Contraction expansion is not applied here; only digits are removed.
    df["reviews"] = df["reviews"].apply(lambda x: re.sub('[0-9]', '', x))

    if len(non_eng_list) > 0:
        # Drop every flagged row in one call. Resetting the index after each
        # individual drop shifts the remaining rows up, so every position
        # after the first would point at the wrong row.
        flagged_labels = df.index[list(non_eng_list)]
        df = df.drop(index=flagged_labels).reset_index(drop=True)
    return df

### Hotel Dataset

In [7]:
# Load the raw Datafiniti hotel reviews CSV and list its columns, dtypes and non-null counts.
df1 = pd.read_csv('datasets/Datafiniti_Hotel_Reviews.csv')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10000 non-null  object 
 1   dateAdded             10000 non-null  object 
 2   dateUpdated           10000 non-null  object 
 3   address               10000 non-null  object 
 4   categories            10000 non-null  object 
 5   primaryCategories     10000 non-null  object 
 6   city                  10000 non-null  object 
 7   country               10000 non-null  object 
 8   keys                  10000 non-null  object 
 9   latitude              10000 non-null  float64
 10  longitude             10000 non-null  float64
 11  name                  10000 non-null  object 
 12  postalCode            10000 non-null  object 
 13  province              10000 non-null  object 
 14  reviews.date          10000 non-null  object 
 15  reviews.dateSeen    

#### Choose specific fields

In [8]:
# Keep the reviewer name, hotel name, rating and review text under short generic names.
# 'reviews.title' is not among the selected columns, so its rename entry matches nothing.
df1 = df1.filter(
    ['reviews.username', 'name', 'reviews.rating', 'reviews.text']
).rename(
    columns={
        "reviews.username": "user",
        "name": "item",
        "reviews.rating": "label",
        "reviews.text": "reviews",
        "reviews.title": "reviews_title",
    }
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user     10000 non-null  object 
 1   item     10000 non-null  object 
 2   label    10000 non-null  float64
 3   reviews  9999 non-null   object 
dtypes: float64(1), object(3)
memory usage: 312.6+ KB


In [9]:
# Display the four selected columns (the notebook abbreviates the middle rows).
df1

Unnamed: 0,user,item,label,reviews
0,Paula,Rancho Valencia Resort Spa,5.0,Our experience at Rancho Valencia was absolute...
1,D,Rancho Valencia Resort Spa,5.0,Amazing place. Everyone was extremely warm and...
2,Ron,Rancho Valencia Resort Spa,5.0,We booked a 3 night stay at Rancho Valencia to...
3,jaeem2016,Aloft Arundel Mills,2.0,Currently in bed writing this for the past hr ...
4,MamaNiaOne,Aloft Arundel Mills,5.0,I live in Md and the Aloft is my Home away fro...
...,...,...,...,...
9995,LASH0211,Silver Sands Oceanfront Motel,3.0,It is hard for me to review an oceanfront hote...
9996,Gennaker,Sandy Neck Motel,4.0,"I live close by, and needed to stay somewhere ..."
9997,Amber406,Shilo Inn Suites - Coeur d'Alene,4.0,Rolled in 11:30 laid out heads down woke up to...
9998,donWoodbury,Scottish Inn,1.0,Absolutely terrible..I was told I was being gi...


#### Clean and preprocess data

In [10]:
# Report the distinct user/item counts, then encode users and items as
# integer ids (starting from 1) and store the rating as an int.
# NOTE(review): the int cast truncates any fractional rating - confirm ratings are whole numbers.
show_unique_values(df1)
df1 = user_item_to_numeric_val(df1)
df1 = df1.astype({'label': int})
print(df1.shape)

Unique users: 6942
Unique items: 1670
(10000, 4)


In [11]:
# Drop every row that has a missing value and renumber the index from 0.
df1 = df1.dropna().reset_index(drop=True)
print("After removing NA rows", df1.shape)

After removing NA rows (9999, 4)


In [12]:
# Flag reviews containing the 'Â' marker (each match and its row position are printed),
# then strip digits from all reviews and drop the flagged rows.
non_eng_list = remove_non_english_rows(df1)
df1 = preprocess(df1, non_eng_list)

Â¶ª„Å®‰∫åÊ≥ä„Åó„Åæ„Åó„Åü„ÄÇÁ´ãÂú∞„ÇÇËâØ„Åè„ÄÅË¶≥ÂÖâ„Å´‰æøÂà©„Åß„Åó„Åü„ÄÇ„Éõ„ÉÜ„É´„ÅÆÂæìÊ•≠Âì°„ÅÆÊñπ„ÅØÁöÜË¶™Âàá„ÅßËâØ„Åã„Å£„Åü„Åß„Åô„ÄÇÈÉ®Â±ã„ÇÇ„É¢„ÉÄ„É≥„Å™ÊÑü„Åò„ÅßÁ∂∫È∫ó„Åß„Åó„Åü„ÄÇ„ÇØ„É™„Çπ„Éû„Çπ„ÅÆÊó•„ÅØ„ÄÅ„Éõ„ÉÜ„É´„ÅÆ„É¨„Çπ„Éà„É©„É≥„Åß„ÅØÁæéÂë≥„Åó„Åù„ÅÜ„Å™„Éñ„ÉÉ„Éï„Çß„Çí„Åó„Å¶„ÅÑ„Åæ„Åó„Åü„Åå„ÄÅÊÆãÂøµ„Å™„Åå„Çâ„ÄÅ‰∫àÁ¥Ñ„ÅÆ„Å™„ÅÑÊàë„ÄÖ„ÅØ„ÄÅÈ£ü„Åπ„Çå„Åæ„Åõ„Çì„Åß„Åó„Åü„ÄÇ
7750


In [14]:
# Display the cleaned hotel frame: numeric user/item ids, integer labels, digits stripped from the text.
df1

Unnamed: 0,user,item,label,reviews
0,1,1,5,Our experience at Rancho Valencia was absolute...
1,2,1,5,Amazing place. Everyone was extremely warm and...
2,3,1,5,We booked a night stay at Rancho Valencia to ...
3,4,2,2,Currently in bed writing this for the past hr ...
4,5,2,5,I live in Md and the Aloft is my Home away fro...
...,...,...,...,...
9993,6939,1666,3,It is hard for me to review an oceanfront hote...
9994,4157,1667,4,"I live close by, and needed to stay somewhere ..."
9995,6940,1668,4,Rolled in : laid out heads down woke up to con...
9996,6941,1669,1,Absolutely terrible..I was told I was being gi...


In [15]:
# Write the cleaned hotel frame to CSV without the index column.
df1.to_csv('datasets/preparation_of_datasets/hotel/new_vader/Preprocessed_Hotel_Reviews_dataset.csv', index=False) 

### Amazon Digital Music Dataset

In [16]:
# Load the raw Amazon Digital Music reviews CSV.
df2 = pd.read_csv("datasets/Amazon_Digital_Music.csv")

In [17]:
# Peek at the first two raw rows to see the available columns.
df2.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album,1158019200,"09 12, 2006"
1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant,991526400,"06 3, 2001"


In [18]:
# Keep the reviewer id, product id (asin), rating and review text under short generic names.
df2 = df2.filter(
    ['reviewerID', 'asin', 'overall', 'reviewText']
).rename(
    columns={
        "reviewerID": "user",
        "asin": "item",
        "overall": "label",
        "reviewText": "reviews",
    }
)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64706 entries, 0 to 64705
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user     64706 non-null  object 
 1   item     64706 non-null  object 
 2   label    64706 non-null  float64
 3   reviews  64705 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.0+ MB


In [19]:
# Peek at the first two rows after selecting and renaming columns.
df2.head(2)

Unnamed: 0,user,item,label,reviews
0,A3EBHHCZO6V2A4,5555991584,5.0,"It's hard to believe ""Memory of Trees"" came ou..."
1,AZPWAXJG9OJXV,5555991584,5.0,"A clasically-styled and introverted album, Mem..."


In [20]:
# Report the distinct user/item counts, then encode users and items as
# integer ids (starting from 1) and store the rating as an int.
show_unique_values(df2)
df2 = user_item_to_numeric_val(df2)
df2 = df2.astype({'label': int})
print(df2.shape)

Unique users: 5541
Unique items: 3568
(64706, 4)


In [21]:
# Drop every row that has a missing value and renumber the index from 0.
df2 = df2.dropna().reset_index(drop=True)
print("After removing NA rows", df2.shape)

After removing NA rows (64705, 4)


In [22]:
# Flag reviews containing the 'Â' marker, then strip digits from all reviews and
# drop any flagged rows; the trailing expression displays the resulting shape.
non_eng_list = remove_non_english_rows(df2)
df2 = preprocess(df2, non_eng_list)
df2.shape

(64705, 4)

In [23]:
# Peek at the first four cleaned rows.
df2.head(4)

Unnamed: 0,user,item,label,reviews
0,1,1,5,"It's hard to believe ""Memory of Trees"" came ou..."
1,2,1,5,"A clasically-styled and introverted album, Mem..."
2,3,1,5,I never thought Enya would reach the sublime h...
3,4,1,5,This is the third review of an irish album I w...


In [24]:
# Write the cleaned music frame to CSV without the index column.
df2.to_csv('datasets/preparation_of_datasets/Amazon_Music/new_vader/Preprocessed_Amazon_Music_dataset.csv', index=False) 

In [25]:
# Re-count the distinct users and items after cleaning.
show_unique_values(df2)

Unique users: 5541
Unique items: 3568


### Amazon Video Games Dataset

In [26]:
# Load the raw Amazon Video Games reviews CSV.
df3 = pd.read_csv("datasets/Amazon_Video_Games.csv")

In [27]:
# Peek at the first two raw rows to see the available columns.
df3.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,"06 30, 2013"


In [28]:
# Keep the reviewer id, product id (asin), rating and review text under short generic names.
df3 = df3.filter(
    ['reviewerID', 'asin', 'overall', 'reviewText']
).rename(
    columns={
        "reviewerID": "user",
        "asin": "item",
        "overall": "label",
        "reviewText": "reviews",
    }
)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231780 entries, 0 to 231779
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user     231780 non-null  object 
 1   item     231780 non-null  object 
 2   label    231780 non-null  float64
 3   reviews  231736 non-null  object 
dtypes: float64(1), object(3)
memory usage: 7.1+ MB


In [29]:
# Peek at the first two rows after selecting and renaming columns.
df3.head(2)

Unnamed: 0,user,item,label,reviews
0,A2HD75EMZR8QLN,700099867,1.0,Installing the game was a struggle (because of...
1,A3UR8NLLY1ZHCX,700099867,4.0,If you like rally cars get this game you will ...


In [30]:
# Report the distinct user/item counts, then encode users and items as
# integer ids (starting from 1) and store the rating as an int.
show_unique_values(df3)
df3 = user_item_to_numeric_val(df3)
df3 = df3.astype({'label': int})
print(df3.shape)

Unique users: 24303
Unique items: 10672
(231780, 4)


In [31]:
# Drop every row that has a missing value and renumber the index from 0.
df3 = df3.dropna().reset_index(drop=True)
print("After removing NA rows", df3.shape)

After removing NA rows (231736, 4)


In [32]:
# Flag reviews containing the 'Â' marker and print the flagged row positions,
# then strip digits from all reviews and drop any flagged rows; the trailing
# expression displays the resulting shape.
non_eng_list = remove_non_english_rows(df3)
print(non_eng_list)
df3 = preprocess(df3, non_eng_list)
df3.shape

[]


(231736, 4)

In [33]:
# Write the cleaned video games frame to CSV without the index column.
df3.to_csv('datasets/preparation_of_datasets/Amazon_Video_Games/new_vader/Preprocessed_Amazon_Video_Games.csv', index=False)

In [34]:
# Re-count the distinct users and items after cleaning.
show_unique_values(df3)

Unique users: 24303
Unique items: 10672


In [42]:
pip install numba

Collecting numba
  Downloading numba-0.55.1-cp37-cp37m-win_amd64.whl (2.4 MB)
Collecting llvmlite<0.39,>=0.38.0rc1
  Downloading llvmlite-0.38.1-cp37-cp37m-win_amd64.whl (23.2 MB)
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.38.1 numba-0.55.1
Note: you may need to restart the kernel to use updated packages.
