# Data Analysis
## Analysis of the users dataset
## At this point, no data from the users set is used in the model!

In [1]:
from pathlib import Path
from src.data.data_reader import DataReader
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing

while str(os.getcwd())[-3:] != 'src':  # Execute from src-directory root
    os.chdir('..')

In [2]:
entries = DataReader._get_entries_from_file(Path('..', 'data', DataReader.EXPECTED_FILES[4]))
RELEVANT_USER_FIELDS = [
    'user_id',
    'name',
    'review_count',
    'friends',
    'useful',
    'funny',
    'cool',
    'fans',
    'compliments'  # Sum of all compliment fields
]

# Combine all compliments
compliment_fields = [
    'compliment_hot',
    'compliment_more',
    'compliment_profile',
    'compliment_cute',
    'compliment_list',
    'compliment_note',
    'compliment_plain',
    'compliment_cool',
    'compliment_funny',
    'compliment_writer',
    'compliment_photos'
]
combined_compliments = DataReader._filter_entries(entries, compliment_fields)
combined_compliments = [sum(x.values()) for x in combined_compliments]
for entry, sum_combined_for_entry in zip(entries, combined_compliments):
    entry['compliments'] = sum_combined_for_entry

filtered_entries = DataReader._filter_entries(entries, RELEVANT_USER_FIELDS)
users = pd.DataFrame.from_records(filtered_entries)
users

Unnamed: 0,user_id,name,review_count,friends,useful,funny,cool,fans,compliments
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",7217,1259,5994,267,2873
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",43091,13066,27281,3138,20631
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",2086,1010,1003,52,585
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",512,330,299,28,136
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",29,15,7,1,4
...,...,...,...,...,...,...,...,...,...
1987892,fB3jbHi3m0L2KgGOxBv6uw,Jerrold,23,,7,0,0,0,0
1987893,68czcr4BxJyMQ9cJBm6C7Q,Jane,1,,0,0,0,0,0
1987894,1x3KMskYxOuJCjRz70xOqQ,Shomari,4,,1,1,0,0,0
1987895,ulfGl4tdbrH05xKzh5lnog,Susanne,2,,0,0,0,0,0


In [3]:
users['friends'] = users['friends'].map(lambda friend_str: friend_str.split(', ') if friend_str else None)
users = users.set_index('user_id')
users = users.rename(columns={'review_count': 'user_review_count'})
users

Unnamed: 0_level_0,name,user_review_count,friends,useful,funny,cool,fans,compliments
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"[NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8q...",7217,1259,5994,267,2873
j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"[ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0...",43091,13066,27281,3138,20631
2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"[LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgy...",2086,1010,1003,52,585
SZDeASXq7o05mMNLshsdIA,Gwen,224,"[enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74V...",512,330,299,28,136
hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"[PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMb...",29,15,7,1,4
...,...,...,...,...,...,...,...,...
fB3jbHi3m0L2KgGOxBv6uw,Jerrold,23,[None],7,0,0,0,0
68czcr4BxJyMQ9cJBm6C7Q,Jane,1,[None],0,0,0,0,0
1x3KMskYxOuJCjRz70xOqQ,Shomari,4,[None],1,1,0,0,0
ulfGl4tdbrH05xKzh5lnog,Susanne,2,[None],0,0,0,0,0


Normalisation of the numerical data

In [4]:
# Get amount of reviews per user
_, reviews, _, _, = DataReader().read_data()

In [5]:
review_count_per_user = reviews.groupby(by=['user_id'])['user_id'].count().rename('amount_of_reviews')
users = users.join(review_count_per_user, on='user_id')
users = users.dropna(subset=['amount_of_reviews']).copy()  # Drop users which have no reviews
# users = users.drop(columns=['amount_of_reviews'])  # This column is now no longer needed
users

Unnamed: 0_level_0,name,user_review_count,friends,useful,funny,cool,fans,compliments
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"[NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8q...",7217,1259,5994,267,2873
j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"[ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0...",43091,13066,27281,3138,20631
2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"[LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgy...",2086,1010,1003,52,585
SZDeASXq7o05mMNLshsdIA,Gwen,224,"[enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74V...",512,330,299,28,136
hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"[PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMb...",29,15,7,1,4
...,...,...,...,...,...,...,...,...
Q5SlTMOwyHq4PIu7Ev-GVg,John,4,[None],0,0,0,0,0
XPNsbtTADPdAsYT-C4xjsA,Jamila,5,[None],1,0,0,0,0
fB3jbHi3m0L2KgGOxBv6uw,Jerrold,23,[None],7,0,0,0,0
1x3KMskYxOuJCjRz70xOqQ,Shomari,4,[None],1,1,0,0,0


In [6]:
column_names_to_normalise = ['useful', 'funny', 'cool', 'fans', 'compliments']

series_to_normalise = [
    users[[column_name, 'user_review_count']]
    .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
    .astype(np.float32)
    for column_name in column_names_to_normalise
]

quantiles = [
    series.quantile(0.99)
    for series in series_to_normalise
]  # For outlier detection

series_to_normalise = [
    series.map(lambda x: quantile if x > quantile else x)
    for quantile, series in zip(quantiles, series_to_normalise)
]  # Outlier removal by editing

normalised_series = [
    pd.Series(
        data =
            preprocessing.MinMaxScaler().fit_transform(
                series.to_numpy().reshape(-1, 1)
            ).flatten(),
        name = f'{column_name}_normalised',
        dtype = np.float16,
    ).set_axis(users.index)  # To relink with the original dataframe
    for column_name, series in zip(column_names_to_normalise, series_to_normalise)
]

  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)
  .apply(lambda row: row[column_name] / row['user_review_count'], axis=1)


In [7]:
users = users.drop(columns=column_names_to_normalise)
pd.concat([users, *normalised_series], axis=1)

Unnamed: 0_level_0,name,user_review_count,friends,useful_normalised,funny_normalised,cool_normalised,fans_normalised,compliments_normalised
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"[NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8q...",1.000000,1.000000,1.000000,0.912598,1.000000
j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"[ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0...",1.000000,1.000000,1.000000,1.000000,1.000000
2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"[LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgy...",0.697266,0.759277,0.753906,0.156372,0.681152
SZDeASXq7o05mMNLshsdIA,Gwen,224,"[enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74V...",0.507812,0.736816,0.667480,0.250000,0.470215
hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"[PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMb...",0.081604,0.094910,0.044312,0.025314,0.039215
...,...,...,...,...,...,...,...,...
Q5SlTMOwyHq4PIu7Ev-GVg,John,4,[None],0.000000,0.000000,0.000000,0.000000,0.000000
XPNsbtTADPdAsYT-C4xjsA,Jamila,5,[None],0.044434,0.000000,0.000000,0.000000,0.000000
fB3jbHi3m0L2KgGOxBv6uw,Jerrold,23,[None],0.067627,0.000000,0.000000,0.000000,0.000000
1x3KMskYxOuJCjRz70xOqQ,Shomari,4,[None],0.055542,0.125000,0.000000,0.000000,0.000000
