In [53]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import csv
from sklearn.preprocessing import LabelEncoder

In [54]:
#locating the dataset
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None)

df_trustpilot = pd.read_csv('data/data_trustpilot_first.csv', engine='python')

#Displaying basic values
print(df_trustpilot.columns)
print(df_trustpilot.info())
display(df_trustpilot.head())


Index(['username', 'number_reviews', 'location', 'rating', 'text',
       'date_of_experience', 'date_posted', 'verification', 'subject',
       'answer', 'company'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78404 entries, 0 to 78403
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   username            78401 non-null  object
 1   number_reviews      78404 non-null  int64 
 2   location            78404 non-null  object
 3   rating              78404 non-null  int64 
 4   text                78404 non-null  object
 5   date_of_experience  78404 non-null  object
 6   date_posted         78404 non-null  object
 7   verification        78404 non-null  object
 8   subject             78404 non-null  object
 9   answer              78404 non-null  object
 10  company             78404 non-null  object
dtypes: int64(2), object(9)
memory usage: 6.6+ MB
None


Unnamed: 0,username,number_reviews,location,rating,text,date_of_experience,date_posted,verification,subject,answer,company
0,Rob Crane,2,CA,5,The company rep I worked with made my transact...,2024-06-15,2024-10-23 04:17:44,Redirected,The company rep I worked with made my…,0,Flashbay
1,Pat Anderson,1,US,5,I highly recommend using Flashbay. Immediately...,2024-10-16,2024-10-16 19:34:05,Verified,I highly recommend using Flashbay,0,Flashbay
2,Margarita Orlova,1,CZ,5,I had the pleasure of working with Shelby Gibs...,2024-10-10,2024-10-17 10:27:44,Verified,Great customer service,0,Flashbay
3,Paola Rivas,1,US,5,I had a fantastic experience with Brian Truong...,2024-10-21,2024-10-21 22:38:50,Verified,Outstanding Support and Attentive Service,0,Flashbay
4,Fiona Mckelvey Keenan,3,CA,5,My number-one go-to for computer accessories. ...,2024-07-11,2024-10-23 04:09:05,Not Verified,My number-one go-to for computer…,0,Flashbay


STAGE 1 - CREATING NEW COLUMNS

In [55]:
#convert date_posted and date_of_experience to datetime

df_trustpilot['date_posted'] = pd.to_datetime(df_trustpilot['date_posted'])
df_trustpilot['date_of_experience'] = pd.to_datetime(df_trustpilot['date_of_experience'])

#Create local_hour and local_date_posted
#Fixing the date_posted column
#The date_posted is from the CET timezone, however, there are countries in the list that are in a different timezone.
#That way, if someone posts at 9 pm in New York City, it appears as 3 am CET, potentially skewing the data.
#We need to fix this column so that date_posted represents the original date and time posted, from the view of the user writing the review.
#Maps the timezone difference from the view of the CET timezone
timezone_offsets = {
    # Western Europe (CET/CEST)
    'DE': 0, 'FR': 0, 'IT': 0, 'ES': 0, 'NL': 0, 'BE': 0, 'AT': 0, 'CH': 0, 'SE': 0,
    'DK': 0, 'NO': 0, 'PL': 0, 'CZ': 0, 'PT': 0, 'HU': 0, 'SK': 0, 'SI': 0, 'LU': 0,
    'MT': 0, 'HR': 0, 'BA': 0, 'AL': 0, 'ME': 0, 'RS': 0, 'MK': 0, 'AD': 0, 'LI': 0,
    
    # Eastern Europe
    'FI': 1, 'EE': 1, 'LV': 1, 'LT': 1, 'UA': 1, 'RO': 1, 'BG': 1, 'GR': 1, 'BY': 1,
    'MD': 1, 'TR': 1, 'CY': 1,
    
    # UK and related territories
    'GB': -1, 'IE': -1, 'IM': -1, 'JE': -1, 'GG': -1, 'GI': -1,
    
    # Russia and Central Asia
    'RU': 3, 'KZ': 4, 'UZ': 5, 'KG': 5,
    
    # North America and Greenland
    'US': -7, 'CA': -7, 'MX': -7, 'PR': -6, 'VI': -6, 'GU': 10, 'GL': -3,  # Added Greenland
    
    # Central America and Caribbean
    'GT': -8, 'HN': -8, 'SV': -8, 'NI': -8, 'CR': -8, 'PA': -7,
    'BS': -7, 'CU': -7, 'JM': -7, 'DO': -6, 'BB': -6, 'TT': -6,
    'KN': -6, 'AG': -6, 'VC': -6, 'AI': -6, 'BM': -6, 'KY': -6,
    'TC': -6, 'AW': -6, 'CW': -6, 'BQ': -6, 'SX': -6, 'GP': -6,
    
    # South America
    'BR': -5, 'AR': -5, 'CL': -6, 'CO': -7, 'VE': -6, 'PE': -7,
    'EC': -7, 'BO': -6, 'PY': -6, 'UY': -5,
    
    # East Asia
    'CN': 7, 'JP': 8, 'KR': 8, 'TW': 7, 'MN': 7, 'HK': 7, 'MO': 7,
    
    # Southeast Asia
    'SG': 7, 'MY': 7, 'ID': 7, 'TH': 7, 'VN': 7, 'PH': 7,
    'MM': 6.5, 'KH': 7, 'BN': 7,
    
    # South Asia
    'IN': 4.5, 'PK': 4, 'BD': 5, 'NP': 4.75, 'LK': 4.5, 'MV': 4,
    
    # Central Asia and Middle East
    'AE': 3, 'SA': 2, 'IL': 2, 'IR': 2.5, 'IQ': 2, 'OM': 3,
    'KW': 2, 'QA': 2, 'BH': 2, 'JO': 2, 'LB': 2, 'AF': 3.5,
    'AM': 3, 'AZ': 3, 'GE': 3,
    
    # Oceania
    'AU': 9, 'NZ': 11, 'FJ': 11, 'PF': 10, 'MP': 9,
    
    # Africa
    'ZA': 1, 'EG': 1, 'MA': 0, 'DZ': 0, 'TN': 0, 'LY': 1,
    'NG': 0, 'GH': 0, 'KE': 2, 'UG': 2, 'RW': 1, 'CD': 1,
    'GA': 0, 'CM': 0, 'ZW': 1, 'CI': 0, 'SS': 2,  # Added South Sudan
    'RE': 3,  # Added Réunion
    
    # European Islands and Territories
    'IS': -1, 'FO': -1, 'AX': 1,
    
    # Disputed or Special Territories
    'XK': 0  # Kosovo (using CET)
}

#Checks whether there are countries missing in our dictionary that are present in our df
countries_in_df = df_trustpilot['location'].unique()
countries_in_mapping = set(timezone_offsets.keys())
missing_countries = set(countries_in_df) - countries_in_mapping
if len(missing_countries) > 0:
    print("Countries in dataset but not in timezone mapping:")
    print(missing_countries)
    print(f"\nTotal missing countries: {len(missing_countries)}")
else:
    print("All countries in the dataset are covered in our timezone mapping!")

def adjust_to_local_time(df_trustpilot):
    # Initialize new columns
    df_trustpilot['local_hour'] = None
    df_trustpilot['local_date_posted'] = None
    
    # Process non-CET countries
    for country_code, offset in timezone_offsets.items():
        # Get mask for current country
        mask = df_trustpilot['location'] == country_code
        
        # Adjust time for matching rows
        if offset != 0:  # Skip CET countries
            local_datetime = pd.to_datetime(df_trustpilot.loc[mask, 'date_posted']) + pd.Timedelta(hours=offset)
            df_trustpilot.loc[mask, 'local_date_posted'] = local_datetime.dt.date
            df_trustpilot.loc[mask, 'local_hour'] = local_datetime.dt.hour
    
    # Process CET countries (offset = 0)
    cet_mask = df_trustpilot['location'].isin([k for k, v in timezone_offsets.items() if v == 0])
    cet_datetime = pd.to_datetime(df_trustpilot.loc[cet_mask, 'date_posted'])
    df_trustpilot.loc[cet_mask, 'local_date_posted'] = cet_datetime.dt.date
    df_trustpilot.loc[cet_mask, 'local_hour'] = cet_datetime.dt.hour
    
    # Evaluation steps
    def evaluate_timezone_adjustment():
        # 1. Check for missing values
        null_local_hour = df_trustpilot['local_hour'].isnull().sum()
        null_local_date = df_trustpilot['local_date_posted'].isnull().sum()
        
        # 2. Check for unexpected hours (should be 0-23)
        invalid_hours = df_trustpilot[~df_trustpilot['local_hour'].between(0, 23)].shape[0]
        
        # 3. Check date changes
        date_changes = (df_trustpilot['local_date_posted'] != 
                       pd.to_datetime(df_trustpilot['date_posted']).dt.date).sum()
        
        print("\nEvaluation Results:")
        print(f"Missing local hours: {null_local_hour}")
        print(f"Missing local dates: {null_local_date}")
        print(f"Invalid hours (not 0-23): {invalid_hours}")
        print(f"Number of date changes due to timezone adjustment: {date_changes}")
        
        # Sample of significant changes
        if date_changes > 0:
            print("\nSample of records where date changed:")
            date_change_mask = (df_trustpilot['local_date_posted'] != 
                              pd.to_datetime(df_trustpilot['date_posted']).dt.date)
            sample_changes = df_trustpilot[date_change_mask].sample(min(5, date_changes))
            print(sample_changes[['location', 'date_posted', 'local_date_posted', 'local_hour']].to_string())
    
    evaluate_timezone_adjustment()
    
    return df_trustpilot

# Apply the function and overwrite the existing dataframe
df_trustpilot = adjust_to_local_time(df_trustpilot)

df_trustpilot['local_hour'] = df_trustpilot['local_hour'].astype('int')
df_trustpilot['local_date_posted'] = pd.to_datetime(df_trustpilot['local_date_posted'])


All countries in the dataset are covered in our timezone mapping!

Evaluation Results:
Missing local hours: 0
Missing local dates: 0
Invalid hours (not 0-23): 0
Number of date changes due to timezone adjustment: 9807

Sample of records where date changed:
      location         date_posted local_date_posted local_hour
3723        US 2019-08-22 00:11:43        2019-08-21         17
74915       US 2021-12-15 03:12:06        2021-12-14         20
9748        US 2016-06-18 05:40:07        2016-06-17         22
77341       US 2021-12-08 02:59:02        2021-12-07         19
55203       US 2024-03-25 02:44:52        2024-03-24         19


In [56]:
#Create days_between_experience_and_post
df_trustpilot['days_between_experience_and_post'] = (df_trustpilot['local_date_posted'] - df_trustpilot['date_of_experience']).dt.days


# Create day_of_week_posted by extracting day of week (Monday=0, Sunday=6)
df_trustpilot['day_of_week_posted'] = df_trustpilot['local_date_posted'].dt.dayofweek

In [57]:
#Creates a new column called 'answered' which returns 1 if an answer to a user review was submitted, 0 if not
df_trustpilot['answered'] = np.where(df_trustpilot['answer'] == '0', 0, 1)

display(df_trustpilot.head())


Unnamed: 0,username,number_reviews,location,rating,text,date_of_experience,date_posted,verification,subject,answer,company,local_hour,local_date_posted,days_between_experience_and_post,day_of_week_posted,answered
0,Rob Crane,2,CA,5,The company rep I worked with made my transact...,2024-06-15,2024-10-23 04:17:44,Redirected,The company rep I worked with made my…,0,Flashbay,21,2024-10-22,129,1,0
1,Pat Anderson,1,US,5,I highly recommend using Flashbay. Immediately...,2024-10-16,2024-10-16 19:34:05,Verified,I highly recommend using Flashbay,0,Flashbay,12,2024-10-16,0,2,0
2,Margarita Orlova,1,CZ,5,I had the pleasure of working with Shelby Gibs...,2024-10-10,2024-10-17 10:27:44,Verified,Great customer service,0,Flashbay,10,2024-10-17,7,3,0
3,Paola Rivas,1,US,5,I had a fantastic experience with Brian Truong...,2024-10-21,2024-10-21 22:38:50,Verified,Outstanding Support and Attentive Service,0,Flashbay,15,2024-10-21,0,0,0
4,Fiona Mckelvey Keenan,3,CA,5,My number-one go-to for computer accessories. ...,2024-07-11,2024-10-23 04:09:05,Not Verified,My number-one go-to for computer…,0,Flashbay,21,2024-10-22,103,1,0


In [58]:
#Create new time/date columns

def create_time_features(df):
    def categorize_hour(hour):
        if 5 < hour <= 8:
            return 'Morning'
        elif 9 <= hour <= 17:
            return 'Business Hours'
        elif 18 <= hour <= 22:
            return 'Evening'
        else:
            return 'Night'
    
    def categorize_day(day):
        if day in [0, 1, 2, 3, 4]:  
            return 'Business Day'
        else:
            return 'Weekend'
    
    df['time_of_day'] = df['local_hour'].apply(categorize_hour)
    df['day_type'] = df['day_of_week_posted'].apply(categorize_day)
    df['month_name'] = df['local_date_posted'].dt.strftime('%B')
    
    df['review_delay'] = (pd.to_datetime(df['local_date_posted']) - 
                         pd.to_datetime(df['date_of_experience'])).dt.days
    df['quick_review'] = (df['review_delay'] <= 7).astype(int)
    
    return df

df_trustpilot = create_time_features(df_trustpilot)


print("Text columns distributions:")
text_columns = ['time_of_day', 'day_type', 'month_name']
for col in text_columns:
    print(f"\n{col} value counts:")
    print(df_trustpilot[col].value_counts())
print("\nNumeric columns statistics:")
numeric_columns = ['review_delay', 'quick_review']
print(df_trustpilot[numeric_columns].describe())
print("\nChecking for null values in new columns:")
all_new_columns = text_columns + numeric_columns
print(df_trustpilot[all_new_columns].isnull().sum())

Text columns distributions:

time_of_day value counts:
time_of_day
Business Hours    45370
Morning           14897
Evening           13491
Night              4646
Name: count, dtype: int64

day_type value counts:
day_type
Business Day    63952
Weekend         14452
Name: count, dtype: int64

month_name value counts:
month_name
December     10724
April         8322
May           8114
March         8030
January       7676
February      7486
June          5690
July          4782
October       4758
August        4754
September     4063
November      4005
Name: count, dtype: int64

Numeric columns statistics:
       review_delay  quick_review
count  78404.000000  78404.000000
mean       3.394266      0.943051
std       37.231970      0.231746
min       -1.000000      0.000000
25%        0.000000      1.000000
50%        0.000000      1.000000
75%        0.000000      1.000000
max     3972.000000      1.000000

Checking for null values in new columns:
time_of_day     0
day_type        0
mont

In [59]:
#Creates a column called repeat_reviewer from number_reviews indicating whether the user is a repeat (>=2) or one-time reviewer (<2)

total_users = len(df_trustpilot)
repeat = (df_trustpilot['number_reviews'] >= 2).sum()
pct_repeat = (repeat / total_users) * 100

onetime = (df_trustpilot['number_reviews'] < 2).sum()
pct_onetime = (onetime / total_users) * 100

print("Percentage of users with >= 2 reviews: ", pct_repeat)
print("Percentage of users with < 2 reviews: ", pct_onetime)

#Creating the column
df_trustpilot['repeat_reviewer'] = np.where(df_trustpilot['number_reviews'] >= 2, 'repeat', 'one-time')


Percentage of users with >= 2 reviews:  43.86893525840519
Percentage of users with < 2 reviews:  56.13106474159481


In [60]:
display(df_trustpilot.head())

Unnamed: 0,username,number_reviews,location,rating,text,date_of_experience,date_posted,verification,subject,answer,company,local_hour,local_date_posted,days_between_experience_and_post,day_of_week_posted,answered,time_of_day,day_type,month_name,review_delay,quick_review,repeat_reviewer
0,Rob Crane,2,CA,5,The company rep I worked with made my transact...,2024-06-15,2024-10-23 04:17:44,Redirected,The company rep I worked with made my…,0,Flashbay,21,2024-10-22,129,1,0,Evening,Business Day,October,129,0,repeat
1,Pat Anderson,1,US,5,I highly recommend using Flashbay. Immediately...,2024-10-16,2024-10-16 19:34:05,Verified,I highly recommend using Flashbay,0,Flashbay,12,2024-10-16,0,2,0,Business Hours,Business Day,October,0,1,one-time
2,Margarita Orlova,1,CZ,5,I had the pleasure of working with Shelby Gibs...,2024-10-10,2024-10-17 10:27:44,Verified,Great customer service,0,Flashbay,10,2024-10-17,7,3,0,Business Hours,Business Day,October,7,1,one-time
3,Paola Rivas,1,US,5,I had a fantastic experience with Brian Truong...,2024-10-21,2024-10-21 22:38:50,Verified,Outstanding Support and Attentive Service,0,Flashbay,15,2024-10-21,0,0,0,Business Hours,Business Day,October,0,1,one-time
4,Fiona Mckelvey Keenan,3,CA,5,My number-one go-to for computer accessories. ...,2024-07-11,2024-10-23 04:09:05,Not Verified,My number-one go-to for computer…,0,Flashbay,21,2024-10-22,103,1,0,Evening,Business Day,October,103,0,repeat


In [61]:
df_trustpilot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78404 entries, 0 to 78403
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   username                          78401 non-null  object        
 1   number_reviews                    78404 non-null  int64         
 2   location                          78404 non-null  object        
 3   rating                            78404 non-null  int64         
 4   text                              78404 non-null  object        
 5   date_of_experience                78404 non-null  datetime64[ns]
 6   date_posted                       78404 non-null  datetime64[ns]
 7   verification                      78404 non-null  object        
 8   subject                           78404 non-null  object        
 9   answer                            78404 non-null  object        
 10  company                           78404 non-nu

STAGE 2 - TRANSFORM CATEGORICAL COLUMNS TO NUMERIC

In [62]:
#Encoding verification column
print(df_trustpilot['verification'].unique())

# Verification currently has 4 values. In order to simplify this variable, we will perform binary encoding.
# ['Verified' 'Not Verified' 'Redirected' 'Invited']
# 'Verified' = 1
# 'Not Verified', 'Redirected', 'Invited' = 0 
# 'Not Verified', 'Redirected', 'Invited' were all encoded 0 as they are not verified by the Trustpilot website

verification_map = {
    'Verified': 1,
    'Not Verified': 0,
    'Redirected': 0,
    'Invited': 0
}

df_trustpilot['verification_encoded'] = df_trustpilot['verification'].map(verification_map)

df_trustpilot = df_trustpilot.drop('verification', axis=1)

df_trustpilot['verification'] = df_trustpilot['verification_encoded']
df_trustpilot = df_trustpilot.drop('verification_encoded', axis=1)

['Redirected' 'Verified' 'Not Verified' 'Invited']


In [63]:
#Encoding day_type column

print(df_trustpilot['day_type'].unique())
day_type_map = {
    'Business Day': 1,
    'Weekend': 0
}
df_trustpilot['day_type_encoded'] = df_trustpilot['day_type'].map(day_type_map)
df_trustpilot = df_trustpilot.drop('day_type', axis=1)
df_trustpilot['day_type'] = df_trustpilot['day_type_encoded']
df_trustpilot = df_trustpilot.drop('day_type_encoded', axis=1)

['Business Day' 'Weekend']


In [64]:
#Encoding time_of_day column
print(df_trustpilot['time_of_day'].unique())
time_of_day_map = {
    'Business Hours': 3,  
    'Evening': 2,        
    'Morning': 1,        
    'Night': 0         
}
df_trustpilot['time_of_day_encoded'] = df_trustpilot['time_of_day'].map(time_of_day_map)
df_trustpilot = df_trustpilot.drop('time_of_day', axis=1)
df_trustpilot['time_of_day'] = df_trustpilot['time_of_day_encoded']
df_trustpilot = df_trustpilot.drop('time_of_day_encoded', axis=1)



['Evening' 'Business Hours' 'Morning' 'Night']


In [65]:
#Encoding month_name

print(df_trustpilot['month_name'].unique())
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df_trustpilot['month_name_encoded'] = df_trustpilot['month_name'].map(month_map)
df_trustpilot = df_trustpilot.drop('month_name', axis=1)
df_trustpilot['month'] = df_trustpilot['month_name_encoded']
df_trustpilot = df_trustpilot.drop('month_name_encoded', axis=1)


['October' 'September' 'August' 'July' 'June' 'May' 'April' 'March'
 'February' 'January' 'December' 'November']


In [66]:
#Encoding repeat_reviewer
#0 = One-time reviewer 
#1 = Repeat reviewer

le = LabelEncoder()

df_trustpilot['repeat_reviewer_encoded'] = le.fit_transform(df_trustpilot['repeat_reviewer'])
df_trustpilot = df_trustpilot.drop('repeat_reviewer', axis=1)


In [67]:
#Last check
print(df_trustpilot.columns)
print(df_trustpilot.info())
display(df_trustpilot.head())

Index(['username', 'number_reviews', 'location', 'rating', 'text',
       'date_of_experience', 'date_posted', 'subject', 'answer', 'company',
       'local_hour', 'local_date_posted', 'days_between_experience_and_post',
       'day_of_week_posted', 'answered', 'review_delay', 'quick_review',
       'verification', 'day_type', 'time_of_day', 'month',
       'repeat_reviewer_encoded'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78404 entries, 0 to 78403
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   username                          78401 non-null  object        
 1   number_reviews                    78404 non-null  int64         
 2   location                          78404 non-null  object        
 3   rating                            78404 non-null  int64         
 4   text                              78404 non-null  object

Unnamed: 0,username,number_reviews,location,rating,text,date_of_experience,date_posted,subject,answer,company,local_hour,local_date_posted,days_between_experience_and_post,day_of_week_posted,answered,review_delay,quick_review,verification,day_type,time_of_day,month,repeat_reviewer_encoded
0,Rob Crane,2,CA,5,The company rep I worked with made my transact...,2024-06-15,2024-10-23 04:17:44,The company rep I worked with made my…,0,Flashbay,21,2024-10-22,129,1,0,129,0,0,1,2,10,1
1,Pat Anderson,1,US,5,I highly recommend using Flashbay. Immediately...,2024-10-16,2024-10-16 19:34:05,I highly recommend using Flashbay,0,Flashbay,12,2024-10-16,0,2,0,0,1,1,1,3,10,0
2,Margarita Orlova,1,CZ,5,I had the pleasure of working with Shelby Gibs...,2024-10-10,2024-10-17 10:27:44,Great customer service,0,Flashbay,10,2024-10-17,7,3,0,7,1,1,1,3,10,0
3,Paola Rivas,1,US,5,I had a fantastic experience with Brian Truong...,2024-10-21,2024-10-21 22:38:50,Outstanding Support and Attentive Service,0,Flashbay,15,2024-10-21,0,0,0,0,1,1,1,3,10,0
4,Fiona Mckelvey Keenan,3,CA,5,My number-one go-to for computer accessories. ...,2024-07-11,2024-10-23 04:09:05,My number-one go-to for computer…,0,Flashbay,21,2024-10-22,103,1,0,103,0,0,1,2,10,1


STAGE 3 - BASIC TEXT PREPROCESSING

In [68]:
import os

os.makedirs('data', exist_ok=True)

csv_path = 'data/data_trustpilot.csv'
df_trustpilot.to_csv(csv_path, index=False)
print(f"DataFrame exported to {csv_path}")

DataFrame exported to data/data_trustpilot.csv
