In [22]:
import pandas as pd
from datetime import datetime, timezone
import re
import hashlib

In [2]:
# fix column names
def lower_columns(df):
    df.columns = [x.lower().strip().replace(' ', '_') for x in df.columns]
    return

In [3]:
user = pd.read_csv('data/UK User Data.csv',
                   encoding='latin1')
lower_columns(user)
user['dob'] = pd.to_datetime(user['dob'], format='%d/%m/%y')
print(user.dtypes)
user.head(10)

first_name                             object
surname                                object
middle_initials                        object
dob                            datetime64[ns]
age_last_birthday                       int64
favourite_colour                       object
favourite_animal                       object
favourite_food                         object
gender                                 object
password                               object
city                                   object
county                                 object
postcode                               object
email                                  object
phone                                  object
mobile                                 object
rqf                                    object
salary                                 object
website_visits_last_30_days             int64
dtype: object


Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,2065-01-07,60,Red,Elephant,Bangers and Mash,Male,Parishaggis17%,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3,"19,500.00",7
1,David,Button,none,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,CarTrain 56$,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4,"21,000.00",15
2,Ian,Smythe,JO,2025-01-03,100,Blue,Cat,Toad in the Hole,blank,1945Tank*,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5,"23,000.00",28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,Yorkshire!3Pig,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6,"32,500.00",34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,Snoopy78Peanut!,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,-,na,3
5,Jude,Thomas,,2051-10-06,73,Black,Badger,Curry,Male,Manage=7This,Ipswich,Suffolk,IP1 2DA,thomasold@gmail.com,01473 712233,07570 282737,2,"11,541.90",16
6,Blake,Abney-James,,2008-10-02,16,Teal,Goose,Pizza,Female,All-spice77,Andover,Hampshire,SP10 2EA,abneyallseeing@outlook.com,01264 338733,07812 132687,2,"1,331.20",22
7,Indigo,Pearce,Y,2054-07-25,70,Grey,Crab,Curry,Non-binary,Geneva(banana)9,Rhyl,Clwyd,LL18 1AS,junk@icloud.com,01745 344567,03301 623763,6,"33,000.00",42
8,Rowan,Weaver,-,1973-08-25,51,Cyan,Cow,Crumpets,-,Yellow8Gold%,Warminster,Wiltshire,BA12 9BT,myotheraddress@gmail.com,01985 068271,07305 268271,7,"41,275.00",52
9,Jordan,Mayfield,{NULL},1975-11-14,49,Violet,Beaver,Pie and Chips,Prefer not to answer,TakeThat2000!,Yelverton,Devon,PL20 6DT,mayfield_all@gmail.com,01822 618440,07903 438339,8,"52,370.00",29


In [4]:
# fix DOB
def clean_dob(value, age):
    if value > datetime.now():
        year_new = datetime.now().year - age
        value = value.replace(year=year_new)
    return value

In [5]:
# encrypt passwords
def hash_password(pw):
    if pd.isna(pw):
        return None
    else:
        return hashlib.sha256(pw.encode('utf-8')).hexdigest()

In [6]:
# fix salary column
def clean_salary(value):
    if pd.isna(value):
        return None
    else:
        return re.sub(r"[^\d.]", '', value)

In [7]:
# clean middle initial
# clean gender
# clean rfq
def clean_column(value):
    if isinstance(value, str) and value.strip().upper() in ['BLANK', 'NA', 'NONE', '-', '{NULL}', '']:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    else:
        return value

In [8]:
# clean columns
for col in user.columns:
    user[col] = user[col].apply(clean_column)
# fix dob
for i, row in user.iterrows():
    user.loc[i, 'dob'] = clean_dob(row['dob'], row['age_last_birthday'])
# hash password
user['password'] = user['password'].apply(hash_password)
# clean salary
user['salary'] = user['salary'].apply(clean_salary)
user.head()

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,1965-01-07,60,Red,Elephant,Bangers and Mash,Male,5e30d824b17bd930b9280c126a717d59ccdb4cd05aa8ee...,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3.0,19500.0,7
1,David,Button,,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,22aa055adf8caa10b761514ffed59044adbc14a363c34c...,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4.0,21000.0,15
2,Ian,Smythe,JO,2025-01-03,100,Blue,Cat,Toad in the Hole,,1d82e587a6c6a44b1833e2a1ce7460a1ae0b74ca24afc5...,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5.0,23000.0,28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,3bedb97c70c5ae128ef084645556bfbcf4572dde3e028d...,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6.0,32500.0,34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,59700b2f9a7569c7a4e3862b29e4b04806714c79acaabf...,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,,,3


In [None]:
# standardise logints
def clean_timestamp(value):
    return datetime.fromtimestamp(value, tz=timezone.utc).strftime("%Y-%m-%d")

In [None]:
logins = pd.read_csv('data/UK-User-LoginTS.csv')
# change column names
logins.columns = ['login_id', 'username', 'login_timestamp']
logins['login_timestamp'] = pd.to_datetime(logins['login_timestamp'].apply(clean_timestamp))
logins.head()

Unnamed: 0,login_id,username,login_timestamp
0,1,card49a@gmail.com,1736071960
1,2,card49a@gmail.com,1736455163
2,3,card49a@gmail.com,1736837573
3,4,card49a@gmail.com,1737220201
4,5,card49a@gmail.com,1737602912


In [17]:
# check for any missing values
logins.isna().any()

login_id           False
username           False
login_timestamp    False
dtype: bool

In [19]:
# quick column counter
for col in logins.columns:
    print(logins[col].value_counts().head(), "\n")

login_id
1    1
2    1
3    1
4    1
5    1
Name: count, dtype: int64 

username
myotheraddress@gmail.com      52
junk@icloud.com               42
busybusy@yahoo.com            34
mayfield_all@gmail.com        29
long.65.morning@icloud.com    28
Name: count, dtype: int64 

login_timestamp
1736071960    1
1736455163    1
1736837573    1
1737220201    1
1737602912    1
Name: count, dtype: int64 

