In [11]:
import pandas as pd
from datetime import datetime, timezone
import re
import hashlib
import sqlite3
import logging

In [12]:
# Configure the root logger at INFO level so the logging.info calls in this notebook are shown.
logging.basicConfig(level=logging.INFO)

In [1]:
# normalise column names
def lower_columns(df):
    """Rename the columns of ``df`` in place to lower snake case.

    Every label is lower-cased, stripped of leading/trailing whitespace and
    has its remaining spaces replaced by underscores.

    Parameters
    ----------
    df : object with a ``columns`` attribute holding string labels
        Modified in place: its ``columns`` are overwritten.

    Returns
    -------
    list of str
        The new column names (note: the names, not the frame itself).
    """
    renamed = []
    for label in df.columns:
        renamed.append(label.lower().strip().replace(' ', '_'))
    df.columns = renamed
    return renamed

In [3]:
# fix DOB
def clean_dob(value, lim_year=25):
    """Parse a ``DD/MM/YY`` date-of-birth string into a ``datetime``.

    Two-digit years are expanded around the pivot ``lim_year``: a year
    greater than or equal to ``lim_year`` is read as 19xx, a smaller one as
    20xx. Years that already include the century (100 or more) are used
    unchanged instead of having a century added on top.

    Parameters
    ----------
    value : str or missing
        Date text with ``/`` separators in day, month, year order.
    lim_year : int, default 25
        Two-digit pivot year separating the 1900s from the 2000s.

    Returns
    -------
    datetime or None
        ``None`` when ``value`` is missing or contains only whitespace.

    Raises
    ------
    ValueError
        If the text does not split into three integers or does not form a
        valid calendar date.
    """
    if pd.isna(value):
        return None
    text = value.strip()
    # A blank cell carries no date; treat it like a missing value.
    if not text:
        return None
    day, month, year = map(int, text.split('/'))
    # Only expand genuinely two-digit years; '07/01/1965' must stay 1965.
    if year < 100:
        year += 1900 if year >= lim_year else 2000
    return datetime(year, month, day)

In [None]:
def load_data(filepath, encoding='utf-8'):
    """Read a CSV file into a DataFrame with normalised column names.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    encoding : str, default 'utf-8'
        Text encoding of the file.

    Returns
    -------
    pandas.DataFrame
        The loaded data, with its columns renamed by ``lower_columns``.
    """
    logging.info(f"Loading data from {filepath}")
    df = pd.read_csv(filepath, encoding=encoding)
    # lower_columns renames in place and returns only the list of names,
    # so hand back the frame itself rather than that return value.
    lower_columns(df)
    return df

In [4]:
# Load the UK user export (decoded as latin1) and prepare it for the cleaning cells.
users = pd.read_csv('data/UK User Data.csv',
                    encoding='latin1')
# Called for its side effect: lower_columns renames the frame's columns in place.
lower_columns(users)
# Parse each DOB string with clean_dob, then convert the column to a datetime dtype.
users['dob'] = pd.to_datetime(users['dob'].apply(clean_dob))
print(users.dtypes)
# NOTE(review): this preview shows personal data, including plaintext passwords
# (hashing only happens in a later cell) - consider clearing the output before sharing.
users.head(10)

first_name                             object
surname                                object
middle_initials                        object
dob                            datetime64[ns]
age_last_birthday                       int64
favourite_colour                       object
favourite_animal                       object
favourite_food                         object
gender                                 object
password                               object
city                                   object
county                                 object
postcode                               object
email                                  object
phone                                  object
mobile                                 object
rqf                                    object
salary                                 object
website_visits_last_30_days             int64
dtype: object


Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,1965-01-07,60,Red,Elephant,Bangers and Mash,Male,Parishaggis17%,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3,"19,500.00",7
1,David,Button,none,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,CarTrain 56$,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4,"21,000.00",15
2,Ian,Smythe,JO,1925-01-03,100,Blue,Cat,Toad in the Hole,blank,1945Tank*,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5,"23,000.00",28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,Yorkshire!3Pig,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6,"32,500.00",34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,Snoopy78Peanut!,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,-,na,3
5,Jude,Thomas,,1951-10-06,73,Black,Badger,Curry,Male,Manage=7This,Ipswich,Suffolk,IP1 2DA,thomasold@gmail.com,01473 712233,07570 282737,2,"11,541.90",16
6,Blake,Abney-James,,2008-10-02,16,Teal,Goose,Pizza,Female,All-spice77,Andover,Hampshire,SP10 2EA,abneyallseeing@outlook.com,01264 338733,07812 132687,2,"1,331.20",22
7,Indigo,Pearce,Y,1954-07-25,70,Grey,Crab,Curry,Non-binary,Geneva(banana)9,Rhyl,Clwyd,LL18 1AS,junk@icloud.com,01745 344567,03301 623763,6,"33,000.00",42
8,Rowan,Weaver,-,1973-08-25,51,Cyan,Cow,Crumpets,-,Yellow8Gold%,Warminster,Wiltshire,BA12 9BT,myotheraddress@gmail.com,01985 068271,07305 268271,7,"41,275.00",52
9,Jordan,Mayfield,{NULL},1975-11-14,49,Violet,Beaver,Pie and Chips,Prefer not to answer,TakeThat2000!,Yelverton,Devon,PL20 6DT,mayfield_all@gmail.com,01822 618440,07903 438339,8,"52,370.00",29


In [5]:
# hash passwords (one-way SHA-256 digest, not reversible encryption)
def hash_password(pw):
    """Return the SHA-256 hex digest of ``pw``, or ``None`` when it is missing.

    The password is encoded as UTF-8 before hashing.

    NOTE(review): the digest is unsalted, which is generally considered weak
    for password storage; a salted key-derivation function would change the
    stored format, so confirm against the database schema before switching.
    """
    if not pd.isna(pw):
        digest = hashlib.sha256(pw.encode('utf-8'))
        return digest.hexdigest()
    return None

In [6]:
# fix salary column
def clean_salary(value):
    """Strip everything except digits and dots from a salary value.

    Thousands separators, currency symbols and spaces are removed, e.g.
    ``'19,500.00'`` becomes ``'19500.00'``. The dot is kept as the decimal
    separator, so a continental format such as ``'1.581,00'`` would come out
    as ``'1.58100'`` and needs converting before it is passed in.

    Parameters
    ----------
    value : str, number or missing
        Raw salary. Non-string values are converted with ``str`` first.

    Returns
    -------
    str or None
        The cleaned numeric text, or ``None`` when ``value`` is missing or
        contains no digits at all (for example ``'na'`` or ``'-'``).
    """
    if pd.isna(value):
        return None
    # re.sub only accepts text; numeric cells would otherwise raise TypeError.
    stripped = re.sub(r"[^\d.]", '', str(value))
    # Nothing numeric left means there was no amount, not an empty amount.
    if not re.search(r"\d", stripped):
        return None
    return stripped

In [7]:
# clean middle initial
# clean gender
# clean rqf
def clean_column(value, null_markers=('BLANK', 'NA', 'NONE', '-', '{NULL}', '')):
    """Map placeholder values that stand for "no data" to ``None``.

    Parameters
    ----------
    value : any
        A single cell value.
    null_markers : collection of str, optional
        Placeholder texts to treat as missing. A string cell is stripped and
        upper-cased before the lookup, so the markers must be given in upper
        case. The default covers the placeholders seen in the UK export;
        pass a different collection for other sources (e.g. one that
        includes ``'VIDE'``).

    Returns
    -------
    any or None
        ``None`` for a string matching one of ``null_markers`` or for a float
        NaN; otherwise ``value`` unchanged.
    """
    if isinstance(value, str):
        if value.strip().upper() in null_markers:
            return None
        return value
    # Only floats are tested for NaN; every other type passes straight through.
    if isinstance(value, float) and pd.isna(value):
        return None
    return value

In [9]:
# clean columns
# Run clean_column over every cell of every column to replace placeholder values with None.
for col in users.columns:
    users[col] = users[col].apply(clean_column)
# hash password
# NOTE(review): this cell overwrites `users` in place, so running it a second time
# hashes the already-hashed passwords again - re-run from the load cell instead.
users['password'] = users['password'].apply(hash_password)
# clean salary
users['salary'] = users['salary'].apply(clean_salary)
users.head()

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,1965-01-07,60,Red,Elephant,Bangers and Mash,Male,5e30d824b17bd930b9280c126a717d59ccdb4cd05aa8ee...,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3.0,19500.0,7
1,David,Button,,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,22aa055adf8caa10b761514ffed59044adbc14a363c34c...,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4.0,21000.0,15
2,Ian,Smythe,JO,1925-01-03,100,Blue,Cat,Toad in the Hole,,1d82e587a6c6a44b1833e2a1ce7460a1ae0b74ca24afc5...,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5.0,23000.0,28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,3bedb97c70c5ae128ef084645556bfbcf4572dde3e028d...,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6.0,32500.0,34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,59700b2f9a7569c7a4e3862b29e4b04806714c79acaabf...,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,,,3


In [20]:
# standardise login timestamps
def clean_timestamp(value):
    """Format a Unix epoch timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string.

    Parameters
    ----------
    value : int, float or missing
        Seconds since the Unix epoch.

    Returns
    -------
    str or None
        The formatted UTC time, or ``None`` when ``value`` is missing, in
        line with the other cleaning helpers in this notebook.
    """
    # datetime.fromtimestamp raises on None/NaN, so filter missing values first.
    if pd.isna(value):
        return None
    return datetime.fromtimestamp(value, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

In [21]:
# Load the login export.
logins = pd.read_csv('data/UK-User-LoginTS.csv')
# change column names
# NOTE(review): positional rename - assumes the file has exactly these three columns in this order; TODO confirm.
logins.columns = ['login_id', 'username', 'login_timestamp']
# clean_timestamp formats each value as a UTC date-time string; to_datetime then parses it into a datetime column.
logins['login_timestamp'] = pd.to_datetime(logins['login_timestamp'].apply(clean_timestamp))
logins.head()

Unnamed: 0,login_id,username,login_timestamp
0,1,card49a@gmail.com,2025-01-05 10:12:40
1,2,card49a@gmail.com,2025-01-09 20:39:23
2,3,card49a@gmail.com,2025-01-14 06:52:53
3,4,card49a@gmail.com,2025-01-18 17:10:01
4,5,card49a@gmail.com,2025-01-23 03:28:32


In [11]:
# check for any missing values
# (one boolean per column: True if that column contains at least one missing value)
logins.isna().any()

login_id           False
username           False
login_timestamp    False
dtype: bool

In [12]:
# quick column counter
# Prints the first five rows of value_counts() (the most frequent values) for each column.
for col in logins.columns:
    print(logins[col].value_counts().head(), "\n")

login_id
1    1
2    1
3    1
4    1
5    1
Name: count, dtype: int64 

username
myotheraddress@gmail.com      52
junk@icloud.com               42
busybusy@yahoo.com            34
mayfield_all@gmail.com        29
long.65.morning@icloud.com    28
Name: count, dtype: int64 

login_timestamp
2025-01-09    10
2025-01-23    10
2025-01-27     9
2025-01-05     9
2025-01-31     9
Name: count, dtype: int64 



In [22]:
# Feed create_database.sql to the sqlite3 command-line tool against customers.db.
# Presumably this creates the `users` and `logins` tables that the to_sql cells
# append to - verify against create_database.sql.
!sqlite3 customers.db < create_database.sql

In [23]:
# Open a connection to customers.db; the conn.close() cell further down releases it.
conn = sqlite3.connect("customers.db")

In [24]:
# Append the cleaned users to the `users` table (the DataFrame index is not written).
# NOTE(review): if_exists='append' adds the rows on every run, so re-running this
# cell may duplicate data unless table constraints reject it - TODO confirm.
users.to_sql('users', conn, if_exists='append', index=False)

10

In [25]:
# Append the login records to the `logins` table (the DataFrame index is not written).
# NOTE(review): if_exists='append' adds the rows on every run, so re-running this
# cell may duplicate data unless table constraints reject it - TODO confirm.
logins.to_sql('logins', conn, if_exists='append', index=False)

248

In [None]:
# Close the database connection held in `conn`.
conn.close()

In [23]:
# Load the French user export for inspection; no cleaning is applied in this cell.
# NOTE(review): no encoding is passed here, unlike the UK load which uses latin1 -
# confirm pandas' default encoding matches this file.
users_fr = pd.read_csv('data/FR User Data.csv')
print(users_fr.dtypes)
# NOTE(review): this displays the whole frame, including plaintext passwords -
# consider .head() and clearing the output before sharing.
users_fr

Prénom                                                object
Nom de famille                                        object
DdN                                                   object
\nÂge dernier anniversaire                             int64
Couleur préférée                                      object
Animal préféré                                        object
Plat préféré                                          object
Genre                                                 object
Mot de passe                                          object
Ville                                                 object
Département                                           object
Code postal                                            int64
Adresse électronique                                  object
Téléphone                                             object
Portable                                              object
BAC+                                                  object
 Salaire                

Unnamed: 0,Prénom,Nom de famille,DdN,\nÂge dernier anniversaire,Couleur préférée,Animal préféré,Plat préféré,Genre,Mot de passe,Ville,Département,Code postal,Adresse électronique,Téléphone,Portable,BAC+,Salaire,Visites du site Web au cours des 30 derniers jours
0,Adèle Françoise,Bisset,16-10-01,108,Jaune,Tigre,Ratatouille,F,BUXe$E2Y/4+mX!J,Villevenard,Marne,51270,bisset16@live.com,03 26 80 52 40,06 11 53 00 93,Baccalauréat,"1.581,00",17
1,Adrien Jacques,Abadie,85-05-05,39,Bleu,Cheval,Cassoulet,M,t4BPtPe.Nis/EJS,Lille,Nord,59800,ajabadie@outlook.com,03 20 15 84 40,06 81 43 00 10,Licentiate,"2.979,50",25
2,Bruno Jean-Baptiste,Chevrolet,30-06-26,94,Gris,Mouton,Quiche lorraine,M,"68,cj%L4wALVksu",Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,05 62 34 32 36,06 88 76 27 26,Baccalauréat,"1.058,00",29
3,Cassandre,Fortier,02-03-01,22,Marron,Poule,Crêpes,vide,"vXE,E!9dK,cq4_2",Béziers,Hérault,34500,fortier02@webmail.free.fr,04 67 36 73 73,06 77 70 77 03,Master,"3.785,50",44
4,Ugène,Gagnon,48-01-05,77,Rouge,Cochon,Bouillabaisse,-,?de/7C9eJ?SdmsZ,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,01 83 75 56 56,06 01 00 00 69,CFA,"1.581,00",7
5,José-Maria,Lamar,11-10-13,13,Vert,Lapin,Chocolate soufflé,F,F8%cM3?sjQP@JnY,Poitiers,Vienne,86000,lapinfou67@list-manage.com,05 49 88 12 34,06 95 83 13 62,Collège,na,14
6,Sacha,Martel,08-11-02,16,Rose,Poisson Rouge,Tarte Tatin,NB,i9/_yz&3mG4+Za$,Vannes,Morbihan,56000,sachalepoisson@live.com,02 97 54 34 34,07 88 15 75 58,Lycée,12500,32
7,Elvire Françoise,Sartre,63-02-11,62,Noir,Souris,Croque monsieur,F,gSNzDVa?rur2GT5,Nevers,Nièvre,58000,noirsartre@outlook.com,03 86 36 15 15,07 89 63 13 57,Doctorat,"4.800,00",24
8,Émile Jean,Travers,93-02-09,32,Argent,Cerf,Coq au vin,M,"?nEz?@x,C$6wK*@",Arras,Pas-de-Calais,62000,travers93@live.com,03 21 23 69 69,06 61 51 90 25,Baccalauréat,"1.925,00",56
9,Capucine,Verne,77-05-15,47,Pourpre,Loup,Boeuf Bourguignon,NB,&P2D_xH&%dhFdg3,Nice,Alpes-Maritimes,6400,verne77@webmail.free.fr,04 93 68 11 49,06 10 82 11 71,Licentiate,"3.025,00",37


In [9]:
# Export explorer.ipynb as a Python script (explorer.py), leaving out the In/Out prompts.
!jupyter nbconvert --no-prompt --to script explorer.ipynb

[NbConvertApp] Converting notebook explorer.ipynb to script
[NbConvertApp] Writing 2463 bytes to explorer.py


In [46]:
# Read the most recent exchange rates from finance.db.
# NOTE(review): this rebinds `conn` to a different database, and the new connection
# is not closed in this cell.
conn = sqlite3.connect("finance.db")
# The subquery ranks every currency_exchange row by timestamp, newest first;
# keeping rnk = 1 returns all rows that share the latest timestamp.
# NOTE(review): `rate_to_gdp` may be intended as `rate_to_gbp` - confirm against the table schema.
sql_str = """
SELECT
  currency
, rate_to_gdp
FROM (
    SELECT *,
           RANK() OVER (ORDER BY timestamp DESC) AS rnk
    FROM currency_exchange
) ranked
WHERE rnk = 1;
"""
curr = pd.read_sql(sql_str, conn)
curr.head()

Unnamed: 0,currency,rate_to_gdp
0,Argentine Peso,1605.141978
1,Australian Dollar,2.081207
2,Bahraini Dinar,0.51051
3,Botswana Pula,18.112975
4,Brazilian Real,7.497063
