In [6]:
import pandas as pd
from datetime import datetime
import re
import hashlib
import sqlite3
import logging

In [7]:
logging.basicConfig(level=logging.INFO)

In [8]:
# fix column names
def tidy_columns(df, mapping=None):
    df.columns = [x.lower().strip().replace(' ', '_') for x in df.columns]
    if mapping:
        df = df.rename(columns=mapping)
    return df

In [9]:
# fix DOB
def clean_dob(value, lim_year=25):
    if pd.isna(value):
        return None
    else:
        day, month, year = map(int, value.strip().split('/'))
        if year >= lim_year:
            year += 1900
        else:
            year += 2000
        return datetime(year, month, day)

In [10]:
def infer_dob(date, age):
    formats = ["%d/%m/%y", "%y-%m-%d"]
    for fmt in formats:
        try:
            dob = datetime.strptime(date.strip(), fmt)
            dob = dob.replace(year=datetime.now().year - int(age))
            return dob.date()
        except:
            continue
    return None

In [11]:
def load_user_data(filepath, encoding='utf-8'):
    logging.info(f"Loading data from {filepath}")
    df = pd.read_csv(filepath, encoding=encoding)
    return df

In [12]:
def hash_password(pw, encoding='utf-8'):
    if pd.isna(pw):
        return None
    else:
        return hashlib.sha256(pw.encode(encoding)).hexdigest()

In [None]:
def clean_gender(value, mapping=None):
    if pd.isna(value):
        return None
    elif mapping is None:
        return value
    else:
        try:
            new_value = mapping[value]
            return new_value
        except KeyError as e:
            logging.warning(f"Unknown gender value: {value}")
            return None
        except Exception as e:
            logging.error(f"Error cleaning gender value: {value}, {e}")
        return None

In [37]:
def add_education_column(df, mapping):
    if mapping is None:
        logging.warning("No mapping provided for education column.")
    elif 'education' in df.columns:
        logging.info(f"Add RFQ column")
        df['rqf'] = df['education'].apply(lambda x: mapping.get(str(x), None))
    elif 'rqf' in df.columns:
        df['education'] = df['rqf'].apply(lambda x: mapping.get(str(x), None))
    else:
        logging.warning("No education or RFQ column found in the DataFrame.")
    return df

In [14]:
def clean_salary(value, period=1):
    if pd.isna(value):
        return None
    else:
        salary = round(int(re.sub(r"[^\d]", '', value))/100, 2)* period
        if salary < 0:
            logging.warning(f"Negative salary found: {value}")
        return salary

In [23]:
def clean_column(value):
    exclusions = ['BLANK', 'NA', 'NONE', '-', '{NULL}', 'VIDE', '']
    if isinstance(value, str) and value.strip().upper() in exclusions:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    else:
        return value

In [None]:
def transform_users(df, country_code, column_mapping=None, gender_mapping=None, education_mapping=None, payment_period=1, currency='GBP'):
    logging.info("Transforming user data...")
    logging.info(f"Tidying columns")
    df = tidy_columns(df, column_mapping)
    logging.info(f"Cleaning DOB column")
    df['dob'] = df.apply(lambda row: infer_dob(row['dob'], row['age_last_birthday']), axis=1)
    logging.info(f"Hashing password")
    df['password'] = df['password'].apply(hash_password)
    logging.info(f"Cleaning string columns")
    for col in df.columns:
        if col not in ['password', 'dob']:
            df[col] = df[col].apply(clean_column)
    logging.info(f"Cleaning gender column")
    df['gender'] = df['gender'].apply(lambda row: clean_gender(row, gender_mapping))
    logging.info(f"Checking education column")
    df = add_education_column(df, education_mapping)
    logging.info(f"Cleaning salary column")
    df['salary'] = df['salary'].apply(lambda x: clean_salary(x, payment_period))
    logging.info(f"Setting currency")
    df['currency'] = currency
    logging.info(f"Adding country code")
    df['country_code'] = country_code
    return df

In [17]:
def load_login_data(filepath, timezone):
    logging.info(f"Loading login data from {filepath}")
    df = pd.read_csv(filepath)
    df.columns = ['login_id', 'username', 'login_timestamp']
    logging.info(f"Tidying columns")
    tidy_columns(df)
    df.drop(columns=['login_id'], inplace=True)
    # convert timestamp to datetime
    df['login_timestamp'] = pd.to_datetime(df['login_timestamp'], unit='s', utc=False)
    df['login_timestamp'] = df['login_timestamp'].dt.tz_localize(timezone).dt.tz_convert('UTC')
    return df

In [33]:
uk_education_mapping = {
    '1': 'Vocational Qualification Level 1',
    '2': 'Vocational Qualification Level 2',
    '3': 'A Level',
    '4': 'Higher National Certificate',
    '5': 'Higher National Diploma',
    '6': 'Bachelor’s Degree',
    '7': 'Master’s Degree',
    '8': 'Doctorate Degree'
}

In [None]:
users_uk = load_user_data('data/UK User Data.csv',
                          encoding='latin1')
users_uk = transform_users(users_uk, 'UK',
                           education_mapping=uk_education_mapping,
                           currency='GBP')
logins_uk = load_login_data('data/UK-User-LoginTS.csv', 'Europe/London')
display(users_uk.head(10), logins_uk.head())

INFO:root:Loading data from data/UK User Data.csv
INFO:root:Transforming user data...
INFO:root:Tidying columns
INFO:root:Cleaning DOB column
INFO:root:Hashing password
INFO:root:Cleaning string columns
INFO:root:Cleaning gender column
INFO:root:Checking education column
INFO:root:Cleaning salary column
INFO:root:Adding country code
INFO:root:Loading login data from data/UK-User-LoginTS.csv
INFO:root:Tidying columns


Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,...,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days,education,country_code
0,Derek,Card,A,1965-01-07,60,Red,Elephant,Bangers and Mash,Male,5e30d824b17bd930b9280c126a717d59ccdb4cd05aa8ee...,...,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3.0,19500.0,7,A Level,UK
1,David,Button,,2000-08-22,25,Green,Giraffe,Cottage Pie,Male,22aa055adf8caa10b761514ffed59044adbc14a363c34c...,...,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4.0,21000.0,15,Higher National Certificate,UK
2,Ian,Smythe,JO,1925-01-03,100,Blue,Cat,Toad in the Hole,,1d82e587a6c6a44b1833e2a1ce7460a1ae0b74ca24afc5...,...,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5.0,23000.0,28,Higher National Diploma,UK
3,Samantha,Jones,D,1992-03-24,33,Indigo,Wolf,Roast,Female,3bedb97c70c5ae128ef084645556bfbcf4572dde3e028d...,...,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6.0,32500.0,34,Bachelor’s Degree,UK
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,59700b2f9a7569c7a4e3862b29e4b04806714c79acaabf...,...,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,,,3,,UK
5,Jude,Thomas,,1952-10-06,73,Black,Badger,Curry,Male,0cf67c5ec09b4211deea15515beea2485d96e2d80a1566...,...,Suffolk,IP1 2DA,thomasold@gmail.com,01473 712233,07570 282737,2.0,11541.9,16,Vocational Qualification Level 2,UK
6,Blake,Abney-James,,2009-10-02,16,Teal,Goose,Pizza,Female,bac8c6138fac3aade4b2ed077a25a7fb73856d3e99f49c...,...,Hampshire,SP10 2EA,abneyallseeing@outlook.com,01264 338733,07812 132687,2.0,1331.2,22,Vocational Qualification Level 2,UK
7,Indigo,Pearce,Y,1955-07-25,70,Grey,Crab,Curry,Non-binary,9fca73975f6e7db416bde669f4360a35647b918db570ec...,...,Clwyd,LL18 1AS,junk@icloud.com,01745 344567,03301 623763,6.0,33000.0,42,Bachelor’s Degree,UK
8,Rowan,Weaver,,1974-08-25,51,Cyan,Cow,Crumpets,,be1cd42a7a307da7fdc0f01eadfc7384edbcdbfcae8128...,...,Wiltshire,BA12 9BT,myotheraddress@gmail.com,01985 068271,07305 268271,7.0,41275.0,52,Master’s Degree,UK
9,Jordan,Mayfield,,1976-11-14,49,Violet,Beaver,Pie and Chips,Prefer not to answer,397f623fe2e928e1a455e6ae2985ad4082824a9d1b7908...,...,Devon,PL20 6DT,mayfield_all@gmail.com,01822 618440,07903 438339,8.0,52370.0,29,Doctorate Degree,UK


Unnamed: 0,username,login_timestamp
0,card49a@gmail.com,2025-01-05 10:12:40+00:00
1,card49a@gmail.com,2025-01-09 20:39:23+00:00
2,card49a@gmail.com,2025-01-14 06:52:53+00:00
3,card49a@gmail.com,2025-01-18 17:10:01+00:00
4,card49a@gmail.com,2025-01-23 03:28:32+00:00


In [39]:
french_to_english_columns = {
    'prénom': 'first_name',
    'nom_de_famille': 'surname',
    'ddn': 'dob',
    'âge_dernier_anniversaire': 'age_last_birthday',
    'couleur_préférée': 'favourite_colour',
    'animal_préféré': 'favourite_animal',
    'plat_préféré': 'favourite_food',
    'genre': 'gender',
    'mot_de_passe': 'password',
    'ville': 'city',
    'département': 'county',
    'code_postal': 'postcode',
    'adresse_électronique': 'email',
    'téléphone': 'phone',
    'portable': 'mobile',
    'bac+': 'education',
    'salaire': 'salary',
    'visites_du_site_web_au_cours_des_30_derniers_jours': 'website_visits_last_30_days'
}

In [40]:
french_to_uk_gender = {
    'M': 'Male',
    'F': 'Female',
    'NB': 'Non-Binary'
}

In [45]:
french_to_uk_education = {
    'Collège': '1',
    'Lycée': '2',
    'Baccalauréat': '3',
    'CFA': '5',
    'Licentiate': '6',
    'Master': '7',
    'Doctorat': '8'
}

In [None]:
users_fr = load_user_data('data/FR User Data.csv')
users_fr = transform_users(users_fr, country_code='FR',
                           column_mapping=french_to_english_columns,
                           gender_mapping=french_to_uk_gender,
                           education_mapping=french_to_uk_education,
                           payment_period=12,
                           currency='EUR')
logins_fr = load_login_data('data/FR-User-LoginTS.csv', 'Europe/Paris')
display(users_fr.head(10), logins_fr.head())

INFO:root:Loading data from data/FR User Data.csv
INFO:root:Transforming user data...
INFO:root:Tidying columns
INFO:root:Cleaning DOB column
INFO:root:Hashing password
INFO:root:Cleaning string columns
INFO:root:Cleaning gender column
INFO:root:Checking education column
INFO:root:Add RFQ column
INFO:root:Cleaning salary column
INFO:root:Adding country code
INFO:root:Loading login data from data/FR-User-LoginTS.csv
INFO:root:Tidying columns


Unnamed: 0,first_name,surname,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,education,salary,website_visits_last_30_days,rqf,country_code
0,Adèle Françoise,Bisset,1917-10-01,108,Jaune,Tigre,Ratatouille,Female,f87450b4271d4280d2495e8ad81adeaf22ae1ab0e035f4...,Villevenard,Marne,51270,bisset16@live.com,03 26 80 52 40,06 11 53 00 93,Baccalauréat,18972.0,17,3,FR
1,Adrien Jacques,Abadie,1986-05-05,39,Bleu,Cheval,Cassoulet,Male,29ac55ddddedc7829a09ceddbe99fdb534c44836fe3e7e...,Lille,Nord,59800,ajabadie@outlook.com,03 20 15 84 40,06 81 43 00 10,Licentiate,35754.0,25,6,FR
2,Bruno Jean-Baptiste,Chevrolet,1931-06-26,94,Gris,Mouton,Quiche lorraine,Male,743cc90acc07fd2a7b571f5453c88a93e279378e9a641c...,Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,05 62 34 32 36,06 88 76 27 26,Baccalauréat,12696.0,29,3,FR
3,Cassandre,Fortier,2003-03-01,22,Marron,Poule,Crêpes,,efdd573868230c90e1f4264376446cfc807b19ad6a52ad...,Béziers,Hérault,34500,fortier02@webmail.free.fr,04 67 36 73 73,06 77 70 77 03,Master,45426.0,44,7,FR
4,Ugène,Gagnon,1948-01-05,77,Rouge,Cochon,Bouillabaisse,,b7c87e5534702352e89ea25f624c0e4f38b5f67c4167bb...,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,01 83 75 56 56,06 01 00 00 69,CFA,18972.0,7,5,FR
5,José-Maria,Lamar,2012-10-13,13,Vert,Lapin,Chocolate soufflé,Female,52272488038fadd1ebb7d5e1b46315e398e7a6b3dd3f2d...,Poitiers,Vienne,86000,lapinfou67@list-manage.com,05 49 88 12 34,06 95 83 13 62,Collège,,14,1,FR
6,Sacha,Martel,2009-11-02,16,Rose,Poisson Rouge,Tarte Tatin,Non-Binary,eaf1d347a607004258feee95956e3c93e38c660dbb1e68...,Vannes,Morbihan,56000,sachalepoisson@live.com,02 97 54 34 34,07 88 15 75 58,Lycée,1500.0,32,2,FR
7,Elvire Françoise,Sartre,1963-02-11,62,Noir,Souris,Croque monsieur,Female,a5aca72150e33e9ea96b7fcc1c62a0eee7c2b116644193...,Nevers,Nièvre,58000,noirsartre@outlook.com,03 86 36 15 15,07 89 63 13 57,Doctorat,57600.0,24,8,FR
8,Émile Jean,Travers,1993-02-09,32,Argent,Cerf,Coq au vin,Male,7999f9f14926157e2670072974cfd242a47c0651b5ecb5...,Arras,Pas-de-Calais,62000,travers93@live.com,03 21 23 69 69,06 61 51 90 25,Baccalauréat,23100.0,56,3,FR
9,Capucine,Verne,1978-05-15,47,Pourpre,Loup,Boeuf Bourguignon,Non-Binary,86d89abfc56b7d27018325f993f4d219bd943ab7548ba4...,Nice,Alpes-Maritimes,6400,verne77@webmail.free.fr,04 93 68 11 49,06 10 82 11 71,Licentiate,36300.0,37,6,FR


Unnamed: 0,username,login_timestamp
0,bisset16@live.com,2025-01-02 18:49:24+00:00
1,bisset16@live.com,2025-01-04 14:33:35+00:00
2,bisset16@live.com,2025-01-06 10:24:58+00:00
3,bisset16@live.com,2025-01-08 06:09:43+00:00
4,bisset16@live.com,2025-01-10 01:52:29+00:00


In [47]:
!sqlite3 customers.db < create_database.sql

In [48]:
conn = sqlite3.connect("customers.db")

In [49]:
def update_users_table(users_df, conn):
    try:
        new_count = users_df.to_sql('users', conn, if_exists='append', index=False)
        logging.info(f"Inserted {new_count} new records into users table")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        conn.rollback()
    finally:
        conn.commit()
    return

In [51]:
update_users_table(users_uk, conn)
update_users_table(users_fr, conn)

ERROR:root:Unexpected error: UNIQUE constraint failed: users.email
ERROR:root:Unexpected error: UNIQUE constraint failed: users.email


In [52]:
def update_login_table(logins_df, conn):
    try:
        sql_str = """
        SELECT DISTINCT
        user_id
        , email
        FROM users
        order by user_id
        """
        key_lkp = pd.read_sql(sql_str, conn)
        logins_df_lkp = logins_df.merge(key_lkp, left_on='username', right_on='email', how='inner')
        logins_df_lkp = logins_df_lkp[['user_id', 'login_timestamp']]
        new_count = logins_df_lkp.to_sql('logins', conn, if_exists='append', index=False)
        logging.info(f"Inserted {new_count} new records into logins table")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        conn.rollback()
    finally:
        conn.commit()
    return

In [53]:
update_login_table(logins_uk, conn)
update_login_table(logins_fr, conn)

INFO:root:Inserted 248 new records into logins table
INFO:root:Inserted 285 new records into logins table


In [126]:
conn.close()

In [9]:
!jupyter nbconvert --no-prompt --to script explorer.ipynb

[NbConvertApp] Converting notebook explorer.ipynb to script
[NbConvertApp] Writing 2463 bytes to explorer.py
