# member_pos (MP): Raw Data Discovery & Insights

### Target Attributes: Member_id, Purchase Date

In [None]:
import sqlite3
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns

# Open the raw database only for the duration of the read; the finally clause
# releases the SQLite handle even when the query itself fails.
conn = sqlite3.connect(r"__PATH__")
try:
    df = pd.read_sql_query("SELECT * FROM member_pos", conn)
finally:
    conn.close()

### Data Understanding, Cleaning & Preparation

In [2]:
# (rows, columns) of the frame read from the member_pos table.
df.shape

(2634, 13)

In [3]:
# Preview the first rows to see the raw column names and value formats.
df.head()

Unnamed: 0,pos_row_id,member_id,First Name,Last Name,Purchase Date,Item,Category,Quantity,Unit Price,Total Price,Payment Method,Purchase ID,Cashier
0,1,ABC1311,Allen,Gross,2024-06-25,Protein Shake,Beverage,1,$5.00,$5.00,Mobile Pay,POS-01766,John Smith
1,2,ABC1604,Benjamin,Alexander,2024-08-26,Guest Pass,,1,$10.00,$10.00,Credit Card,POS-02500,ALEX KIM
2,3,ABC1103,Joshua,Wilson,2025-06-14,Energy Bar,,2,$2.50,$5.00,Credit Card,POS-01273,John Smith
3,4,ABC1322,James,Diaz,2024-10-05,Guest Pass,Service,1,$10.00,$10.00,Cash,POS-01791,Sarah Lee
4,5,ABC1239,Thomas,Pena,2024-12-27,Energy Bar,Snack,1,$2.50,$2.50,Credit Card,POS-01589,Front Desk


In [4]:
# Normalise column labels to snake_case: lower-case, spaces -> underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

In [5]:
# Show the column labels together with the dtype pandas assigned to each column.
df.columns, df.dtypes

(Index(['pos_row_id', 'member_id', 'first_name', 'last_name', 'purchase_date',
        'item', 'category', 'quantity', 'unit_price', 'total_price',
        'payment_method', 'purchase_id', 'cashier'],
       dtype='object'),
 pos_row_id         int64
 member_id         object
 first_name        object
 last_name         object
 purchase_date     object
 item              object
 category          object
 quantity           int64
 unit_price        object
 total_price       object
 payment_method    object
 purchase_id       object
 cashier           object
 dtype: object)

In [6]:
# Keep only the columns used for the per-member purchase summary.
# .copy() gives an independent frame so later column assignments do not
# operate on a slice of the full table.
df = df[['member_id', 'purchase_date', 'quantity', 'total_price']].copy()

In [7]:
# Preview the frame after the column selection.
df.head()

Unnamed: 0,member_id,purchase_date,quantity,total_price
0,ABC1311,2024-06-25,1,$5.00
1,ABC1604,2024-08-26,1,$10.00
2,ABC1103,2025-06-14,2,$5.00
3,ABC1322,2024-10-05,1,$10.00
4,ABC1239,2024-12-27,1,$2.50


In [8]:
# Count missing values per column.
df.isna().sum()

member_id        0
purchase_date    0
quantity         0
total_price      0
dtype: int64

In [9]:
# Parse the purchase_date column into datetimes so that date comparisons
# (e.g. max per member) operate on real timestamps.
parsed_purchase_dates = pd.to_datetime(df['purchase_date'])
df['purchase_date'] = parsed_purchase_dates

In [10]:
# Spot-check: list every purchase row for one example member.
df[df['member_id'] == 'ABC1000']

Unnamed: 0,member_id,purchase_date,quantity,total_price
49,ABC1000,2024-11-10,1,$5.00
1221,ABC1000,2025-01-11,1,$12.00
2029,ABC1000,2025-02-26,1,$12.00
2172,ABC1000,2025-05-19,1,$5.00


In [11]:
# Convert currency text such as "$5.00" to float. The character class also
# strips thousands separators ("$1,250.00"), which would otherwise make
# astype(float) raise. Series.replace leaves non-string values untouched, so
# re-running the cell on the already-converted column is harmless.
df['total_price'] = df['total_price'].replace('[$,]', '', regex=True).astype(float)

In [12]:
# Per-member spend: one row per member_id with the sum of total_price.
total_purchase_value = df.groupby('member_id')['total_price'].sum().reset_index(name='total_purchase_value')

# Broadcast the per-member total onto every purchase row by looking it up via
# member_id. Assigning the column (instead of df = df.merge(...)) keeps the
# cell idempotent: re-running it overwrites the column rather than producing
# total_purchase_value_x / _y duplicates.
df['total_purchase_value'] = df['member_id'].map(
    total_purchase_value.set_index('member_id')['total_purchase_value']
)

df[df['member_id'] == 'ABC1000']

Unnamed: 0,member_id,purchase_date,quantity,total_price,total_purchase_value
49,ABC1000,2024-11-10,1,5.0,34.0
1221,ABC1000,2025-01-11,1,12.0,34.0
2029,ABC1000,2025-02-26,1,12.0,34.0
2172,ABC1000,2025-05-19,1,5.0,34.0


In [13]:
# Per-member item count: one row per member_id with the sum of quantity.
total_items_bought = df.groupby('member_id')['quantity'].sum().reset_index(name='total_items_bought')

# Broadcast the per-member count onto every purchase row by looking it up via
# member_id. Assigning the column (instead of df = df.merge(...)) keeps the
# cell idempotent: re-running it overwrites the column rather than producing
# total_items_bought_x / _y duplicates.
df['total_items_bought'] = df['member_id'].map(
    total_items_bought.set_index('member_id')['total_items_bought']
)

df[df['member_id'] == 'ABC1000']

Unnamed: 0,member_id,purchase_date,quantity,total_price,total_purchase_value,total_items_bought
49,ABC1000,2024-11-10,1,5.0,34.0,4
1221,ABC1000,2025-01-11,1,12.0,34.0,4
2029,ABC1000,2025-02-26,1,12.0,34.0,4
2172,ABC1000,2025-05-19,1,5.0,34.0,4


In [14]:
# Most recent purchase date per member, repeated on each of that member's rows.
purchase_dates_by_member = df.groupby('member_id')['purchase_date']
df['last_purchase_date'] = purchase_dates_by_member.transform('max')

df[df['member_id'] == 'ABC1000']

Unnamed: 0,member_id,purchase_date,quantity,total_price,total_purchase_value,total_items_bought,last_purchase_date
49,ABC1000,2024-11-10,1,5.0,34.0,4,2025-05-19
1221,ABC1000,2025-01-11,1,12.0,34.0,4,2025-05-19
2029,ABC1000,2025-02-26,1,12.0,34.0,4,2025-05-19
2172,ABC1000,2025-05-19,1,5.0,34.0,4,2025-05-19


In [15]:
# Collapse to one row per member. The aggregate columns carry the same value
# on every row of a member, so keeping the first occurrence is sufficient.
summary_columns = ['member_id', 'total_items_bought', 'total_purchase_value', 'last_purchase_date']
member_shop_summary = df[summary_columns].drop_duplicates(subset='member_id')

member_shop_summary[member_shop_summary['member_id'] == 'ABC1000']

Unnamed: 0,member_id,total_items_bought,total_purchase_value,last_purchase_date
49,ABC1000,4,34.0,2025-05-19


In [16]:
# Preview the first ten rows of the per-member summary.
member_shop_summary.head(10)

Unnamed: 0,member_id,total_items_bought,total_purchase_value,last_purchase_date
0,ABC1311,1,5.0,2024-06-25
1,ABC1604,8,65.0,2025-05-08
2,ABC1103,3,7.5,2025-06-14
3,ABC1322,1,10.0,2024-10-05
4,ABC1239,9,45.0,2025-08-02
5,ABC1763,6,48.5,2025-03-27
6,ABC1568,5,41.0,2025-06-11
7,ABC1098,7,61.0,2025-07-31
8,ABC1861,4,47.0,2025-07-19
9,ABC1436,3,28.0,2025-04-09


In [17]:
# (rows, columns) of the summary; the row count equals the number of distinct member_id values.
member_shop_summary.shape

(900, 4)

### Load to clean_db

In [None]:
# Write the per-member summary to the output database, replacing any existing
# member_pos table there. The finally clause closes the handle even if the
# write fails, so a failed run does not leave the database file locked.
con_out = sqlite3.connect(r"__PATH__")
try:
    member_shop_summary.to_sql("member_pos", con_out, if_exists="replace", index=False)
finally:
    con_out.close()