# GOAL : 
## FIND THE TYPE OF CUSTOMERS WHO PURCHASE CHIPS AND THEIR PURCHASING BEHAVIOUR WITHIN THE REGION

# PART ONE

## Examine Transaction Data
1. Look for inconsistencies, missing data across the dataset, outliers, correctly identified category items, numeric data across all tables.
2. In case of identified anomalies, make necessary changes to the dataset and save it.

In [70]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chardet
from datetime import datetime, timedelta

In [71]:
# encoding used in the file
with open ("QVI_transaction_data.csv", mode='rb') as file:
    raw_bytes = file.read(5)
    detected_encoding = chardet.detect(raw_bytes)['encoding']
    print(detected_encoding)

ascii


In [72]:
# convert from ascii to utf-8
with open("QVI_transaction_data.csv", encoding='utf-8') as file:
    rows = list(csv.reader(file))
    header = rows[0]
    print(rows[:10])

[['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR', 'PROD_NAME', 'PROD_QTY', 'TOT_SALES'], ['43390', '1', '1000', '1', '5', 'Natural Chip        Compny SeaSalt175g', '2', '6'], ['43599', '1', '1307', '348', '66', 'CCs Nacho Cheese    175g', '3', '6.3'], ['43605', '1', '1343', '383', '61', 'Smiths Crinkle Cut  Chips Chicken 170g', '2', '2.9'], ['43329', '2', '2373', '974', '69', 'Smiths Chip Thinly  S/Cream&Onion 175g', '5', '15'], ['43330', '2', '2426', '1038', '108', 'Kettle Tortilla ChpsHny&Jlpno Chili 150g', '3', '13.8'], ['43604', '4', '4074', '2982', '57', 'Old El Paso Salsa   Dip Tomato Mild 300g', '1', '5.1'], ['43601', '4', '4149', '3333', '16', 'Smiths Crinkle Chips Salt & Vinegar 330g', '1', '5.7'], ['43601', '4', '4196', '3539', '24', 'Grain Waves         Sweet Chilli 210g', '1', '3.6'], ['43332', '5', '5026', '4525', '42', 'Doritos Corn Chip Mexican Jalapeno 150g', '1', '3.9']]


# INSIGHTS

1. **ASCII** encoding is used in the csv file.
2. The Date Column doers not look like a date.
3. Numbers are represented as strings. 
4. Format headers to have the correct spelling and start with uppercase only.
5. Separate quantity from product name 

In [73]:
# explore the dataset
def explore_dataset(dataset,start,end,rows_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print("\n")
    if rows_columns:
        print("There are {} rows".format(len(dataset)))
        print("There are {} columns".format(len(dataset[0])))
    

In [74]:
explore_dataset(rows,0,10,True)

['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR', 'PROD_NAME', 'PROD_QTY', 'TOT_SALES']


['43390', '1', '1000', '1', '5', 'Natural Chip        Compny SeaSalt175g', '2', '6']


['43599', '1', '1307', '348', '66', 'CCs Nacho Cheese    175g', '3', '6.3']


['43605', '1', '1343', '383', '61', 'Smiths Crinkle Cut  Chips Chicken 170g', '2', '2.9']


['43329', '2', '2373', '974', '69', 'Smiths Chip Thinly  S/Cream&Onion 175g', '5', '15']


['43330', '2', '2426', '1038', '108', 'Kettle Tortilla ChpsHny&Jlpno Chili 150g', '3', '13.8']


['43604', '4', '4074', '2982', '57', 'Old El Paso Salsa   Dip Tomato Mild 300g', '1', '5.1']


['43601', '4', '4149', '3333', '16', 'Smiths Crinkle Chips Salt & Vinegar 330g', '1', '5.7']


['43601', '4', '4196', '3539', '24', 'Grain Waves         Sweet Chilli 210g', '1', '3.6']


['43332', '5', '5026', '4525', '42', 'Doritos Corn Chip Mexican Jalapeno 150g', '1', '3.9']


There are 264837 rows
There are 8 columns


In [75]:
# check if there are empty sublists in our list
rows =  [sublist for sublist in rows if sublist]
explore_dataset(rows,0,5,True)

['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR', 'PROD_NAME', 'PROD_QTY', 'TOT_SALES']


['43390', '1', '1000', '1', '5', 'Natural Chip        Compny SeaSalt175g', '2', '6']


['43599', '1', '1307', '348', '66', 'CCs Nacho Cheese    175g', '3', '6.3']


['43605', '1', '1343', '383', '61', 'Smiths Crinkle Cut  Chips Chicken 170g', '2', '2.9']


['43329', '2', '2373', '974', '69', 'Smiths Chip Thinly  S/Cream&Onion 175g', '5', '15']


There are 264837 rows
There are 8 columns


In [76]:
# check for duplicates
duplicate_entries = []
unique_entries = set()
rows_with_duplicates = []
for row in rows:
    transaction_id = row[3]
    if transaction_id not in unique_entries:
        unique_entries.add(transaction_id)
    else:
        duplicate_entries.append(transaction_id)
        rows_with_duplicates.append(row)
print("There are {} duplicate entries".format(len(duplicate_entries)))
print("There are {} unique entries".format(len(unique_entries)))
print(rows_with_duplicates[:10])

There are 1709 duplicate entries
There are 263128 unique entries
[['43605', '55', '55073', '48887', '113', 'Twisties Chicken270g', '1', '4.6'], ['43475', '7', '7364', '7739', '20', 'Doritos Cheese      Supreme 330g', '2', '11.4'], ['43391', '12', '12301', '10982', '93', 'Doritos Corn Chip Southern Chicken 150g', '2', '7.8'], ['43351', '16', '16427', '14546', '81', 'Pringles Original   Crisps 134g', '1', '3.7'], ['43315', '19', '19272', '16683', '31', 'Infzns Crn Crnchers Tangy Gcamole 110g', '2', '7.6'], ['43401', '47', '47204', '42616', '45', 'Smiths Thinly Cut   Roast Chicken 175g', '2', '6'], ['43609', '48', '48179', '44177', '56', 'Cheezels Cheese Box 125g', '2', '4.2'], ['43559', '55', '55036', '48663', '91', 'CCs Tasty Cheese    175g', '2', '4.2'], ['43282', '55', '55073', '48884', '91', 'CCs Tasty Cheese    175g', '2', '4.2'], ['43489', '58', '58121', '53351', '42', 'Doritos Corn Chip Mexican Jalapeno 150g', '2', '7.8']]


Out of these duplicate entries, they have similar transaction ids, loyalty card numbers but different products. so we will leave them as they are.

In [77]:
# get the index of every column
col_index = {}
for i in range(len(header)):
    col_index[header[i]] = i
print(col_index)

{'DATE': 0, 'STORE_NBR': 1, 'LYLTY_CARD_NBR': 2, 'TXN_ID': 3, 'PROD_NBR': 4, 'PROD_NAME': 5, 'PROD_QTY': 6, 'TOT_SALES': 7}


In [78]:
def excel_serial_date_to_datetime(serial_date):
    base_date = datetime(1899, 12, 31)
    delta = timedelta(days=float(serial_date))
    resulting_date = base_date + delta
    return resulting_date

def serialdate_to_datetime(rows, idx):
    for i, row in enumerate(rows):
        if i == 0:
            continue  # Skip header if present
        serial_date = row[idx]
        date = excel_serial_date_to_datetime(serial_date)
        row[idx] = date
    return rows

In [79]:
rows = serialdate_to_datetime(rows, date_column_index)
print(rows[:5])

[['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR', 'PROD_NAME', 'PROD_QTY', 'TOT_SALES'], [datetime.datetime(2018, 10, 18, 0, 0), '1', '1000', '1', '5', 'Natural Chip        Compny SeaSalt175g', '2', '6'], [datetime.datetime(2019, 5, 15, 0, 0), '1', '1307', '348', '66', 'CCs Nacho Cheese    175g', '3', '6.3'], [datetime.datetime(2019, 5, 21, 0, 0), '1', '1343', '383', '61', 'Smiths Crinkle Cut  Chips Chicken 170g', '2', '2.9'], [datetime.datetime(2018, 8, 18, 0, 0), '2', '2373', '974', '69', 'Smiths Chip Thinly  S/Cream&Onion 175g', '5', '15']]


In [80]:
print(rows[:5])

[['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR', 'PROD_NAME', 'PROD_QTY', 'TOT_SALES'], [datetime.datetime(2018, 10, 18, 0, 0), '1', '1000', '1', '5', 'Natural Chip        Compny SeaSalt175g', '2', '6'], [datetime.datetime(2019, 5, 15, 0, 0), '1', '1307', '348', '66', 'CCs Nacho Cheese    175g', '3', '6.3'], [datetime.datetime(2019, 5, 21, 0, 0), '1', '1343', '383', '61', 'Smiths Crinkle Cut  Chips Chicken 170g', '2', '2.9'], [datetime.datetime(2018, 8, 18, 0, 0), '2', '2373', '974', '69', 'Smiths Chip Thinly  S/Cream&Onion 175g', '5', '15']]


I will write to the csv file later, to keep permanent changes of the date column

In [82]:
# check if the data types match
def check_datatypes(rows,header):    
    # Initialize a dictionary to store the data types for each column
    column_data_types = {col: None for col in header}

    # Iterate over each row of the CSV file
    for row in rows:
        # Iterate over each column in the row
        for i, col_value in enumerate(row):
            # Check if the data type for the column has been set yet
            if not column_data_types[header[i]]:
                # If not, set the data type to the type of the current value
                column_data_types[header[i]] = type(col_value)
            else:
                # If it has been set, check if the current value has a different data type
                if column_data_types[header[i]] != type(col_value):
                    # If it does, set the data type to a generic "object" type
                    column_data_types[header[i]] = object

    # Print the data types for each column
    for col, data_type in column_data_types.items():
        print(f"{col}: {data_type.__name__}")

In [83]:
check_datatypes(rows,header)

DATE: object
STORE_NBR: str
LYLTY_CARD_NBR: str
TXN_ID: str
PROD_NBR: str
PROD_NAME: str
PROD_QTY: str
TOT_SALES: str
