# GOAL : 
## FIND THE TYPE OF CUSTOMERS WHO PURCHASE CHIPS AND THEIR PURCHASING BEHAVIOUR WITHIN THE REGION

# PART ONE

## Examine Transaction Data
1. Look for inconsistencies, missing data, and outliers across the dataset, and check that category items and numeric data are correctly identified across all tables.
2. In case of identified anomalies, make necessary changes to the dataset and save it.

In [106]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chardet
from datetime import datetime, timedelta

In [107]:
# Detect the encoding used in the file from a sizeable sample of raw bytes.
# A handful of bytes only covers the start of the header line, which is too
# little evidence to judge the encoding of the data that follows.
with open("QVI_transaction_data.csv", mode='rb') as file:
    raw_bytes = file.read(100_000)
    detected_encoding = chardet.detect(raw_bytes)['encoding']
    print(detected_encoding)

ascii


In [None]:
# Load the whole csv into a list of rows. Nothing is converted here: ASCII
# is a subset of UTF-8, so decoding as utf-8 reads the file unchanged.
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for files passed to csv.reader.
with open("QVI_transaction_data.csv", newline='', encoding='utf-8') as file:
    rows = list(csv.reader(file))
    header = rows[0]
    print(rows[:10])

# INSIGHTS

1. **ASCII** encoding is used in the csv file.
2. The Date Column does not look like a date.
3. Numbers are represented as strings. 
4. Format headers to have the correct spelling and start with uppercase only.
5. Separate quantity from product name 

In [None]:
# explore the dataset
def explore_dataset(dataset, start, end, rows_columns=False):
    """Print a slice of the dataset and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence of values).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")
    if rows_columns:
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with an IndexError.
        column_count = len(dataset[0]) if len(dataset) > 0 else 0
        print("There are {} rows".format(len(dataset)))
        print("There are {} columns".format(column_count))
    

In [None]:
# preview the first ten rows and report the size of the dataset
explore_dataset(rows, 0, 10, rows_columns=True)

In [None]:
# Check for empty sublists by keeping only the rows that have content.
# Despite its name, `empty_rows` holds the rows that are NOT empty; a row
# count lower than the one reported for `rows` means blank rows were present.
empty_rows = list(filter(None, rows))
explore_dataset(empty_rows, 0, 5, rows_columns=True)

In [None]:
# check for duplicate rows (the whole row is used as the key)
duplicate_entries = []
seen = set()

for row in rows:
    # lists are not hashable, so a tuple of the row's values is the set key
    key = tuple(row)
    if key in seen:
        duplicate_entries.append(row)
    else:
        seen.add(key)

# Report the duplicates once, after the scan is complete; printing inside a
# loop would re-print the growing list for every duplicate found.
duplicates = list(duplicate_entries)
if duplicates:
    print(duplicates)
print("There are {} duplicate entries".format(len(duplicates)))

There is 1 duplicate entry, which should be removed.

In [None]:
# map every column name in the header to its position within a row
col_index = {name: position for position, name in enumerate(header)}
print(col_index)

In [None]:
# convert excel date to python datetime object
def excel_serial_date_to_datetime(serial_date):
    """Convert an Excel (1900 date system) serial date to a ``datetime``.

    Excel counts 1900-01-01 as day 1 but also counts a non-existent
    1900-02-29 as day 60. Serials from 61 onward are therefore one larger
    than the real number of days elapsed, which is compensated for by
    using 1899-12-30 as the base date (e.g. 43282 -> 2018-07-01).
    Serial 60 has no real calendar date and maps to 1900-02-28.

    Args:
        serial_date: The serial date as a number or a numeric string; a
            fractional part is kept as the time of day.

    Returns:
        The corresponding naive ``datetime``.

    Raises:
        ValueError: If ``serial_date`` is a string that is not numeric.
        TypeError: If ``serial_date`` cannot be converted to ``float``.
    """
    serial = float(serial_date)
    # Serials below 60 come before the phantom leap day and need no correction.
    if serial < 60:
        base_date = datetime(1899, 12, 31)
    else:
        base_date = datetime(1899, 12, 30)
    return base_date + timedelta(days=serial)

def serialdate_to_datetime(rows, idx):
    """Replace the serial date in column ``idx`` of each data row, in place.

    The first row is always left untouched (it is treated as the header).
    Each remaining row has its ``idx`` entry overwritten with the value
    returned by ``excel_serial_date_to_datetime``. The same ``rows`` object
    is returned.
    """
    data_rows = iter(rows)
    next(data_rows, None)  # step over the first (header) row
    for record in data_rows:
        record[idx] = excel_serial_date_to_datetime(record[idx])
    return rows

In [None]:
# column 0 is treated as the date column; convert it and preview the result
rows = serialdate_to_datetime(rows, idx=0)
print(rows[0:5])

I will write the rows to a new CSV file later so that the changes to the date column are kept permanently.

In [None]:
# check if the data types match
def check_datatypes(rows, header):
    """Print the Python type observed in each column of ``rows``.

    A column whose values all share one type is reported under that type's
    name; a column that mixes types is reported as ``object``. Values are
    matched to column names by their position in ``header``.
    """
    # No type has been observed for any column yet
    observed = dict.fromkeys(header)

    for record in rows:
        for position, value in enumerate(record):
            column = header[position]
            value_type = type(value)
            known = observed[column]
            if not known:
                # First value seen for this column: remember its type
                observed[column] = value_type
            elif known != value_type:
                # Conflicting types: fall back to the generic "object"
                observed[column] = object

    # Report one line per column
    for column, kind in observed.items():
        print(f"{column}: {kind.__name__}")

In [None]:
# Skip the header row: its string labels would otherwise be counted as data
# and make any converted column (such as the dates) report as "object".
check_datatypes(rows[1:], header)

Columns that should be integers are represented as strings:
1. Store Number
2. Loyalty Card Number
3. Prod Number 

The Date Column will also be converted to a datetime object

In [None]:
# # change Product Name column to just the product name and create another column for Quantity in grams.
# PROD_NAME = []
# PROD_QUANTITY = []

# for row in rows:
#     products = row[5]
#     for product in products:
# #     split the string into words
#         split_product = product.rsplit(' ',1)
#         PROD_NAME.append(split_product[0])
#         PROD_QUANTITY.append(split_product[1])
# for i in range(len(PROD_NAME)):
#     print(f"Product:{PROD_NAME[i]}, Quantity:{PROD_QUANTITY[i]}")

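The intent is to split the product name and the pack size into two separate lists that will be added as columns when writing the CSV file; the cell above is currently commented out, so this split has not been applied yet.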

In [None]:
# save every row (header included) to a new csv file
with open('transaction1.csv', mode='w', newline='', encoding='utf-8') as out_file:
    csv.writer(out_file).writerows(rows)

In [None]:
# Reload the saved file. csv.reader returns every value as a string, so any
# values that were written from non-string objects come back as text.
# newline='' hands line-ending handling to the csv module, matching how the
# file was written.
with open("transaction1.csv", newline='', encoding='utf-8') as file:
    rows = list(csv.reader(file))
    header = rows[0]
    print(rows[:10])