# Cleaning Data from Customer List 

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Peek at the start of the raw file to see the header, the delimiter and any
# encoding artifacts. readlines(150) stops reading further lines once roughly
# 150 characters have been collected.
with open("customer_list_updated.csv", "r") as raw_file:
    preview_lines = raw_file.readlines(150)
print(preview_lines)

Used the open() function to read the file and see what I'm working with. Immediately noticed values were separated with pipes "**|**" instead of commas "**,**" .
This is something I will have to specify as the delimiter when I import it as dataframe.

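Also, noticed a BOM (Byte-Order-Mark) **ï»¿** at the beginning of the file, which isn't shown when I open the file in Notepad or VS Code. It marks the file as UTF-8 and shows up as these three characters when the file is read with a non-UTF-8 encoding. This will cause an issue when importing if I don't account for the BOM or read the file as UTF-8. 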

For the sake of this project, since I need to clean up the data anyway, I will remove the BOM and replace the "|" with "," so it's simpler to import into a DataFrame without needing to specify the delimiter or the encoding

In [None]:
# Read the raw export. The "utf-8-sig" codec decodes the file as UTF-8 and
# drops a leading BOM when one is present, so nothing has to be sliced off by
# hand. (Slicing a fixed number of characters only works when the BOM happens
# to decode to exactly that many characters under the platform's default
# encoding; otherwise it eats the start of the header row.)
with open("customer_list_updated.csv", "r", encoding="utf-8-sig") as f:
    content = f.read()

# Replace "|" with ","
# NOTE(review): assumes no field contains a literal comma - verify against the data
content = content.replace("|", ",")

# Write the new content to a new file, explicitly as UTF-8 (no BOM is written)
with open("customer_list_ready.csv", "w", encoding="utf-8") as f:
    f.write(content)

# Print to verify that BOM is removed and values are separated by commas
print(content)

In [None]:
# Load the comma-delimited copy into a DataFrame.
# index_col='cust_id' makes the customer id the row index, so the label-based
# df.loc[...] lookups used later address rows by cust_id.
df = pd.read_csv("customer_list_ready.csv", index_col='cust_id')

In [None]:
# Display the first 10 rows of df for a quick visual check of every column
df.head(10)

***
Noticed right away in the first 10 rows that the names have special characters:
**&!** for first name initials and **.^** for last name initials

I will write some code to list every affected name along with its index, to see which characters I will need to remove
***

In [None]:
# List every entry in the 'name' column that contains anything other than
# letters and spaces, together with its index label.
for index, name in df['name'].items():
    # A name is "plain" only when every character is a letter or a space
    only_letters_and_spaces = all(ch.isalpha() or ch == ' ' for ch in name)

    # Plain names need no attention
    if only_letters_and_spaces:
        continue

    print(f"Index {index}: {name}")

***
By running this loop, I noticed there were many names that contained special characters. 

Most of them had the **&!** characters in the first name for initials and **.^** for last name initials. 

I also noticed for row 301, Miles **O[']Brien** had brackets around the apostrophe for his last name so I will specify to remove the brackets.

There were also some customers that had a correct special character: hyphens **-** , so I will make sure not to remove those
***

In [None]:
# Remove the specific unwanted character sequences found in the 'name' column.
# regex=False treats every pattern as a literal string. This matters because,
# read as regular expressions, "^" is a start-of-string anchor and "[']" is a
# character class matching a lone apostrophe, and the default value of `regex`
# has differed between pandas versions.
df['name'] = (
    df['name']
    .str.replace('&!', '.', regex=False)    # "&!" follows an initial -> replace with a period
    .str.replace('^', '', regex=False)      # drop the stray caret
    .str.replace("[']", "'", regex=False)   # remove the square brackets, keep the apostrophe
)

In [None]:
# Re-display the first 10 rows to confirm the name replacements took effect
df.head(10) 

In [None]:
# Use .info() to see the number of rows, each column's dtype and its
# non-null count (a count below the row total means missing values)
df.info() 

***
From the info summary, I can deduce that there are 521 entries (rows).
The "phone" and "sms-opt-out" columns are only showing 520 non-null entries so they probably have one NaN value each
***

In [None]:
# Count the null values in each column (isnull() gives a boolean frame,
# sum() adds up the True values per column)
df.isnull().sum()

In [None]:
# Select the row(s) whose 'phone' value is NaN, using a boolean mask with .loc
df.loc[df['phone'].isna()] 

***
Found out that row 301 has both NaN values I was looking for. I will go ahead and add values into the phone and sms columns for row 301. I would rather do this than drop the row since it still has valuable data.

***

In [None]:
# Fill the missing phone for cust_id 301 with an all-zero placeholder that
# follows the same ###-###-#### layout as a regular number
df.loc[301, 'phone'] = '000-000-0000'  

In [None]:
# Mark cust_id 301 as opted out of SMS, since its phone number is only a placeholder.
# NOTE: assigning through .loc with a column label that does not exist adds a
# new column, so this only updates the existing column when the label matches
# the header exactly (including any stray whitespace).
df.loc[301, 'sms-opt-out'] = 'Y'  # Putting Y for opt-out since it's an invalid #

In [None]:
# Show row 301 to review the edits; the list selector [[301]] returns a
# one-row DataFrame, so all columns are displayed side by side
df.loc[[301]]

***
Upon reviewing row 301 after my changes, I noticed that I created another column when I wanted to input Y for sms-opt-out. 

Although they're spelled the same, I think it created an additional column because the original column name must have a space before or after it
***

In [None]:
# Check the original column's label: columns[5] is the label at position 5
# (zero-based, not counting the index), shown quoted so whitespace is visible
df.columns[5]

***
Confirmed that the original sms-opt-out column name had a trailing space, so I will delete the column I created and strip the space from the original column name
***

In [None]:
# Remove, in place, the column that was added by accident
df.drop(columns=['sms-opt-out'], inplace=True)

In [None]:
# Strip leading/trailing whitespace from every column label, not just the
# one known offender, so later lookups by name match exactly
df.columns = df.columns.str.strip()

In [None]:
# Confirm the label at position 5 no longer carries any whitespace
df.columns[5]

In [None]:
# Re-run the assignment now that the column labels have been stripped
df.loc[301, 'sms-opt-out'] = 'Y'  # Y = opted out, since the phone number is only a placeholder

In [None]:
# Show row 301 again (as a one-row DataFrame) to verify the value was written
df.loc[[301]]

***
It worked! I will now go column by column, running a few checks to see if there are any discrepancies I need to correct
***

In [None]:
# Starting with the date column: report every value that does not follow the
# YYYY-MM-DD layout (10 characters, '-' at positions 4 and 7, digits elsewhere).

# Start by stripping any spaces
df['date'] = df['date'].str.strip()

# Loop through each row in the 'date' column and check if the date format is correct
for index, date in df['date'].items():
    # Both checks feed the same flag, so a failure in either one is reported.
    # The isinstance test comes first so a missing (non-string) value is
    # reported instead of raising, and the length test short-circuits before
    # any character is indexed.
    valid_date = (
        isinstance(date, str)
        and len(date) == 10
        and date[4] == '-'
        and date[7] == '-'
        # Year, month and day must consist of digits only
        and date[:4].isdigit()
        and date[5:7].isdigit()
        and date[8:].isdigit()
    )

    # If any came out invalid, it will print below
    if not valid_date:
        print(f"Index {index}: {date}")

In [None]:
# Nothing printed above means no value in the date column failed the format check

df.head()

In [None]:
# Same kind of check for the time column: report every value that does not
# follow the HH:MM:SS layout.

# Trim surrounding whitespace first so padding alone is not reported
df['time'] = df['time'].str.strip()

for index, time_value in df['time'].items():
    # Layout: exactly 8 characters with ':' at positions 2 and 5
    layout_ok = len(time_value) == 8 and time_value[2] == ':' and time_value[5] == ':'

    # Content: each of the three parts consists of digits only
    digits_ok = (
        time_value[:2].isdigit()
        and time_value[3:5].isdigit()
        and time_value[6:].isdigit()
    )

    # Rows that pass both checks need no attention
    if layout_ok and digits_ok:
        continue

    print(f"Index {index}: {time_value}")

In [None]:
# Again, nothing printed above means no value in the time column failed the format check:

df.head()

In [None]:
# Will skip the name column since it's already been cleaned and move on to the email column:

# Remove leading/trailing spaces and convert to lowercase, writing the result
# back to the DataFrame so the cleaned values are the ones that get kept
# (computing them on a temporary variable only would leave the column as-is).
df['email'] = df['email'].str.strip().str.lower()

# Report every email that is missing the '@' symbol or a '.'
for index, email in df['email'].items():
    # A non-string (missing) value is reported rather than raising on the `in` test
    if not isinstance(email, str) or '@' not in email or '.' not in email:
        print(f"Index {index}: {email}")

In [None]:
# Moving on to the phone column: report every value that does not follow the
# 123-456-7890 layout (12 characters, '-' at positions 3 and 7, digits elsewhere).

# Start by stripping any spaces
df['phone'] = df['phone'].str.strip()

# Loop through each phone number in the 'phone' column
for index, phone in df['phone'].items():
    # The checks are chained with `and` so evaluation stops at the first
    # failure: a value shorter than 8 characters is reported as invalid
    # instead of raising IndexError on phone[7], and a missing (non-string)
    # value is reported instead of raising on len().
    valid_phone = (
        isinstance(phone, str)
        and len(phone) == 12
        # Dashes in the correct places
        and phone[3] == '-'
        and phone[7] == '-'
        # Digits everywhere else
        and phone[:3].isdigit()
        and phone[4:7].isdigit()
        and phone[8:].isdigit()
    )

    # If the phone is invalid, print it
    if not valid_phone:
        print(f"Index {index}: {phone}")

***
Finally, found an error in the phone column from rows 286-295! 

All phone #'s have a 1 before the number (specifying the country code).
I will remove it to keep things uniform and clean
***

In [None]:
# Strip the leading country code from the phone numbers that carry one.
# A value qualifies only when it is 13 characters long AND actually starts
# with "1"; any other 13-character value is left untouched so it is not
# silently truncated into a different number.
has_country_code = (
    (df['phone'].str.len() == 13)
    & df['phone'].str.startswith('1', na=False)   # na=False: missing values never qualify
)

# Drop the first character of the qualifying rows only
df.loc[has_country_code, 'phone'] = df.loc[has_country_code, 'phone'].str[1:]

# printing rows from 286 to 295 to confirm
df.loc[286:295]

***
Great! Phones have been updated and cleaned. Last, but not least, I will clean the last column
***

In [None]:
# Normalise the 'sms-opt-out' column in one pass:
#   1. strip surrounding spaces
#   2. treat missing values as 'Y' (opted out by default)
#   3. upper-case everything for uniformity
df['sms-opt-out'] = df['sms-opt-out'].str.strip().fillna('Y').str.upper()

# Report anything that is still not one of the two accepted flags
accepted_flags = ('Y', 'N')
for index, flag in df['sms-opt-out'].items():
    if flag in accepted_flags:
        continue
    print(f"Index {index}: {flag}")

All columns have been cleaned. I will now check for any duplicates, just in case

In [None]:
# Look for rows that repeat an earlier row across all columns;
# duplicated() flags every occurrence after the first
repeated_rows_mask = df.duplicated()
repeated_rows = df[repeated_rows_mask]
print(repeated_rows)

***
Great! All columns have been cleaned!

As a summary, I initially removed the BOM, and changed the delimiter from pipes **|** to commas **,**

Then, I removed all the special characters in the names since that was what caught my eye first.

I proceeded to check the info() summary to see how many rows I was working with and see if there were any nulls 

Removed the nulls in the phone and sms-opt-out column and replaced them with values, while also stripping spaces in the columns to not create new columns by accident

I then checked column by column for formatting issues and found one last error in the phone column: a set of numbers with 13 characters instead of 12. I trimmed those numbers down to 12 characters.

Finally, the data has been cleaned and I will now export the cleaned dataframe into a file called customer_list_cleaned.csv, keeping the index true since I started with the 'cust_id' column as my index
***

In [None]:
# Export the cleaned DataFrame; index=True writes the index (cust_id) as the
# first column so the customer ids are kept in the output file
df.to_csv('customer_list_cleaned.csv', index=True)