# **Dataset Cleaning - Accounts**

This notebook is designed to perform **data cleaning** on the **accounts** table that is part of the CRM dataset for the analysis project.

In [10]:
# Import library
import pandas as pd
import re

In [None]:
# Read the raw accounts table
accounts_csv = '/Users/Gio Noga/Documents/Data Analysis 101/repos/gn-data-crm_pricing_analysis/raw_dataset/accounts.csv'
accounts = pd.read_csv(accounts_csv)

# Read the issues dataset that is filtered and updated further down
issues_csv = '/Users/Gio Noga/Documents/Data Analysis 101/repos/gn-data-crm_pricing_analysis/dataset_validation/data_quality_check/raw_data_issues.csv'
issues = pd.read_csv(issues_csv)

In [12]:
# Work on an independent copy so the loaded `accounts` frame is not modified
accounts_cleaned = accounts.copy(deep=True)
accounts_cleaned.head(n=3)

Unnamed: 0,account,sector,year_established,revenue,employees,office_location,subsidiary_of
0,Acme Corporation,technolgy,1996,1100.04,2822,United States,
1,Betasoloin,medical,1999,251.41,495,United States,
2,Betatech,medical,1986,647.18,1185,Kenya,


### **To-Do list based on data validation findings**

In [13]:
# Show text columns in full, then list the issues recorded for the accounts dataset
pd.set_option('display.max_colwidth', None)
issues.loc[issues['dataset'].eq('accounts')]

Unnamed: 0,dataset,column,issue,details,action,status
0,accounts,account,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the account names to have the first letter capitalized.,Resolved
1,accounts,sector,Spelling inconsistency,Found 'technolgy' instead of 'technology',standardize spelling,Resolved
2,accounts,sector,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the sector names to have the first letter capitalized.,Resolved
3,accounts,office_location,Spelling inconsistency,Found 'Philipines' instead of 'Philippines',standardize spelling,Resolved


### **Fix Inconsistent Formatting of Values in the account and sector Columns**

In [14]:
# Upper-case only the first character of each sector value; the rest of the string is kept as-is
sector_values = accounts_cleaned['sector']
accounts_cleaned['sector'] = (
    sector_values.str.slice(0, 1).str.upper()
    + sector_values.str.slice(1)
)

In [15]:
# Validate: frequency of every sector value, missing values included
sector_counts = accounts_cleaned['sector'].value_counts(dropna=False)
sector_counts

sector
Retail                17
Technolgy             12
Medical               12
Marketing              8
Finance                8
Software               7
Entertainment          6
Telecommunications     6
Services               5
Employment             4
Name: count, dtype: int64

In [16]:
# Upper-case only the first character of each account name (e.g. 'dambase'); the rest is kept as-is
account_names = accounts_cleaned['account']
accounts_cleaned['account'] = (
    account_names.str.slice(0, 1).str.upper()
    + account_names.str.slice(1)
)

In [25]:
# Account validation function
def validate_account(account) -> str:
    """Classify a single account name against the formatting rules.

    Parameters
    ----------
    account : object
        A value from the ``account`` column. Non-null values are converted
        with ``str()`` and stripped of surrounding whitespace before checking.

    Returns
    -------
    str
        The first rule that fails, or ``"valid"``:

        * ``"null_value"`` - the value is missing (``pd.isna``).
        * ``"no_words"`` - nothing is left after stripping whitespace.
        * ``"invalid_characters"`` - contains a character other than ASCII
          letters, digits, whitespace, ``&``, ``-``, ``.`` or ``'``.
        * ``"first_word_not_capitalized"`` - the name starts with a
          lowercase letter.
    """
    if pd.isna(account):
        return "null_value"

    name = str(account).strip()

    # Rule 1: At least one word
    tokens = name.split()
    if not tokens:
        return "no_words"

    # Rule 2: Allowed characters only
    if not re.fullmatch(r"[A-Za-z0-9\s&\-\.\']+", name):
        return "invalid_characters"

    # Rule 3: The name must not start with a lowercase letter.
    # Checking islower() instead of `not isupper()` avoids false positives
    # for names that start with a digit or allowed punctuation (e.g. "3M"),
    # which Rule 2 accepts and which have no letter case at that position.
    if tokens[0][0].islower():
        return "first_word_not_capitalized"

    return "valid"

# Run validation (NO column added)
validation_results = accounts_cleaned['account'].apply(validate_account)

# Summary of validation
print("Validation Results:")
print(validation_results.value_counts())

# Extract issues only.
# dropna=False is required: groupby drops NaN keys by default, so rows whose
# account is NaN (classified "null_value") would disappear from this summary
# and the cell could print "Passed" even though null accounts exist.
account_validation_issues = (
    accounts_cleaned
    .assign(account_validation=validation_results)
    .loc[validation_results != 'valid']
    .groupby(['account', 'account_validation'], dropna=False)
    .size()
    .reset_index(name='affected_rows')
)

# Output
if not account_validation_issues.empty:
    print("\nIssues found:")
    print(account_validation_issues)
else:
    print("\nPassed")



Validation Results:
account
valid    85
Name: count, dtype: int64

Passed


In [18]:
# Flag every 'Inconsistent Formatting' issue logged for the accounts dataset as resolved
is_accounts_formatting_issue = (
    issues['dataset'].eq('accounts')
    & issues['issue'].eq('Inconsistent Formatting')
)
issues.loc[is_accounts_formatting_issue, 'status'] = 'Resolved'

# Show the accounts issues with text columns displayed in full
pd.set_option('display.max_colwidth', None)
issues.loc[issues['dataset'].eq('accounts')]

Unnamed: 0,dataset,column,issue,details,action,status
0,accounts,account,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the account names to have the first letter capitalized.,Resolved
1,accounts,sector,Spelling inconsistency,Found 'technolgy' instead of 'technology',standardize spelling,Resolved
2,accounts,sector,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the sector names to have the first letter capitalized.,Resolved
3,accounts,office_location,Spelling inconsistency,Found 'Philipines' instead of 'Philippines',standardize spelling,Resolved


### **Fix Spelling Issue for Specific Values**

In [19]:
# Replace the misspelled sector label with the correct spelling
is_misspelled_sector = accounts_cleaned['sector'].eq('Technolgy')
accounts_cleaned.loc[is_misspelled_sector, 'sector'] = 'Technology'

In [20]:
# Validate: frequency of every sector value after the spelling fix, missing values included
sector_counts = accounts_cleaned['sector'].value_counts(dropna=False)
sector_counts

sector
Retail                17
Technology            12
Medical               12
Marketing              8
Finance                8
Software               7
Entertainment          6
Telecommunications     6
Services               5
Employment             4
Name: count, dtype: int64

In [21]:
# Replace the misspelled office location with the correct spelling
is_misspelled_location = accounts_cleaned['office_location'].eq('Philipines')
accounts_cleaned.loc[is_misspelled_location, 'office_location'] = 'Philippines'

In [22]:
# Validate: frequency of every office location, missing values included
office_location_counts = accounts_cleaned['office_location'].value_counts(dropna=False)
office_location_counts

office_location
United States    71
Kenya             1
Philippines       1
Japan             1
Italy             1
Norway            1
Korea             1
Jordan            1
Brazil            1
Germany           1
Panama            1
Belgium           1
Romania           1
Poland            1
China             1
Name: count, dtype: int64

In [None]:
# Update issues dataset to mark the spelling issues as resolved.
# The dataset label must be 'accounts' (plural), the same value used by the
# display filter at the end of this cell; filtering on 'account' matches no
# rows, so the status update would silently do nothing.
issues.loc[
    (issues['dataset'] == 'accounts') &
    (issues['issue'] == 'Spelling inconsistency'),
    'status'
] = 'Resolved'

# Persist the updated issues dataset back to the file it was loaded from
issues.to_csv('/Users/Gio Noga/Documents/Data Analysis 101/repos/gn-data-crm_pricing_analysis/dataset_validation/data_quality_check/raw_data_issues.csv', index=False)

# Show the accounts issues with text columns displayed in full
pd.options.display.max_colwidth = None
issues[issues['dataset'] == 'accounts']

Unnamed: 0,dataset,column,issue,details,action,status
0,accounts,account,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the account names to have the first letter capitalized.,Resolved
1,accounts,sector,Spelling inconsistency,Found 'technolgy' instead of 'technology',standardize spelling,Resolved
2,accounts,sector,Inconsistent Formatting,The first letter of the value is not capitalized.,Standardize the sector names to have the first letter capitalized.,Resolved
3,accounts,office_location,Spelling inconsistency,Found 'Philipines' instead of 'Philippines',standardize spelling,Resolved


In [24]:
# Write the cleaned accounts frame to a new CSV file, leaving out the index
cleaned_accounts_csv = 'C:\\Users\\Gio Noga\\Documents\\Data Analysis 101\\repos\\gn-data-crm_pricing_analysis\\clean_dataset/01_accounts_cleaned.csv'
accounts_cleaned.to_csv(cleaned_accounts_csv, index=False)