<font size="6.5"><b> TDD workflow </b></font>

# Testing & Cleaning functions

## Importing libraries for testing

In [20]:
import pandas as pd
from dateutil.parser import parse  # <-- For dates
import re                          # <-- For phone numbers

import pprint                      # <-- PrettyPrint: For printing summary dictionary
from collections import OrderedDict # For reordering the PrettyPrint dictionary

In [21]:
# If the notebook is opened from the "notebooks" folder, we need to append the main directory to the "python path" so it sees all subfolders.
import sys
sys.path.append('../')

In [22]:
# Importing my cleaning functions
#from scripts.cleaning_functions import clean_dates, clean_phone, clean_geo, clean_email, clean_payment, extract_currency, clean_boolean, clean_name

# Importing my testing functions
from testing.test_functions import create_dirty_and_expected_data, test_cleaning, test_cleaning_report, test_cleaning_report_2, test_cleaning_report_3

In [23]:
# Creating a blank dataframe to store the messy and cleaned test-data
df = pd.DataFrame(index=range(25))
df_report = pd.DataFrame(index=[0])

## Preparation for testing

In [25]:
# Creating a blank dataframe to store the messy and cleaned test-data
df = pd.DataFrame(index=range(20))

In [26]:
# The function to create test-cases

def create_dirty_and_expected_data(df, dirty_column, expected_column, dirty_data_list, expected_data_list):
    #Input arguments: 
        # 1. a DataFrame which the test data will be appended to, 
        # 2. name for the 'Dirty' column (str), 
        # 3. name for the 'Expected' column (str), 
        # 4. dirty data (list), 
        # 5. expected data (list)
    #important assumption: df initially needs to be longer than the cases we define later
    df[dirty_column] = dirty_data_list + [None] * (len(df) - len(dirty_data_list))
    df[expected_column] = expected_data_list + [None] * (len(df) - len(expected_data_list))

In [27]:
# The function for applying any cleaning function, then comparing the result with the expected values

def test_cleaning(clean_function, df, dirty_column, expected_column):
    # Apply the cleaning function to the dirty column
    cleaned_data = df[dirty_column].apply(clean_function)
    
    # Compare the cleaned data with the expected data
    comparison_result = cleaned_data == df[expected_column]
    
    # Print the comparison result
    print("Comparison Result:")
    for idx, (cleaned_value, expected_value) in enumerate(zip(cleaned_data, df[expected_column])):
        if cleaned_value != expected_value:
            print(f"(Row {idx}) Cleaned Value: {cleaned_value} {type(cleaned_value)}, Expected Value: {expected_value} {type(expected_value)}, ")
    
    return comparison_result.all()

## Dealing with: Dates

In [6]:
from scripts.clean_date import clean_date

In [7]:
# Creating uncleaned data and the expected data, to test our cleaning function

create_dirty_and_expected_data(
    df,
    'dirty_dates',
    'expected_dates',
    [
        'foo',
        '2021-01-15',
        '2021/02/20',
        '2021.03.25',
        '2021 04 30',
        '31-01-2022',
        '02/29/2023',  # Invalid date (leap year)
        '2023.04.31',  # Invalid date (April 31st)
        '2023-13-25',  # Invalid month (13)
        '2023.11.32',  # Invalid day (November 32nd)
        '2023/15/01',  # Incorrect order of elements
        '1st Jan 2024', # Textual date
        'Jan 15, 2024', # Textual date
        '2024 February 25', # Textual date
        '25th of March, 2024', # Textual date
        'March 32nd, 2024', # Invalid day (March 32nd)
        'April 5th, 2024', # Textual date
        '20-30-2025',  # Ambiguous format
        #'15/02/26',  # Ambiguous format
        #'2026.25.03',  # Ambiguous format
        #'2027/07/08',  # Ambiguous format
    ],
    [
        'This_is_an_intentional_false_negative',
        '2021-01-15',
        '2021-02-20',
        '2021-03-25',
        '2021-04-30',
        '2022-01-31',
        None,  # Expected None for invalid date (02/29/2023)
        None,  # Expected None for invalid date (2023.04.31)
        None,  # Expected None for invalid date (2023-13-25)
        None,  # Expected None for invalid date (2023.11.32)
        '2023-01-15',  # Expected cleaned date for incorrect order of elements
        '2024-01-01',  # Expected cleaned date for textual date (1st Jan 2024)
        '2024-01-15',  # Expected cleaned date for textual date (Jan 15, 2024)
        '2024-02-25',  # Expected cleaned date for textual date (2024 February 25)
        '2024-03-25',  # Expected cleaned date for textual date (25th of March, 2024)
        None,  # Expected None for invalid date (March 32nd, 2024)
        '2024-04-05',  # Expected cleaned date for textual date (April 5th, 2024)
        None,  # Ambiguous format (20-30-2025)
        #None,  # Ambiguous format (15/02/26)
        #None,  # Ambiguous format (2026.25.03)
        #None,  # Ambiguous format (2027/07/08)
    ]
)

In [8]:
# Run the testing function
result = test_cleaning_report(clean_date, df, df_report, 'dirty_dates', 'expected_dates')

# Printing the cleaning result
#print(pprint.pformat(OrderedDict(cleaning_summary_date), indent=4))

Comparison Result:
(Row 0) Cleaned Value: None <class 'NoneType'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 
(Row 10) Cleaned Value: None <class 'NoneType'>, Expected Value: 2023-01-15 <class 'str'>, 


In [11]:
# Running the actual cleaning
df['cleaned_dates'] = df['dirty_dates'].apply(clean_date, df_report=df_report, dirty_column_name='dirty_dates')
print(df_report)

   dirty_dates_nulls_encountered  dirty_dates_parsing_success  \
0                              2                           10   

   dirty_dates_parsing_failed  
0                           8  


### dates: delete

In [26]:
# Define a global dictionary to track cleaning actions
#IMPORTANT: Needs to be resetted to 0, before .apply()-ing each cleaning_function in order to show valid counters
cleaning_summary_date = {
    "null_values_encountered": 0,
    "date_parsing_success": 0,
    "date_parsing_failed": 0,
}

In [18]:
# the cleaning function

def clean_dates(x):
    global cleaning_summary_date
    
    # Skip Null values
    if pd.isna(x):
        cleaning_summary_date["null_values_encountered"] += 1
        return None
    
    try:
        # Attempt to parse the date using dateutil.parser.parse
        date_obj = parse(x, fuzzy=True)
        cleaning_summary_date["date_parsing_success"] += 1
        return date_obj.strftime('%Y-%m-%d')  # Convert date to YYYY-MM-DD format
    except Exception as e:
        cleaning_summary_date["date_parsing_failed"] += 1
        return None  # Return None if parsing fails

In [36]:
# Run the testing function
result = test_cleaning(clean_dates, df, 'dirty_dates', 'expected_dates')

# Printing the cleaning result
print(pprint.pformat(OrderedDict(cleaning_summary_date), indent=4))

NameError: name 'cleaning_summary_date' is not defined

In [45]:
# Running the actual cleaning
df['cleaned_dates'] = df['dirty_dates'].apply(clean_dates)
print(df)

NameError: name 'cleaning_summary_date' is not defined

## Dealing with: Phone Numbers

In [16]:
from scripts.clean_phone import clean_phone

In [51]:
# Creating uncleaned data and the expected data, to test our cleaning function

create_dirty_and_expected_data(
    df,
    'dirty_phone',
    'expected_phone',
    [
        'foo',
        '+1 (123) 456-7890',
        '123-456-7890',
        '(111) 222 3333',
        '+44 1234 567890',
        '001-345-678-9012',
        '555-5555',
        '1234567890',
        '+1 234 567 8901 ext. 1234',
        'invalid_phone_number',
        '123-456-7890 x123',
        '234-567-8901 ext. 1234',
        '(+1) 1234567890',
        '+1 (1234) 567-890',
        '1234-567-890',
        '(123) 456-7890',
        '123.456.7890',
        '+123 456 7890',
        '123456789',  # Too short
        '+12345678901',  # Too long
        '+1 (123) 456-789A',  # Invalid character
    ],
    [
        'This_is_an_intentional_false_negative',
        '+1 (123) 456-7890',
        '123-456-7890',
        '(111) 222 3333',
        '+44 1234 567890',
        '001-345-678-9012',
        '555-5555',
        '1234567890',
        '+1 234 567 8901 ext. 1234',
        None,  # For 'invalid_phone_number'
        '123-456-7890 x123',
        '234-567-8901 ext. 1234',
        None,  # For '(+1) 1234567890'
        '+1 (1234) 567-890',
        '1234-567-890',
        '(123) 456-7890',
        '123.456.7890',
        '+123 456 7890',
        None,  # For '123456789'
        None,  # For '+12345678901'
        None   # For '+1 (123) 456-789A'
    ]
)

In [52]:
# Run the testing function
result = test_cleaning_report(clean_phone, df, df_report, 'dirty_phone', 'expected_phone')

Comparison Result:
(Row 0) Cleaned Value: None <class 'NoneType'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 
(Row 1) Cleaned Value: 11234567890 <class 'str'>, Expected Value: +1 (123) 456-7890 <class 'str'>, 
(Row 2) Cleaned Value: +11234567890 <class 'str'>, Expected Value: 123-456-7890 <class 'str'>, 
(Row 3) Cleaned Value: +11112223333 <class 'str'>, Expected Value: (111) 222 3333 <class 'str'>, 
(Row 4) Cleaned Value: 441234567890 <class 'str'>, Expected Value: +44 1234 567890 <class 'str'>, 
(Row 5) Cleaned Value: 0013456789012 <class 'str'>, Expected Value: 001-345-678-9012 <class 'str'>, 
(Row 6) Cleaned Value: None <class 'NoneType'>, Expected Value: 555-5555 <class 'str'>, 
(Row 7) Cleaned Value: +11234567890 <class 'str'>, Expected Value: 1234567890 <class 'str'>, 
(Row 8) Cleaned Value: 12345678901 <class 'str'>, Expected Value: +1 234 567 8901 ext. 1234 <class 'str'>, 
(Row 10) Cleaned Value: 1234567890123 <class 'str'>, Expected Value: 123-456-7

In [35]:
# Running the actual cleaning
df['cleaned_phone'] = df['dirty_phone'].apply(clean_phone, df_report=df_report, dirty_column_name='dirty_phone')
print(df_report)

   dirty_dates_nulls_encountered  dirty_dates_parsing_success  \
0                              3                           10   

   dirty_dates_parsing_failed  dirty_phonenull_values_encountered  \
0                           7                                   0   

   dirty_phonevalid_phone_numbers  dirty_phoneinvalid_phone_numbers  \
0                              17                                 3   

   dirty_geo_null_values_encountered  dirty_geo_leading_trailing_whitespace  \
0                                 15                                      1   

   dirty_geo_converted_to_float  dirty_geo_ValueError --> None  ...  \
0                             3                              2  ...   

   dirty_email_leading_trailing_whitespace  dirty_email_spaces_in_domain_name  \
0                                        1                                  3   

   dirty_email_double_periods_in_domain_name  dirty_email_double_at_symbols  \
0                                          

### Phones: Delete

In [47]:
# Define a global dictionary to track cleaning actions
#IMPORTANT: Needs to be resetted to 0, before .apply()-ing each cleaning_function in order to show valid counters
cleaning_summary_phone = {
    "null_values_encountered": 0,
    "date_parsing_success": 0,
    "date_parsing_failed": 0,
}

In [50]:
# The cleaning function

def clean_phone(phone_number):
    global cleaning_summary_phone
    
    # Skip Null values
    if pd.isna(date_str):
        cleaning_summary_phone["null_values_encountered"] += 1
        return None
    
    # Regular expression pattern to match valid phone numbers
    pattern = r'\+?[0-9]+(?:\s*[\-()x.]?\s*[0-9]+)*'
    
    # Find all phone number matches in the input string
    matches = re.findall(pattern, phone_number)
    
    # If no matches found, return None
    if not matches:
        return None
    
    # Select the first match as the cleaned phone number
    cleaned_phone_number = matches[0]
    
    # Remove non-numeric characters
    cleaned_phone_number = re.sub(r'\D', '', cleaned_phone_number)
    
    # Check if the cleaned phone number has a valid length
    if len(cleaned_phone_number) < 10 or len(cleaned_phone_number) > 15:
        return None
    
    # Add country code if missing
    if len(cleaned_phone_number) == 10:
        cleaned_phone_number = '+1' + cleaned_phone_number
    
    return cleaned_phone_number

In [51]:
# Running the actual cleaning
df['cleaned_phone'] = df['dirty_phone'].apply(clean_phone)
print(df)

NameError: name 'date_str' is not defined

In [53]:
data = {
    'dirty_phone_numbers': [
        '+1 (123) 456-7890',
        '123-456-7890',
        '(111) 222 3333',
        '+44 1234 567890',
        '001-345-678-9012',
        '555-5555',
        '1234567890',
        '+1 234 567 8901 ext. 1234',
        'invalid_phone_number',
        '123-456-7890 x123',
        '234-567-8901 ext. 1234',
        '(+1) 1234567890',
        '+1 (1234) 567-890',
        '1234-567-890',
        '(123) 456-7890',
        '123.456.7890',
        '+123 456 7890',
        '123456789',  # Too short
        '+12345678901',  # Too long
        '+1 (123) 456-789A',  # Invalid character
    ]
}

df = pd.DataFrame(data)
print(df)

          dirty_phone_numbers
0           +1 (123) 456-7890
1                123-456-7890
2              (111) 222 3333
3             +44 1234 567890
4            001-345-678-9012
5                    555-5555
6                  1234567890
7   +1 234 567 8901 ext. 1234
8        invalid_phone_number
9           123-456-7890 x123
10     234-567-8901 ext. 1234
11            (+1) 1234567890
12          +1 (1234) 567-890
13               1234-567-890
14             (123) 456-7890
15               123.456.7890
16              +123 456 7890
17                  123456789
18               +12345678901
19          +1 (123) 456-789A


In [54]:
# Test the cleaning function
test_phone_number = '+1 (123) 456-7890'
cleaned_phone_number = clean_phone_numbers(test_phone_number)
print(cleaned_phone_number)  # Output: +11234567890

NameError: name 'clean_phone_numbers' is not defined

## Dealing with: Geolocation data

In [19]:
from scripts.clean_geolocation import clean_geo

In [53]:
# Creating uncleaned data and the expected data, to test our cleaning function
create_dirty_and_expected_data(
    df,
    'dirty_geo',
    'expected_geo',
    [
        'foo',
        '89.123456',           # Valid latitude
        '-91.5678',            # Invalid latitude (out of range)
        'xyz',                 # Invalid latitude (non-numeric)
        '45.678.90',           # Invalid latitude (contains multiple dots)
        '  -12.345  ',         # Dirty latitude with leading and trailing spaces
    ],
    [
        'This_is_an_intentional_false_negative',
        89.123456,   # Valid latitude
        None,          # Invalid latitude (out of range)
        None,          # Invalid latitude (non-numeric)
        None,          # Invalid latitude (contains multiple dots)
        -12.345,     # Cleaned latitude (leading and trailing spaces removed)
    ]
)

In [54]:
# Run the testing function
result = test_cleaning_report(clean_geo, df, df_report, 'dirty_geo', 'expected_geo')

Comparison Result:
(Row 0) Cleaned Value: nan <class 'float'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 
(Row 2) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 3) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 4) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 6) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 7) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 8) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 9) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 10) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 11) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row 12) Cleaned Value: nan <class 'float'>, Expected Value: None <class 'NoneType'>, 
(Row

In [None]:
""" NaN vs. None"""

df['cleaned_geo'].replace({pd.np.nan: None}, inplace=True)
print(df)

In [14]:
df['cleaned_geo'] = df['dirty_geo'].apply(clean_geo, df_report=df_report, dirty_column_name='dirty_geo')
print(df_report)

   dirty_dates_nulls_encountered  dirty_dates_parsing_success  \
0                              3                           10   

   dirty_dates_parsing_failed  dirty_phonenull_values_encountered  \
0                           7                                   0   

   dirty_phonevalid_phone_numbers  dirty_phoneinvalid_phone_numbers  \
0                              17                                 3   

   dirty_geo_null_values_encountered  dirty_geo_leading_trailing_whitespace  \
0                                 15                                      1   

   dirty_geo_converted_to_float  dirty_geo_ValueError --> None  \
0                             3                              2   

   dirty_geo_string --> None  dirty_geo_Number out of scope  
0                          0                              1  


### Geo: Delete

In [56]:
# Define a global dictionary to track cleaning actions
#IMPORTANT: Needs to be resetted to 0, before .apply()-ing each cleaning_function in order to show valid counters
cleaning_summary_geo = {
    "null_values_encountered": 0,
    "leading_trailing_whitespace": 0,
    "converted_to_float": 0,
    "ValueError --> None": 0,
    "string --> None": 0,
    "Number out of scope": 0,   
}

In [37]:
def clean_geo(x):
    #global cleaning_summary_geo 
    
    # Skip Null values
    if pd.isna(x):
        cleaning_summary_geo["null_values_encountered"] += 1
        return None

    # Whitespace stripping
    original_x = x
    x = x.strip()
    if x != original_x:
        cleaning_summary_geo["leading_trailing_whitespace"] += 1
    
    # Trying to convert to float, except returning None if it fails
    try:
        original_x = x
        x = float(x)
        if x != original_x:
            cleaning_summary_geo["converted_to_float"] += 1
    except ValueError:
        cleaning_summary_geo["ValueError --> None"] += 1
        return None

    #fails to convert to float, they remain a string, and none
    # Check if it's a string
    if isinstance(x, str):
        cleaning_summary_geo["string --> None"] += 1
        return None
    
    
    if not (-90 <= x <= 90):
        cleaning_summary_geo["Number out of scope"] += 1
        return None
    
    return x

In [58]:
result = test_cleaning(clean_geo, df, 'dirty_geo', 'expected_geo')

# Printing the cleaning result
print(pprint.pformat(OrderedDict(cleaning_summary_geo), indent=4))

Comparison Result:
(Row 1) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 2) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 3) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 5) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 6) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 7) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 8) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 9) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 10) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 11) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 12) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 13) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'flo

In [59]:
# Running the actual cleaning
df['cleaned_geo'] = df['dirty_geo'].apply(clean_geo)
df['cleaned_geo'].replace({pd.np.nan: None}, inplace=True)
print(df)

          dirty_phone_numbers    dirty_geo  expected_geo cleaned_geo
0           +1 (123) 456-7890    89.123456     89.123456   89.123456
1                123-456-7890     -91.5678           NaN        None
2              (111) 222 3333          xyz           NaN        None
3             +44 1234 567890    45.678.90           NaN        None
4            001-345-678-9012    -12.345      -12.345000     -12.345
5                    555-5555         None           NaN        None
6                  1234567890         None           NaN        None
7   +1 234 567 8901 ext. 1234         None           NaN        None
8        invalid_phone_number         None           NaN        None
9           123-456-7890 x123         None           NaN        None
10     234-567-8901 ext. 1234         None           NaN        None
11            (+1) 1234567890         None           NaN        None
12          +1 (1234) 567-890         None           NaN        None
13               1234-567-890     

  df['cleaned_geo'].replace({pd.np.nan: None}, inplace=True)


## Dealing with: email adresses

In [24]:
from scripts.clean_email import clean_email

In [55]:
create_dirty_and_expected_data(
    df,
    'dirty_email',
    'expected_email',
    [
    'foo',
    'john.doe@example.com',  # Valid email address
    'jane.doe@example',      # Missing top-level domain
    'invalid.email@',        # Missing domain name
    'test@example',          # Missing top-level domain and domain name
    'test@.com',             # Missing domain name
    'test@example.',         # Missing top-level domain
    'test@com',              # Missing top-level domain separator
    '@example.com',          # Missing local part
    'test@exam ple.com',     # Space in local part
    'test@example .com',     # Space in domain name
    ' test @example .com ',  # Spaces everywhere
    'test@@example.com',     # Double @ symbol
    'test@example..com',     # Double period in domain name
    'test@-example.com',     # Hyphen at the beginning of domain name
    'test@example-.com',     # Hyphen at the end of domain name
    'test@exa_mple.com',     # Underscore in domain name
    'test@[example].com',    # Square brackets in domain name
    'test@example.c'        # Invalid top-level domain
    ],
    [
    'This_is_an_intentional_false_negative',
    'john.doe@example.com',  # Valid email address
    None,      # Missing top-level domain
    None,        # Missing domain name
    None,          # Missing top-level domain and domain name
    None,             # Missing domain name
    None,         # Missing top-level domain
    None,              # Missing top-level domain separator
    None,          # Missing local part
    'test@example.com',     # Space in local part
    'test@example.com',     # Space in domain name
    'test@example.com',     # Spaces everywhere
    'test@example.com',     # Double @ symbol
    'test@example.com',     # Double period in domain name
    'test@example.com',     # Hyphen at the beginning of domain name
    'test@example.com',     # Hyphen at the end of domain name
    'test@exa_mple.com',     # Underscore in domain name
    'test@[example].com',    # Square brackets in domain name
    None        # Invalid top-level domain
    ]
)

In [56]:
# Run the testing function
result = test_cleaning_report(clean_email, df, df_report, 'dirty_email', 'expected_email')

Comparison Result:
(Row 0) Cleaned Value: None <class 'NoneType'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 
(Row 17) Cleaned Value: None <class 'NoneType'>, Expected Value: test@[example].com <class 'str'>, 


In [33]:
df['cleaned_email'] = df['dirty_email'].apply(clean_email, df_report=df_report, dirty_column_name='dirty_email')
print(df_report)

   dirty_dates_nulls_encountered  dirty_dates_parsing_success  \
0                              3                           10   

   dirty_dates_parsing_failed  dirty_phonenull_values_encountered  \
0                           7                                   0   

   dirty_phonevalid_phone_numbers  dirty_phoneinvalid_phone_numbers  \
0                              17                                 3   

   dirty_geo_null_values_encountered  dirty_geo_leading_trailing_whitespace  \
0                                 15                                      1   

   dirty_geo_converted_to_float  dirty_geo_ValueError --> None  ...  \
0                             3                              2  ...   

   dirty_bool_strings_nonconverted  dirty_bool_nonstrings_nonconverted  \
0                                2                                   2   

   dirty_email_null_values_encountered  \
0                                    2   

   dirty_email_leading_trailing_whitespace  dirty_e

### email: Delete

In [66]:
# Define a global dictionary to track cleaning actions
#IMPORTANT: Needs to be resetted to 0, before .apply()-ing each cleaning_function in order to show valid counters
cleaning_summary_email = {
    "null_values_encountered": 0,
    "leading_trailing_whitespace": 0,
    "spaces_in_domain_name": 0,
    "double_periods_in_domain_name": 0,
    "double_at_symbols": 0,
    "hyphens_in_domain_name": 0,
    "invalid_email_pattern": 0,
    "valid_emails": 0
}

In [62]:
# Cleaning function for emails
def clean_email(x):
    #global cleaning_summary_email
    
    # Skip Null values
    if pd.isna(x):
        cleaning_summary_email["null_values_encountered"] += 1
        return None
    
    # Remove leading and trailing whitespace
    original_x = x
    x = x.strip()
    if x != original_x:
        cleaning_summary_email["leading_trailing_whitespace"] += 1
    
    # Remove spaces in the domain name
    original_x = x
    x = re.sub(r'\s+', '', x)
    if x != original_x:
        cleaning_summary_email["spaces_in_domain_name"] += 1
    
    # Remove double periods in the domain name
    original_x = x
    x = re.sub(r'\.{2,}', '.', x)
    if x != original_x:
        cleaning_summary_email["double_periods_in_domain_name"] += 1
    
    # Remove double @ symbols
    original_x = x
    x = re.sub(r'@{2,}', '@', x)
    if x != original_x:
        cleaning_summary_email["double_at_symbols"] += 1
    
    # Remove hyphens at the beginning or end of the domain name
    original_x = x
    x = re.sub(r'(?<!\.)-|-(?![^.])', '', x)
    if x != original_x:
        cleaning_summary_email["hyphens_in_domain_name"] += 1
    
    # Check if the email matches the basic pattern
    if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}$', x):
        cleaning_summary_email["invalid_email_pattern"] += 1
        return None
    else:
        cleaning_summary_email["valid_emails"] += 1

    return x

In [67]:
# Running the test
result = test_cleaning(clean_email, df, 'dirty_email', 'expected_email')
print(result)

# Printing the cleaning result
print(pprint.pformat(OrderedDict(cleaning_summary_email)))

Comparison Result:
(Row 16) Cleaned Value: None <class 'NoneType'>, Expected Value: test@[example].com <class 'str'>, 
False
OrderedDict([('null_values_encountered', 2),
             ('leading_trailing_whitespace', 1),
             ('spaces_in_domain_name', 3),
             ('double_periods_in_domain_name', 1),
             ('double_at_symbols', 1),
             ('hyphens_in_domain_name', 2),
             ('invalid_email_pattern', 9),
             ('valid_emails', 9)])


In [68]:
# Apply the clean_email function to the dirty_email column
df['cleaned_email'] = df['dirty_email'].apply(clean_email)
print(df)

          dirty_phone_numbers    dirty_geo  expected_geo cleaned_geo  \
0           +1 (123) 456-7890    89.123456     89.123456   89.123456   
1                123-456-7890     -91.5678           NaN        None   
2              (111) 222 3333          xyz           NaN        None   
3             +44 1234 567890    45.678.90           NaN        None   
4            001-345-678-9012    -12.345      -12.345000     -12.345   
5                    555-5555         None           NaN        None   
6                  1234567890         None           NaN        None   
7   +1 234 567 8901 ext. 1234         None           NaN        None   
8        invalid_phone_number         None           NaN        None   
9           123-456-7890 x123         None           NaN        None   
10     234-567-8901 ext. 1234         None           NaN        None   
11            (+1) 1234567890         None           NaN        None   
12          +1 (1234) 567-890         None           NaN        

## Dealing with: Currencies

In [27]:
from scripts.clean_currency import clean_currency

In [43]:
create_dirty_and_expected_data(
    df,
    'dirty_payment',
    'expected_payment',
    [
        'foo',
        '$100.00',
        '€50,00',
        '¥5000',
        '£75.50',
        '1000 INR',
        '120.75 AUD',
        '200 CAD',
        '20.99',
        '25.50 USD',
        '30.00 EUR',
        '40 GBP',
        '45.25 JPY',
        '50.75 CNY'
    ],
    [
        'This_is_an_intentional_false_negative',
        '100.00 USD',
        '50.00 EUR',
        '5000 JPY',
        '75.50 GBP',
        '1000 INR',
        '120.75 AUD',
        '200 CAD',
        '20.99',
        '25.50 USD',
        '30.00 EUR',
        '40 GBP',
        '45.25 JPY',
        '50.75 CNY'
    ]
)

In [44]:
# Run the testing function
result = test_cleaning_report(clean_currency, df, df_report, 'dirty_payment', 'expected_payment')

Comparison Result:
(Row 0) Cleaned Value: foo <class 'str'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 


In [227]:
df['cleaned_payment'] = df['dirty_payment'].apply(clean_payment)
print(df)

   dirty_payment expected_payment cleaned_payment
0        $100.00       100.00 USD      100.00 USD
1         €50,00        50.00 EUR       50.00 EUR
2          ¥5000         5000 JPY        5000 JPY
3         £75.50        75.50 GBP       75.50 GBP
4       1000 INR         1000 INR        1000 INR
5     120.75 AUD       120.75 AUD      120.75 AUD
6        200 CAD          200 CAD         200 CAD
7          20.99            20.99           20.99
8      25.50 USD        25.50 USD       25.50 USD
9      30.00 EUR        30.00 EUR       30.00 EUR
10        40 GBP           40 GBP          40 GBP
11     45.25 JPY        45.25 JPY       45.25 JPY
12     50.75 CNY        50.75 CNY       50.75 CNY
13          None             None            None
14          None             None            None
15          None             None            None
16          None             None            None
17          None             None            None
18          None             None            None


In [228]:
def extract_currency(payment):
    if pd.isna(payment):
        return None
    
    # Define a regex pattern to match the currency code at the end of the payment string
    currency_pattern = r'\b[A-Z]{3}\b'
    
    # Extract the currency code from the payment
    match = re.search(currency_pattern, payment)
    if match:
        currency_code = match.group()
    else:
        currency_code = None
    
    return currency_code

In [229]:
df['currency'] = df['cleaned_payment'].apply(extract_currency)
print(df)

   dirty_payment expected_payment cleaned_payment currency
0        $100.00       100.00 USD      100.00 USD      USD
1         €50,00        50.00 EUR       50.00 EUR      EUR
2          ¥5000         5000 JPY        5000 JPY      JPY
3         £75.50        75.50 GBP       75.50 GBP      GBP
4       1000 INR         1000 INR        1000 INR      INR
5     120.75 AUD       120.75 AUD      120.75 AUD      AUD
6        200 CAD          200 CAD         200 CAD      CAD
7          20.99            20.99           20.99     None
8      25.50 USD        25.50 USD       25.50 USD      USD
9      30.00 EUR        30.00 EUR       30.00 EUR      EUR
10        40 GBP           40 GBP          40 GBP      GBP
11     45.25 JPY        45.25 JPY       45.25 JPY      JPY
12     50.75 CNY        50.75 CNY       50.75 CNY      CNY
13          None             None            None     None
14          None             None            None     None
15          None             None            None     No

In [230]:
def remove_currency_and_convert_to_float(payment):
    if pd.isna(payment):
        return None
    
    # Define a regex pattern to match the currency code at the end of the payment string
    currency_pattern = r'\b[A-Z]{3}\b'
    
    # Remove the currency code from the payment
    cleaned_payment = re.sub(currency_pattern, '', payment).strip()
    
    # Replace commas with dots (if any)
    cleaned_payment = cleaned_payment.replace(',', '.')
    
    # Try to convert the cleaned payment to float
    try:
        cleaned_payment_float = float(cleaned_payment)
    except ValueError:
        # Return None if conversion fails
        cleaned_payment_float = None
    
    return cleaned_payment_float

In [231]:
df['cleaned_payment_float'] = df['cleaned_payment'].apply(remove_currency_and_convert_to_float)
print(df)

   dirty_payment expected_payment cleaned_payment currency  \
0        $100.00       100.00 USD      100.00 USD      USD   
1         €50,00        50.00 EUR       50.00 EUR      EUR   
2          ¥5000         5000 JPY        5000 JPY      JPY   
3         £75.50        75.50 GBP       75.50 GBP      GBP   
4       1000 INR         1000 INR        1000 INR      INR   
5     120.75 AUD       120.75 AUD      120.75 AUD      AUD   
6        200 CAD          200 CAD         200 CAD      CAD   
7          20.99            20.99           20.99     None   
8      25.50 USD        25.50 USD       25.50 USD      USD   
9      30.00 EUR        30.00 EUR       30.00 EUR      EUR   
10        40 GBP           40 GBP          40 GBP      GBP   
11     45.25 JPY        45.25 JPY       45.25 JPY      JPY   
12     50.75 CNY        50.75 CNY       50.75 CNY      CNY   
13          None             None            None     None   
14          None             None            None     None   
15      

In [232]:
result = test_cleaning(clean_payment, df, 'dirty_payment', 'expected_payment')

# Printing the cleaning result
print(pprint.pformat(OrderedDict(cleaning_summary_currency)))

Comparison Result:
OrderedDict([('null_values_encountered', 28),
             ('leading_trailing_whitespace', 0),
             ('commas_replaced', 4),
             ('currency_symbol_replaced', 16)])


In [236]:
# Separate test-case for currency extraction.
# We slighly misuse the create_dirty_and_expected_data function, we create 2 expected result

create_dirty_and_expected_data(
    df,
    'expected_currency',
    'expected_payment_float',
    [
        'foo',
        'USD',
        'EUR',
        'JPY',
        'GBP',
        'INR',
        'AUD',
        'CAD',
        None,
        'USD',
        'EUR',
        'GBP',
        'JPY',
        'CNY'
    ],
    [
        'This_is_an_intentional_false_negative',
        100,
        50.00,
        5000,
        75.50,
        1000,
        120.75,
        200,
        20.99,
        25.50,
        30.00,
        40,
        45.25,
        50.75
    ]
)

In [237]:
result = test_cleaning(extract_currency, df, 'cleaned_payment', 'expected_currency')

Comparison Result:


In [238]:
result = test_cleaning(remove_currency_and_convert_to_float, df, 'cleaned_payment', 'expected_payment_float')

Comparison Result:
(Row 13) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 14) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 15) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 16) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 17) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 18) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 
(Row 19) Cleaned Value: nan <class 'float'>, Expected Value: nan <class 'float'>, 


## Dealing with: Boolean

In [33]:
from scripts.clean_boolean import clean_boolean

In [46]:
create_dirty_and_expected_data(
    df,
    'dirty_bool',
    'expected_bool',
    [
        'foo',
        '1',
        '0',
        'yes',
        'YES',
        'y',
        'Y',
        'true',
        'TRUE',
        'True',
        'false',
        'FALSE',
        'False',
        'no',
        'NO',
        'No',
        'n',
        'N',
        'a',
        3
    ],
    [
        'This_is_an_intentional_false_negative',
        True,
        False,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
        None,
        None
    ]
)

In [47]:
# Run the testing function
result = test_cleaning_report(clean_boolean, df, df_report, 'dirty_bool', 'expected_bool')

Comparison Result:
(Row 0) Cleaned Value: None <class 'NoneType'>, Expected Value: This_is_an_intentional_false_negative <class 'str'>, 


In [23]:
# Running the actual cleaning
df['cleaned_bool'] = df['dirty_bool'].apply(clean_boolean, df_report=df_report, dirty_column_name='dirty_bool')
print(df_report)

   dirty_dates_nulls_encountered  dirty_dates_parsing_success  \
0                              3                           10   

   dirty_dates_parsing_failed  dirty_phonenull_values_encountered  \
0                           7                                   0   

   dirty_phonevalid_phone_numbers  dirty_phoneinvalid_phone_numbers  \
0                              17                                 3   

   dirty_geo_null_values_encountered  dirty_geo_leading_trailing_whitespace  \
0                                 15                                      1   

   dirty_geo_converted_to_float  dirty_geo_ValueError --> None  ...  \
0                             3                              2  ...   

   dirty_emaildouble_at_symbols  dirty_emailhyphens_in_domain_name  \
0                             2                                  2   

   dirty_emailinvalid_email_pattern  dirty_emailvalid_emails  \
0                                16                       15   

   dirty_bool_n

## Cleaning survey salary data

In [31]:
from scripts.clean_salary import clean_salary, clean_salary_og

In [32]:
??  clean_salary_og

[1;31mSignature:[0m   [0mclean_salary_og[0m[1;33m([0m[0mx[0m[1;33m,[0m [0mdf_report[0m[1;33m,[0m [0mprefix[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0mclean_salary_og[0m[1;33m([0m[0mx[0m[1;33m,[0m [0mdf_report[0m[1;33m,[0m [0mprefix[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m""" A cleaning function for salary with reporting. 
    Fills an empty df (df_report) with statistics of what modifications this function made.
    Keeps track of which column the statistics was gathered from (prefix).
    Usage example: df['cleaned_salary'] = df['dirty_salary'].apply(clean_salary, df_report=df_report, prefix='mydf_dirty_salary')
    
    Tip: Should be used before pd.to_numeric()
    Tip: Handling none-s should be done separately, this function skips nones.
    """[0m[1;33m
[0m    [1;31m# Initialize columns if they don't exist[0m[1;33m
[0m    [0mcolumns_to_initialize[0m [1;33m=[0m [1;33m[[0m[1;33m
[0m        [

In [33]:
df_salary = pd.DataFrame(index=range(10))
df_salary_report = pd.DataFrame(index=[0])

# Creating uncleaned data and the expected data, to test our cleaning function
create_dirty_and_expected_data(
    df_salary,
    'salary',
    'expected_salary_1',
    [
        'foo',
        '1',
        1,
        '$1',
        '>1',
        'one',
        ' 1',
        '1,0',
        '1.0'
    ],
    [
        'foo',
        '1',
        1,
        '$1',
        '>1',
        'one',
        '1',
        '1.0',
        '1.0'
    ]
)

In [34]:
df_salary['converted_salary_1'] = df_salary['salary'].apply(clean_salary,df_salary_report,'dirty_salary')
print(df_salary)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [35]:
print(df_salary_report)

Empty DataFrame
Columns: []
Index: [0]


### removing special characters

In [16]:
df_salary['salary_2'] = df_salary['salary'].str.replace('$', '')
df_salary['salary_2'] = df_salary['salary'].str.replace('> ', '')

  df_salary['salary_2'] = df_salary['salary'].str.replace('$', '')


### pd.to_numeric()

In [36]:
#salary_2 = converted_salary_1

# Creating uncleaned data and the expected data, to test our cleaning function
create_dirty_and_expected_data(
    df_salary,
    'salary_2',
    'expected_salary_2',
    [
        'foo',
        1,
        '1',
        '$1',
        '>1'
        'one',
        '1',
        '1.0',
        '1.0'
    ],
    [
        None,
        1,
        1,
        1,
        1,
        None,
        1,
        1,
        1
    ]
)

In [42]:
df_salary['to_numeric_coerce'] = df_salary['salary'].apply(pd.to_numeric, errors='coerce')
print(df_salary)

  salary expected_salary_1 salary_2  expected_salary_2  converted_salary  \
0    foo               foo      foo                NaN               NaN   
1      1                 1        1                1.0               1.0   
2      1                 1        1                1.0               1.0   
3     $1                $1       $1                1.0               NaN   
4     >1                >1    >1one                1.0               NaN   
5    one               one        1                NaN               NaN   
6      1                 1      1.0                1.0               1.0   
7    1,0               1.0      1.0                1.0               NaN   
8    1.0               1.0     None                1.0               1.0   
9   None              None     None                NaN               NaN   

   to_numeric_coerce  
0                NaN  
1                1.0  
2                1.0  
3                NaN  
4                NaN  
5                NaN  
6 

In [43]:
df_salary['to_numeric_raise'] = df_salary['salary'].apply(pd.to_numeric, errors='raise')
print(df_salary)

ValueError: Unable to parse string "foo" at position 0

### Filling Nulls with 0.

# The .info() function

In [7]:
data = {
    'integers': [1, 2, 3, 4, 5],
    'mixed': [1, 2, 'a', 4, 5],
    'withnull': [1, 2, None, 4, 5]
}

df = pd.DataFrame(data)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   integers  5 non-null      int64  
 1   mixed     5 non-null      object 
 2   withnull  4 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


In [10]:
df['integers'].astype('int64')

0    1
1    2
2    3
3    4
4    5
Name: integers, dtype: int64

In [11]:
df['mixed'].astype('int64')

ValueError: invalid literal for int() with base 10: 'a'

In [12]:
df['withnull'].astype('int64')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# Null values

In [15]:
import pandas as pd
import numpy as np

# Sample DataFrame
data = {
    'salary_eur': [64000.0, 55000.0, 70000.0, None, 63000.0, 66000.0, float('inf'), 72000.0, 68000.0, 100000.0]
}
df_salary_conversion = pd.DataFrame(data)

# Drop NaN values from 'salary_eur' and assign back to the DataFrame
df_salary_conversion['salary_eur'].dropna(inplace=True)

# Check for non-finite values after dropping NaN
non_finite_values = df_salary_conversion[~df_salary_conversion['salary_eur'].apply(lambda x: pd.notnull(x) and np.isfinite(x))]
print("Non-finite values:")
print(non_finite_values)

Non-finite values:
   salary_eur
3         NaN
6         inf


In [16]:
import pandas as pd
import numpy as np

# Create a DataFrame with various types of missing values
data = {
    'A': [1, 2, None, 4, np.nan],
    'B': [5.0, 6.0, np.nan, None, 9.0],
    'C': ['foo', 'bar', None, np.nan, 'baz']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print()

# Using .isna() to check for missing values
print("Boolean mask for missing values:")
print(df.isna())
print()

# Using .dropna() to drop rows with missing values
cleaned_df = df.dropna()
print("DataFrame after dropping rows with any NaN values:")
print(cleaned_df)
print()

# Customizing .dropna() to drop columns with missing values
cleaned_columns_df = df.dropna(axis=1)
print("DataFrame after dropping columns with any NaN values:")
print(cleaned_columns_df)

Original DataFrame:
     A    B     C
0  1.0  5.0   foo
1  2.0  6.0   bar
2  NaN  NaN  None
3  4.0  NaN   NaN
4  NaN  9.0   baz

Boolean mask for missing values:
       A      B      C
0  False  False  False
1  False  False  False
2   True   True   True
3  False   True   True
4   True  False  False

DataFrame after dropping rows with any NaN values:
     A    B    C
0  1.0  5.0  foo
1  2.0  6.0  bar

DataFrame after dropping columns with any NaN values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [18]:
#Correct way of dropping None

df_it18_u = df_it18_u.dropna(subset=['salary_eur'])

NameError: name 'df_it18_u' is not defined

# Searching function

## Single keyword search

In [5]:
# Importing the function
from scripts.search_keyword import search_single_keyword, search_double_keyword

In [6]:
?? search_single_keyword

[1;31mSignature:[0m  [0msearch_single_keyword[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0msearch_column[0m[1;33m,[0m [0mnew_column[0m[1;33m,[0m [0mkeyword[0m[1;33m,[0m [0mkey_value[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0msearch_single_keyword[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0msearch_column[0m[1;33m,[0m [0mnew_column[0m[1;33m,[0m [0mkeyword[0m[1;33m,[0m [0mkey_value[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    Searches for keywords in a specified column and fills a new column with a key-value pair where there is a match.

    Parameters:
    df (pd.DataFrame): The dataframe to operate on.
    search_column (str): The name of the column to search the keywords in.
    new_column (str): The name of the new column to insert the key-value pair into.
    keyword (str): A keyword to search for.
    key_value (str): The value to insert into the new column where a keyword match is found.

    Returns:
    pd

In [7]:
df_search = pd.DataFrame(index=range(10))

# Creating uncleaned data and the expected data, to test our cleaning function
create_dirty_and_expected_data(
    df_search,
    'job_title',
    'expected_job_category',
    [
        'foo',
        'Quality Engineer',
        'Quality Maintenance Developer',
        'Quality Development Maintaner',
        'qquality'
    ],
    [
        None,
        'Q',
        'Q',
        'Q',
        'Q'
    ]
)

In [9]:
df = search_single_keyword(df_search, 'job_title', 'job_category', 'quality', 'Q')
print(df)

                       job_title expected_job_category job_category
0                            foo                  None         None
1               Quality Engineer                     Q            Q
2  Quality Maintenance Developer                     Q            Q
3  Quality Development Maintaner                     Q            Q
4                       qquality                     Q            Q
5                           None                  None         None
6                           None                  None         None
7                           None                  None         None
8                           None                  None         None
9                           None                  None         None


## Double keyword search

In [6]:
?? search_double_keyword

[1;31mSignature:[0m
 [0msearch_double_keyword[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0msearch_column[0m[1;33m,[0m[1;33m
[0m    [0mnew_column[0m[1;33m,[0m[1;33m
[0m    [0mkeyword1[0m[1;33m,[0m[1;33m
[0m    [0mkeyword2[0m[1;33m,[0m[1;33m
[0m    [0mkey_value[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0msearch_double_keyword[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0msearch_column[0m[1;33m,[0m [0mnew_column[0m[1;33m,[0m [0mkeyword1[0m[1;33m,[0m [0mkeyword2[0m[1;33m,[0m [0mkey_value[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    Searches for two keywords in a specified column and fills a new column with a key-value pair where both keywords are present.

    Parameters:
    df (pd.DataFrame): The dataframe to operate on.
    search_column (str): The name of the column to search the keywords in.
    new_column (str): The name of the new column to i

In [7]:
df_search = pd.DataFrame(index=range(10))

# Creating uncleaned data and the expected data, to test our cleaning function
create_dirty_and_expected_data(
    df_search,
    'job_title',
    'expected_job_category',
    [
        'foo',
        'Data Analyst',
        'Data Engineer',
        'Data Scientist',
        'data analyst'
    ],
    [
        None,
        'DA',
        None,
        None,
        'DA'
    ]
)

In [8]:
df = search_double_keyword(df_search, 'job_title', 'job_category', 'data','analyst', 'DA')
print(df)

        job_title expected_job_category job_category
0             foo                  None         None
1    Data Analyst                    DA           DA
2   Data Engineer                  None         None
3  Data Scientist                  None         None
4    data analyst                    DA           DA
5            None                  None         None
6            None                  None         None
7            None                  None         None
8            None                  None         None
9            None                  None         None


# Exporting the report

In [69]:
df_report_t = df_report.T

In [73]:
df_report_t.rename(columns={0: 'Occurrence'}, inplace=True)

In [75]:
df_report_t.to_csv('../results/Cleaning_report.txt', sep='\t', index=True)

In [None]:
del df_report

# Reloading a module

In [None]:
# Reloading a module
import importlib
import sys

# Add the parent directory of 'scripts' to the module search path
sys.path.append('../')

# Reload the module
importlib.reload(scripts.clean_phone)
from scripts.clean_phone import clean_phone

# Multi-level function implementation

In [None]:
# Playing with multi-level implementation of functions

import sys
sys.path.append('../')  # Add the main project directory to sys.path

from scripts.func1 import do_something

result = do_something()
print(result)