In [None]:
import pandas as pd
from datetime import datetime
import re

def readFile(filePath, fileFormat):
    """Load a tabular data file into a pandas DataFrame.

    filePath (str): location of the file to read.
    fileFormat (str): one of 'csv', 'tsv', 'xlsx' or 'json'.

    Returns the loaded DataFrame. Every failure (missing file, empty file,
    parse error, unsupported format, or any other exception) is reported
    with print() instead of being raised, and the function then returns None."""

    # (format name, loader) pairs, compared with == in this order.
    loaders = (
        ('csv', pd.read_csv),
        ('tsv', lambda path: pd.read_csv(path, sep='\t')),
        ('xlsx', pd.read_excel),
        ('json', pd.read_json),
    )

    try:
        for supported_format, load in loaders:
            if fileFormat == supported_format:
                return load(filePath)

        # Raised inside the try on purpose: the ValueError handler below reports it.
        raise ValueError(f"Unsupported file format: {fileFormat}")

    except FileNotFoundError:
        print(f"Error: The file '{filePath}' was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{filePath}' is empty.")
    except pd.errors.ParserError:
        print(f"Error: The file '{filePath}' could not be parsed.")
    except ValueError as ve:
        print(f"Error: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def removeDuplicates(dataFrame):
    """Return a new DataFrame without the fully duplicated rows of dataFrame.

    dataFrame (DataFrame): the frame to de-duplicate.

    Delegates to DataFrame.drop_duplicates() with its default arguments and
    returns that result; the result is not assigned back to dataFrame here."""
    deduplicated = dataFrame.drop_duplicates()
    return deduplicated

In [None]:
def transformDate(df, date_columns, dateFormat='MM-DD-YY', dateType='numeric', abbreviated=False):
    """Rewrite the values of the given date columns as formatted date strings.

       df (DataFrame): frame to update; the listed columns are overwritten on this object
       date_columns (list): list of columns containing date information
                            (a single column name given as a string is also accepted)
       dateFormat (str): the desired date format ('MM-DD-YY', 'DD-MM-YY' or 'YYYY-MM-DD');
                         always validated, but only used when dateType is 'numeric'
       dateType (str): whether the date should be 'numeric' or 'string'
       abbreviated (bool): whether the month name should be abbreviated or not
                           (only used when dateType is 'string')

       Returns the same df object that was passed in.
       Missing values and values that cannot be parsed as a date are left unchanged.
       Raises ValueError for an unsupported dateFormat or dateType, before any
       column is touched."""

    format_mappings = {
        'MM-DD-YY': '%m-%d-%y',
        'DD-MM-YY': '%d-%m-%y',
        'YYYY-MM-DD': '%Y-%m-%d'
    }

    if dateFormat not in format_mappings:
        raise ValueError(f"Unsupported date format: {dateFormat}")

    # Validate dateType here rather than per value: otherwise an invalid
    # dateType goes unnoticed whenever no cell holds a parseable date.
    # The strftime pattern is the same for every value, so pick it once.
    if dateType == 'numeric':
        output_pattern = format_mappings[dateFormat]
    elif dateType == 'string':
        output_pattern = '%b %d, %Y' if abbreviated else '%B %d, %Y'
    else:
        raise ValueError(f"Unsupported date type: {dateType}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(date_columns, str):
        date_columns = [date_columns]

    def format_date(value):
        if pd.isna(value):  # keep NaN as it is
            return value
        date_obj = pd.to_datetime(value, errors='coerce')

        # errors='coerce' yields NaT for unparseable input; keep the original value.
        if pd.isna(date_obj):
            return value

        return date_obj.strftime(output_pattern)

    for col in date_columns:
        df[col] = df[col].apply(format_date)

    return df

In [4]:
def fixEmailFormat(df, email_columns, strict=False):
    """Normalise the e-mail addresses stored in the given columns.

       df (DataFrame): frame to update; the listed columns are overwritten on this
                       object and, in strict mode, rows are dropped from it in place.
       email_columns (list): list of columns containing email addresses to be fixed
                             (a single column name given as a string is also accepted).
       strict (bool): if True, only keep rows with valid email addresses. If False, attempt to fix.

       Every string value is stripped and lower-cased. A value counts as valid only
       when the WHOLE string looks like local@domain.tld with no whitespace and
       exactly one '@'. In non-strict mode an invalid value gets one repair attempt
       (all whitespace removed) and becomes None if it is still invalid. In strict
       mode invalid values become None and every row with a missing value in any of
       the listed columns is dropped. Non-string, non-missing values are treated as
       invalid instead of raising.

       Returns the same df object that was passed in."""

    # Used with fullmatch so trailing junk ("a@b.com@evil") is rejected, and
    # \s is excluded so embedded whitespace cannot pass as part of an address.
    valid_email = re.compile(r"[^@\s]+@[^@\s]+\.[a-zA-Z]{2,}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(email_columns, str):
        email_columns = [email_columns]

    def clean_email(email):
        if pd.isna(email):
            return email
        if not isinstance(email, str):
            # e.g. a number read from a malformed file: not an address, do not crash
            return None

        email = email.strip().lower()
        if valid_email.fullmatch(email):
            return email
        if strict:
            return None

        # Repair attempt: drop embedded whitespace and re-validate.
        email = re.sub(r"\s+", "", email)
        return email if valid_email.fullmatch(email) else None

    for col in email_columns:
        df[col] = df[col].apply(clean_email)

    if strict:
        # In place on purpose: callers holding a reference to df see the rows removed.
        df.dropna(subset=email_columns, inplace=True)

    return df

In [3]:
def main(file_path='data.csv', file_format='csv'):
    """Run an interactive, menu-driven cleaning session on one data file.

    file_path (str): file to load; defaults to 'data.csv'.
    file_format (str): format passed to readFile; can be 'csv', 'tsv', 'xlsx' or 'json'.

    Loads the file with readFile and returns immediately if that yields None.
    Otherwise it repeatedly shows a menu (remove duplicates, transform the 'Date'
    column, fix the 'Email' column, exit), applies the chosen operation and
    prints the first rows of the result. After each round the user is asked
    whether to continue; any answer other than 'yes' ends the session.

    A KeyError or ValueError raised by an operation (for example a missing
    'Date'/'Email' column or an unsupported date format typed by the user) is
    printed and the session continues, rather than aborting the loop."""
    df = readFile(file_path, file_format)

    # readFile prints its own error message and returns None when loading fails.
    if df is None:
        return

    while True:
        print("\nChoose an operation to perform:")
        print("1. Remove Duplicates")
        print("2. Transform Date Format")
        print("3. Fix Email Format")
        print("4. Exit")
        choice = input("Enter the number of the operation (1, 2, 3, or 4): ")

        try:
            if choice == '1':
                df = removeDuplicates(df)
                print("\nDuplicates removed:")
                print(df.head())

            elif choice == '2':
                date_columns = ['Date']
                dateFormat = input("Enter date format (e.g., 'MM-DD-YY', 'DD-MM-YY', 'YYYY-MM-DD'): ")
                dateType = input("Enter date type ('numeric' or 'string'): ")
                abbreviated = input("Abbreviate month names? (yes or no): ").lower() == 'yes'

                df = transformDate(df, date_columns, dateFormat=dateFormat, dateType=dateType, abbreviated=abbreviated)
                print("\nDate format transformed:")
                print(df.head())

            elif choice == '3':
                email_columns = ['Email']
                strict_mode = input("Strict mode? (yes or no): ").lower() == 'yes'

                df = fixEmailFormat(df, email_columns, strict=strict_mode)
                print("\nEmail format fixed:")
                print(df.head())

            elif choice == '4':
                print("Exiting the program.")
                break

            else:
                print("Invalid choice. Please enter 1, 2, 3, or 4.")

        except (KeyError, ValueError) as err:
            # Free-text input and the fixed column names can both be wrong for
            # this file; report it and keep the session (and df) alive.
            print(f"Error: the operation could not be completed: {err}")

        another = input("\nDo you want to perform another operation? (yes or no): ").lower()
        if another != 'yes':
            print("Exiting the program.")
            break

# Start the interactive cleaning session as soon as this cell is executed.
main()



Choose an operation to perform:
1. Remove Duplicates
2. Transform Date Format
3. Fix Email Format
4. Exit

Duplicates removed:
      Name        Date      City                 Email
0    Alice  2024-10-16  New York    alice@example.com 
2      Bob  2023-05-23    London         BOB@GMAIL.COM
3  Charles  2024-01-02     Paris       charles@invalid
4    David  2022-12-10    Berlin    david@@example.com
5      Eve  2023-07-19     Tokyo        eve@domain.co 
Exiting the program.
