In [2]:
import pandas as pd
import json
import xml.etree.ElementTree as ET
import os

def clean_data(df):
    """Return ``df`` without incomplete or repeated rows.

    Rows containing at least one missing value are removed first, then exact
    duplicate rows are removed (pandas keeps the first occurrence by default).
    Index labels of the surviving rows are kept as they were, so the result's
    index can have gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The frame with incomplete and duplicate rows removed.
    """
    # No preview calls here: a bare df.head()/df.tail() inside a function has
    # its result discarded and displays nothing. Inspect the returned frame
    # from the calling cell instead.
    return df.dropna().drop_duplicates()

def read_csv(file_path):
    """Load a CSV file with pandas' default options and clean it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table after passing through ``clean_data``.
    """
    return clean_data(pd.read_csv(file_path))

def read_excel(file_path):
    """Load an Excel workbook with pandas' default options and clean it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.xls`` / ``.xlsx`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed sheet after passing through ``clean_data``.
    """
    return clean_data(pd.read_excel(file_path))

def read_json(file_path):
    """Load a JSON file, flatten it into a DataFrame and clean it.

    The decoded document is passed to ``pandas.json_normalize``, so nested
    objects are flattened into dotted column names.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON document.

    Returns
    -------
    pandas.DataFrame
        The flattened data after passing through ``clean_data``.

    Raises
    ------
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Binary mode lets the json module detect UTF-8/16/32 (including a UTF-8
    # BOM) by itself instead of relying on the platform's default text
    # encoding, which is not UTF-8 on every system.
    with open(file_path, 'rb') as file:
        data = json.load(file)
    df = pd.json_normalize(data)
    return clean_data(df)

def read_plain_text(file_path):
    """Load a text file as a one-column DataFrame and clean it.

    Every line of the file becomes one row in a column named ``text``. Lines
    are stored exactly as read, including their trailing newline characters.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        The lines after passing through ``clean_data``.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()
    return clean_data(pd.DataFrame(raw_lines, columns=['text']))

def read_xml(file_path):
    """Load an XML file into a DataFrame (one row per record) and clean it.

    Each direct child of the document root is treated as one record. The
    record's XML attributes and the text of its child elements become the
    columns of that row; if a tag repeats inside one record, the last
    occurrence wins. A record with neither attributes nor child elements
    contributes its own text under its own tag name.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XML document.

    Returns
    -------
    pandas.DataFrame
        The parsed records after passing through ``clean_data``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    # Parsed with the standard-library ElementTree (imported above as ET).
    # `xmltodict` is never imported in this cell, so relying on it raised
    # NameError on a fresh kernel. ET.parse also honours the encoding declared
    # in the document instead of the platform default.
    root = ET.parse(file_path).getroot()

    records = []
    for record in root:
        row = dict(record.attrib)
        for field in record:
            row[field.tag] = field.text
        if not row:
            # Leaf element directly under the root: keep its text content.
            row[record.tag] = record.text
        records.append(row)

    df = pd.DataFrame(records)
    return clean_data(df)

def read_pdf(file_path):
    """Extract the text of every page of a PDF into a DataFrame and clean it.

    Each page becomes one row in a column named ``text``, in page order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the PDF file.

    Returns
    -------
    pandas.DataFrame
        The per-page text after passing through ``clean_data``.

    Raises
    ------
    ImportError
        If the PyPDF2 package is not installed.
    """
    # Imported here because the cell's import block does not bring PyPDF2 into
    # scope; this also keeps the other readers usable when PyPDF2 is absent.
    import PyPDF2

    # PdfReader / .pages replace PdfFileReader / getNumPages / getPage, which
    # were removed in PyPDF2 3.0. Both are available in every PyPDF2 release
    # that provides page.extract_text().
    reader = PyPDF2.PdfReader(file_path)
    page_texts = [page.extract_text() for page in reader.pages]
    df = pd.DataFrame(page_texts, columns=['text'])
    return clean_data(df)

def process_file(file_path):
    """Load and clean a file, picking the reader from its extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.csv``, ``.xls``, ``.xlsx``, ``.json``, ``.txt``, ``.xml`` and ``.pdf``.

    Parameters
    ----------
    file_path : str
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame
        Whatever the matching ``read_*`` function returns.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    """
    _, raw_extension = os.path.splitext(file_path)
    file_extension = raw_extension.lower()

    # Guard clauses: the first matching format returns immediately.
    if file_extension == '.csv':
        return read_csv(file_path)
    if file_extension in ('.xls', '.xlsx'):
        return read_excel(file_path)
    if file_extension == '.json':
        return read_json(file_path)
    if file_extension == '.txt':
        return read_plain_text(file_path)
    if file_extension == '.xml':
        return read_xml(file_path)
    if file_extension == '.pdf':
        return read_pdf(file_path)

    raise ValueError(f"Unsupported file format: {file_extension}")

# Example usage: load and clean the retail transactions file, then inspect it.
file_path = 'Retail Transactions.csv'
cleaned_data = process_file(file_path)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
cleaned_data.info()
print(cleaned_data.shape)
print(cleaned_data.head())
print(cleaned_data.tail())

<class 'pandas.core.frame.DataFrame'>
Index: 12124 entries, 3 to 75610
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transaction_date  12124 non-null  object 
 1   transaction_hour  12124 non-null  object 
 2   location_state    12124 non-null  object 
 3   location_city     12124 non-null  object 
 4   rewards_number    12124 non-null  object 
 5   rewards_member    12124 non-null  object 
 6   num_of_items      12124 non-null  int64  
 7   coupon_flag       12124 non-null  object 
 8   discount_amt      12124 non-null  float64
 9   order_amt         12124 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.0+ MB
None
(12124, 10)
   transaction_date transaction_hour  location_state    location_city  \
3         7/22/2020          4:56 PM         Florida         Sarasota   
8          3/6/2020          8:48 PM         Georgia    Lawrenceville   
10         2/7/2020          8:09 