In [None]:
import pandas as pd
import numpy as np
import io

# --- 1. Simulate File Data ---
# Everything in this section lives in memory so the notebook runs without any
# files on disk. With real data you would hand the pandas readers your actual
# file paths (e.g., 'sales.csv', 'data.xlsx') instead of the
# io.StringIO/io.BytesIO objects.

# CSV source text. The third data row repeats the first one exactly.
csv_data = """Date,ProductID,Amount,Region
2024-01-01,P100,150,North
2024-01-02,P101,200,South
2024-01-01,P100,150,North
2024-01-04,P102,500,West
"""

# Excel source: column-oriented records that get serialized to a workbook below.
excel_data = dict(
    OrderDate=['2024-01-05', '2024-01-06', '2024-01-07'],
    ItemCode=['P103', 'P102', 'P104'],
    Sales=[300, 500, np.nan],  # the last order has no recorded amount
    Area=['East', 'West', 'South'],
)
# Write the workbook into a byte buffer, then move the cursor back to the
# first byte so a reader starts from the beginning of the file.
excel_file = io.BytesIO()
pd.DataFrame(excel_data).to_excel(excel_file, engine='openpyxl', index=False)
excel_file.seek(0)

# JSON source text: a list of records, one object per sale.
json_data = """
[
  {"date": "2024-01-08", "product_id": "P101", "total": 200, "location": "South"},
  {"date": "2024-01-09", "product_id": "P105", "total": 120, "location": "North"}
]
"""

# --- 2. Load Data into DataFrames ---

print("--- 2. Loading Data ---")
# Load CSV (the text is wrapped in a file-like object for the reader)
df_csv = pd.read_csv(io.StringIO(csv_data))
# Load Excel. Rewind first so this section can be re-run after the buffer has
# already been consumed by an earlier read.
excel_file.seek(0)
df_excel = pd.read_excel(excel_file, engine='openpyxl')
# Load JSON
df_json = pd.read_json(io.StringIO(json_data))

# sep="\n" starts each preview on its own line. Passing "label\n" and the
# frame as two arguments would insert print's default separator (a space)
# before the frame and push the header row one column out of alignment.
print("Initial CSV Data:", df_csv.head(), sep="\n")
print("\nInitial Excel Data:", df_excel.head(), sep="\n")
print("\nInitial JSON Data:", df_json.head(), sep="\n")
print("-" * 30)


# --- 3. Explore and Identify Issues ---

print("\n--- 3. Exploring Data ---")
print("\nCSV Info:")
df_csv.info()
print("\nExcel Info:")
df_excel.info()
print("\nJSON Info:")
df_json.info()

# Findings 3-5 are measured from the loaded frames rather than stated as fixed
# text, so the report stays truthful if the inputs change.
duplicate_count = int(df_csv.duplicated().sum())

# Each source names its date column differently. A source is listed as
# unparsed when that column is not a datetime dtype yet (read_json may already
# have parsed its 'date' column while loading).
date_columns = {
    'CSV': df_csv['Date'],
    'Excel': df_excel['OrderDate'],
    'JSON': df_json['date'],
}
unparsed_sources = [
    source
    for source, column in date_columns.items()
    if not pd.api.types.is_datetime64_any_dtype(column)
]

print("\nIdentified Issues:")
print("1. Inconsistent column names (e.g., 'Amount', 'Sales', 'total')")
print("2. Inconsistent column names (e.g., 'Region', 'Area', 'location')")
print("3. Missing values found in Excel data:")
print(df_excel.isnull().sum())
print(f"4. Duplicate rows found in CSV data: {duplicate_count}")
print(f"5. Date columns not yet 'datetime' type in: {', '.join(unparsed_sources) or 'none'}")
print("-" * 30)


# --- 4. Clean and Standardize Data ---

print("\n--- 4. Cleaning and Standardizing ---")


def standardize_sales(frame, column_map=None):
    """Return a copy of ``frame`` renamed to the shared schema with a parsed Date.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw data from one source.
    column_map : dict, optional
        Maps the source's column names to the shared names
        ('Date', 'ProductID', 'Amount', 'Region'). Omit it when the source
        already uses the shared names.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If there is no 'Date' column after renaming.
    """
    renamed = frame.rename(columns=column_map or {})
    # assign() builds a new frame instead of writing into `renamed`, and keeps
    # 'Date' in its existing column position.
    return renamed.assign(Date=pd.to_datetime(renamed['Date']))


# --- 4a. Standardize CSV Data ---
# The CSV already uses the shared column names, so no mapping is needed.
df_csv_clean = standardize_sales(df_csv).drop_duplicates()
print("CSV data cleaned.")

# --- 4b. Standardize Excel Data ---
df_excel_clean = standardize_sales(df_excel, {
    'OrderDate': 'Date',
    'ItemCode': 'ProductID',
    'Sales': 'Amount',
    'Area': 'Region'
})
# Handle missing values (dropping the row)
df_excel_clean = df_excel_clean.dropna(subset=['Amount'])
# Amount was read as float only because of the NaN. With that row gone, cast
# it back to integer so it matches the other sources and the unified column is
# not upcast to float. The cast is skipped when any amount has a fractional
# part, so values are never silently truncated.
if (df_excel_clean['Amount'] % 1 == 0).all():
    df_excel_clean = df_excel_clean.astype({'Amount': 'int64'})
print("Excel data cleaned.")

# --- 4c. Standardize JSON Data ---
df_json_clean = standardize_sales(df_json, {
    'date': 'Date',
    'product_id': 'ProductID',
    'total': 'Amount',
    'location': 'Region'
})
print("JSON data cleaned.")
print("-" * 30)


# --- 5. Unify Data into a Single DataFrame ---

print("\n--- 5. Unifying Data ---")
# Stack the three cleaned sources top to bottom. ignore_index=True discards
# each source's own row labels and numbers the combined rows from 0.
unified_sales_df = pd.concat(
    objs=[df_csv_clean, df_excel_clean, df_json_clean],
    ignore_index=True,
)

# To print every row instead of a preview: print(unified_sales_df.to_string())

print("\n--- Final Unified Info ---")
unified_sales_df.info()

print("\n--- Final Unified Head ---")
# First nine rows at most.
print(unified_sales_df.iloc[:9])

print("\n--- Final Descriptive Statistics ---")
print(unified_sales_df.describe())

--- 2. Loading Data ---
Initial CSV Data:
          Date ProductID  Amount Region
0  2024-01-01      P100     150  North
1  2024-01-02      P101     200  South
2  2024-01-01      P100     150  North
3  2024-01-04      P102     500   West

Initial Excel Data:
     OrderDate ItemCode  Sales   Area
0  2024-01-05     P103  300.0   East
1  2024-01-06     P102  500.0   West
2  2024-01-07     P104    NaN  South

Initial JSON Data:
         date product_id  total location
0 2024-01-08       P101    200    South
1 2024-01-09       P105    120    North
------------------------------

--- 3. Exploring Data ---

CSV Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       4 non-null      object
 1   ProductID  4 non-null      object
 2   Amount     4 non-null      int64 
 3   Region     4 non-null      object
dtypes: int64(1), object(3)
memory usage: 260.0+

In [None]:
pip install openpyxl
import pandas as pd
import numpy as np
# Note: The 'io' import is no longer needed when reading from disk.

# ------------------------------------------------------------------
# 1. DEFINE FILE PATHS
#    (Adjust these paths if your files are not in the same folder)
# ------------------------------------------------------------------
CSV_PATH = "sales_data.csv"
EXCEL_PATH = "sales_data.xlsx"
JSON_PATH = "sales_data.json"

# --- 2. LOAD DATA INTO DATAFRAMES FROM EXTERNAL FILES ---

print("--- 2. Loading Data from External Files ---")
try:
    # Load CSV using its file path
    df_csv = pd.read_csv(CSV_PATH)

    # Load Excel using its file path (requires 'openpyxl' engine)
    df_excel = pd.read_excel(EXCEL_PATH, engine='openpyxl')

    # Load JSON using its file path
    df_json = pd.read_json(JSON_PATH)

    print(f"Successfully loaded data from {CSV_PATH}, {EXCEL_PATH}, and {JSON_PATH}")

    # Display a preview of the loaded data
    print("\nInitial CSV Data Head:\n", df_csv.head())

except FileNotFoundError as e:
    print(f"\n‚ùå ERROR: File not found. Please ensure the file '{e.filename}' exists in the correct path.")
    # Exit gracefully or handle the error as needed
    df_csv, df_excel, df_json = None, None, None

# ------------------------------------------------------------------
# The rest of your cleaning and unifying pipeline (Tasks 3, 4, 5)
# would follow here, using df_csv, df_excel, and df_json.
# ------------------------------------------------------------------

if df_csv is not None:
    print("\nReady for Cleaning and Unification (Tasks 3-5).")


Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
