In [2]:
import pandas as pd

# Sample employee records: ten rows, one list per column.
data = dict(
    EmpID=list(range(1, 11)),
    Name=[
        "John Smith", "Sarah Lee", "Michael Brown", "Linda Taylor", "David Wilson",
        "Emma Davis", "James Miller", "Sophia Johnson", "Daniel Garcia", "Olivia Martinez",
    ],
    Department=[
        "HR", "IT", "Finance", "Marketing", "IT",
        "Finance", "HR", "Marketing", "IT", "Finance",
    ],
    Salary=[55000, 72000, 68000, 62000, 80000, 75000, 50000, 64000, 90000, 71000],
    Experience=[5, 7, 6, 4, 10, 9, 3, 5, 12, 8],
)
df = pd.DataFrame(data)

# 1. CSV copy (row index omitted so only the five data columns are written)
df.to_csv(path_or_buf="employees.csv", index=False)
print("✅ employees.csv created")

# 2. Excel copy, stored on a named sheet so it can be read back by name
df.to_excel(excel_writer="employees.xlsx", sheet_name="EmployeeSheet", index=False)
print("✅ employees.xlsx created")

# 3. JSON copy: a list of one object per employee, pretty-printed
df.to_json(path_or_buf="employees.json", orient="records", indent=4)
print("✅ employees.json created")





✅ employees.csv created
✅ employees.xlsx created
✅ employees.json created


In [None]:
import pandas as pd
import sqlite3
import numpy as np

# 1. Read CSV and display first 5 rows
# Label and frame are passed as separate print() arguments joined by sep="\n";
# with the default separator a space lands in front of the frame and shifts
# its header row one column out of line with the data rows.
df_csv = pd.read_csv("employees.csv")
print("CSV Data (first 5 rows):", df_csv.head(), sep="\n")


# 2. Read Excel and extract specific columns
# The full sheet stays in df_excel (it is merged later); the subset is display-only.
df_excel = pd.read_excel("employees.xlsx", sheet_name="EmployeeSheet")
df_excel_subset = df_excel[["EmpID", "Name", "Department"]]
print("\nExcel Subset:", df_excel_subset.head(), sep="\n")


# 3. Read JSON and display basic statistics
# include='all' adds the text columns (count/unique/top/freq) to the numeric summary.
df_json = pd.read_json("employees.json")
print("\nJSON Data Stats:", df_json.describe(include='all'), sep="\n")


# 4. Store records in SQLite and fetch
# The connection is closed in `finally` so the database file handle is released
# even if the write or the read-back raises; sqlite3 connections are not closed
# automatically when the name goes out of use.
conn = sqlite3.connect("employees.db")
try:
    # if_exists="replace" recreates the table, so re-running does not append duplicates.
    df_csv.to_sql("employee_data", conn, if_exists="replace", index=False)
    df_sql = pd.read_sql("SELECT * FROM employee_data", conn)
finally:
    conn.close()

# sep="\n" keeps print() from inserting a space before the frame, which would
# push the header row one column out of alignment.
print("\nSQL Data (first 5 rows):", df_sql.head(), sep="\n")


# 5. Merge CSV, Excel, JSON, SQL
# Stack the four copies of the employee table into one frame with a fresh index.
df_merged = pd.concat([df_csv, df_excel, df_json, df_sql], ignore_index=True)

# 6. Remove duplicates and count unique employees
# Rows that are identical across every column collapse to a single copy.
df_merged = df_merged.drop_duplicates()
print("\nNumber of unique employees:", df_merged["EmpID"].nunique())


# 7. Fill missing salaries with department average
# transform("mean") broadcasts each department's mean back onto its rows.
# Filling from it, rather than overwriting Salary with the transform result,
# leaves rows with a missing Department untouched: groupby drops NaN keys, so
# overwriting would replace their existing salaries with NaN.
dept_avg_salary = df_merged.groupby("Department")["Salary"].transform("mean")
df_merged["Salary"] = df_merged["Salary"].fillna(dept_avg_salary)


# 8. Convert Salary to integer
# Round first: an imputed average can be fractional and a bare astype(int)
# truncates toward zero. astype(int) still raises if a salary remains missing
# (no department average available to fill it).
df_merged["Salary"] = df_merged["Salary"].round().astype(int)

# 9. Standardize column names
# Salary already carries its final name, so it needs no entry in the mapping.
df_merged = df_merged.rename(columns={
    "EmpID": "Employee_ID",
    "Name": "Employee_Name",
    "Department": "Dept",
    "Experience": "Years_of_Experience",
})


# 10. Sort by Experience and Salary
# Most experienced first; ties are broken by the higher salary.
df_sorted = df_merged.sort_values(
    by=["Years_of_Experience", "Salary"],
    ascending=[False, False],
)
# sep="\n" keeps print() from inserting a space before the frame, which would
# push the header row one column out of alignment.
print("\nTop Employees (by experience & salary):", df_sorted.head(), sep="\n")


# 11. Write the cleaned, sorted table to CSV (row index omitted).
df_sorted.to_csv(path_or_buf="cleaned_employees.csv", index=False)
print("\n✅ Cleaned data exported to cleaned_employees.csv")

# 12. Write an Excel workbook: one sheet with every employee, then one sheet
# per selected department containing only that department's rows.
department_sheets = (
    ("IT", "IT_Employees"),
    ("HR", "HR_Employees"),
)
with pd.ExcelWriter("cleaned_employees.xlsx") as writer:
    df_sorted.to_excel(writer, sheet_name="All_Employees", index=False)
    for dept_code, sheet_title in department_sheets:
        dept_rows = df_sorted.loc[df_sorted["Dept"] == dept_code]
        dept_rows.to_excel(writer, sheet_name=sheet_title, index=False)

print("✅ Cleaned data exported to cleaned_employees.xlsx with multiple sheets")

# 13. Efficient processing for large datasets
# Read the file chunk_size rows at a time and accumulate a running total, so
# the whole file never has to be loaded at once. usecols restricts parsing to
# the one column the total needs.
chunk_size = 3  # small chunk example
total_salary = 0
for chunk in pd.read_csv("employees.csv", usecols=["Salary"], chunksize=chunk_size):
    total_salary += chunk["Salary"].sum()
print("\nTotal Salary calculated from chunks:", total_salary)

# Check memory usage
# deep=True measures the actual string contents of object columns rather than
# just the pointer size. NOTE: this rebinds `df`, the name the file-creation
# cell used for the original sample frame.
df = pd.read_csv("employees.csv")
# sep="\n" keeps print() from inserting a space before the Series, which would
# push its first row one column out of alignment.
print("\nMemory Usage:", df.memory_usage(deep=True), sep="\n")


CSV Data (first 5 rows):
    EmpID           Name Department  Salary  Experience
0      1     John Smith         HR   55000           5
1      2      Sarah Lee         IT   72000           7
2      3  Michael Brown    Finance   68000           6
3      4   Linda Taylor  Marketing   62000           4
4      5   David Wilson         IT   80000          10

Excel Subset:
    EmpID           Name Department
0      1     John Smith         HR
1      2      Sarah Lee         IT
2      3  Michael Brown    Finance
3      4   Linda Taylor  Marketing
4      5   David Wilson         IT

JSON Data Stats:
            EmpID        Name Department        Salary  Experience
count   10.00000          10         10     10.000000    10.00000
unique       NaN          10          4           NaN         NaN
top          NaN  John Smith         IT           NaN         NaN
freq         NaN           1          3           NaN         NaN
mean     5.50000         NaN        NaN  68700.000000     6.90000
std