<a href="https://colab.research.google.com/github/Subramaniya-pillai/data_engineering/blob/main/python_coding_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Section 0: Create CSV files
import pandas as pd


In [8]:
# Build the employee table row by row and persist it as employees.csv
employee_columns = ["EmployeeID", "Name", "Department", "Salary", "JoiningDate"]
employee_rows = [
    (1, "Ali", "HR", 50000, "2021-03-15"),
    (2, "Neha", "IT", 60000, "2022-01-10"),
    (3, "Ravi", "Finance", 55000, "2020-07-23"),
    (4, "Sara", "IT", 70000, "2023-05-19"),
    (5, "Vikram", "HR", 52000, "2022-09-30"),
]
df_employees = pd.DataFrame(employee_rows, columns=employee_columns)

# index=False keeps the positional index out of the file
df_employees.to_csv("employees.csv", index=False)

In [9]:
# Build the project-allocation table row by row and persist it as projects.csv
project_columns = ["ProjectID", "EmployeeID", "ProjectName", "HoursAllocated"]
project_rows = [
    (101, 2, "AI Chatbot", 120),
    (102, 3, "ERP System", 200),
    (103, 3, "Payroll Automation", 150),
    (104, 5, "Cloud Migration", 100),
]
df_projects = pd.DataFrame(project_rows, columns=project_columns)
# index=False keeps the positional index out of the file
df_projects.to_csv("projects.csv", index=False)

In [10]:
# --- Section 1: Python Basics & Control Flow ---
# Q1. Print all odd numbers between 10 and 50
# Start at the first odd value (11) and step by 2; 50 itself is even, so it is never needed.
print("Q1:", list(range(11, 50, 2)))

Q1: [11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49]


In [11]:

# Q2. Function to check if a year is a leap year
def is_leap_year(year):
    """Return True when ``year`` is a leap year under the Gregorian rules.

    A year divisible by 400 is a leap year; any other year divisible by 100
    is not; every remaining year is a leap year only if divisible by 4.
    """
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0
print("Q2 (2024):", is_leap_year(2024))


Q2 (2024): True


In [12]:
# Q3. Count 'a' in a string
def count_a(s):
    """Return how many characters of ``s`` are a lowercase 'a'.

    The comparison is case-sensitive, so an uppercase 'A' is not counted.
    """
    return sum(1 for ch in s if ch == 'a')
print("Q3 ('banana'):", count_a("banana"))

Q3 ('banana'): 3


#  Section 2: Collections




In [13]:

# Q4. Create a dictionary from two lists
keys = ['a', 'b', 'c']
values = [100, 200, 300]
# Pair the lists positionally; zip stops at the shorter list.
dict_from_lists = {key: value for key, value in zip(keys, values)}
print("Q4:", dict_from_lists)

Q4: {'a': 100, 'b': 200, 'c': 300}


In [14]:
# Q5. Salary analysis
salaries = [50000, 60000, 55000, 70000, 52000]
# Arithmetic mean of the list; used as the cut-off for the "above average" filter.
avg_salary = sum(salaries) / len(salaries)
max_salary = max(salaries)
# Strictly greater than the mean; original list order is preserved.
salaries_above_avg = list(filter(lambda amount: amount > avg_salary, salaries))
sorted_desc = sorted(salaries, reverse=True)
print("Q5: Max:", max_salary, "Above Avg:", salaries_above_avg, "Sorted:", sorted_desc)


Q5: Max: 70000 Above Avg: [60000, 70000] Sorted: [70000, 60000, 55000, 52000, 50000]


In [15]:
# Q6. Sets and difference
a = [1, 2, 3, 4]
b = [3, 4, 5, 6]
set_a, set_b = set(a), set(b)
# Elements present in a but absent from b.
diff_ab = set_a.difference(set_b)
print("Q6: Difference:", diff_ab)

Q6: Difference: {1, 2}


#  Section 3: Functions & Classes


In [17]:
# Q7. Employee class
class Employee:
    """A single employee record.

    Attributes:
        emp_id: Identifier of the employee.
        name: Employee name.
        department: Department the employee belongs to.
        salary: Salary figure compared against the high-earner threshold.
    """

    # Default cut-off used by is_high_earner() when no threshold is passed.
    HIGH_EARNER_THRESHOLD = 60000

    def __init__(self, emp_id, name, department, salary):
        """Store the given identifier, name, department and salary."""
        self.emp_id = emp_id
        self.name = name
        self.department = department
        self.salary = salary

    def display(self):
        """Print a one-line summary: name, id, department and salary."""
        print(f"{self.name} ({self.emp_id}) - {self.department} - {self.salary}")

    def is_high_earner(self, threshold=None):
        """Return True when the salary is strictly above the threshold.

        Args:
            threshold: Optional cut-off. When omitted (None), the class-level
                HIGH_EARNER_THRESHOLD (60000) is used, which matches the
                original hard-coded behaviour.

        Returns:
            bool: ``self.salary > threshold``; a salary equal to the
            threshold is not a high earner.
        """
        limit = self.HIGH_EARNER_THRESHOLD if threshold is None else threshold
        return self.salary > limit

In [18]:
# Q8. Project class inheriting Employee
class Project(Employee):
    """An Employee extended with the project they are allocated to.

    Adds ``project_name`` and ``hours_allocated`` on top of the fields
    initialised by ``Employee.__init__``.
    """

    def __init__(self, emp_id, name, department, salary, project_name, hours_allocated):
        """Initialise the employee fields, then record the project details."""
        super().__init__(
            emp_id=emp_id,
            name=name,
            department=department,
            salary=salary,
        )
        self.project_name = project_name
        self.hours_allocated = hours_allocated


In [19]:
# Q9. Instantiate and check earners
e1 = Employee(1, "Ali", "HR", 50000)
e2 = Employee(2, "Neha", "IT", 60000)
e3 = Employee(4, "Sara", "IT", 70000)
print("Q9:")
# One result per line, in the order the employees were created.
for emp in (e1, e2, e3):
    print(emp.is_high_earner())


Q9:
False
False
True


#  Section 4: File Handling


In [20]:
# Q10. Write IT employee names to file
# Boolean mask selecting the rows whose Department is exactly 'IT'.
is_it_dept = df_employees['Department'] == 'IT'
it_employees = df_employees.loc[is_it_dept]
# One name per line: no index column and no header row.
it_employees['Name'].to_csv("it_employees.txt", index=False, header=False)
print("Q10: Written to it_employees.txt")

Q10: Written to it_employees.txt


In [21]:
# Q11. Read file and count words
# The file is produced by pandas.to_csv, which writes UTF-8 by default; read it
# back with the same encoding instead of relying on the platform's locale default.
with open("it_employees.txt", "r", encoding="utf-8") as file:
    text = file.read()
# str.split() with no arguments splits on any run of whitespace (including newlines).
word_count = len(text.split())
print("Q11: Word count:", word_count)


Q11: Word count: 2


# Section 5: Exception Handling

In [23]:
# Q12. Input with exception handling
try:
    num = float(input("Q12: Enter a number: "))
except ValueError:
    # float() raises ValueError for text it cannot parse as a number.
    print("Invalid input! Not a number.")
else:
    # Runs only when the conversion succeeded.
    print("Square:", num**2)

Q12: Enter a number: 5
Square: 25.0


In [24]:
# Q13. Safe division
def safe_divide(a, b):
    """Divide ``a`` by ``b`` without raising on a zero divisor.

    Returns:
        The quotient ``a / b``, or the string "Cannot divide by zero" when
        the division raises ZeroDivisionError.
    """
    try:
        quotient = a / b
    except ZeroDivisionError:
        return "Cannot divide by zero"
    return quotient
print("Q13:", safe_divide(10, 0))

Q13: Cannot divide by zero


# Section 6: Pandas – CSVs

In [25]:
# Q14. Load CSVs
# Read both files written in Section 0 back into fresh DataFrames.
employees, projects = (
    pd.read_csv(csv_name) for csv_name in ("employees.csv", "projects.csv")
)
print("Q14: Loaded CSVs")

Q14: Loaded CSVs


In [26]:
# Q15. Display first 2 rows, unique departments, avg salary
first_two_rows = employees.head(2)
# unique() keeps departments in order of first appearance.
department_names = employees['Department'].unique()
dept_mean_salary = employees.groupby('Department')['Salary'].mean()
print("Q15:")
for summary in (first_two_rows, department_names, dept_mean_salary):
    print(summary)

Q15:
   EmployeeID  Name Department  Salary JoiningDate
0           1   Ali         HR   50000  2021-03-15
1           2  Neha         IT   60000  2022-01-10
['HR' 'IT' 'Finance']
Department
Finance    55000.0
HR         51000.0
IT         65000.0
Name: Salary, dtype: float64


In [27]:
# Q16. Add TenureInYears
from datetime import datetime
current_year = datetime.now().year
# JoiningDate strings start with a four-digit year; take that prefix for the
# whole column at once. Tenure is a calendar-year difference only, so the
# month and day of joining are ignored.
join_year = employees['JoiningDate'].str[:4].astype(int)
employees['TenureInYears'] = current_year - join_year
print("Q16: Added TenureInYears")

Q16: Added TenureInYears


# Section 7: Filtering, Aggregation, Sorting

In [28]:
# Q17. Filter IT with salary > 60000
in_it = employees['Department'] == 'IT'
# Strictly greater than 60000, so a salary of exactly 60000 is excluded.
above_60k = employees['Salary'] > 60000
filtered = employees.loc[in_it & above_60k]
print("Q17:", filtered)

Q17:    EmployeeID  Name Department  Salary JoiningDate  TenureInYears
3           4  Sara         IT   70000  2023-05-19              2


In [29]:
# Q18. Group by Department
salary_by_department = employees.groupby('Department')['Salary']
# One row per department with headcount, total and mean salary.
grouped = salary_by_department.agg(['count', 'sum', 'mean'])
print("Q18:", grouped)

Q18:             count     sum     mean
Department                        
Finance         1   55000  55000.0
HR              2  102000  51000.0
IT              2  130000  65000.0


In [30]:
# Q19. Sort by salary desc
# Returns a new frame; `employees` itself keeps its original row order.
sorted_employees = employees.sort_values('Salary', ascending=False)
print("Q19:", sorted_employees)


Q19:    EmployeeID    Name Department  Salary JoiningDate  TenureInYears
3           4    Sara         IT   70000  2023-05-19              2
1           2    Neha         IT   60000  2022-01-10              3
2           3    Ravi    Finance   55000  2020-07-23              5
4           5  Vikram         HR   52000  2022-09-30              3
0           1     Ali         HR   50000  2021-03-15              4


# Section 8: Joins & Merging

In [31]:
# Q20. Merge for project allocations
# Inner join: only employees with at least one row in `projects` are kept,
# and an employee with several projects appears once per project.
merged = employees.merge(projects, on='EmployeeID', how='inner')
print("Q20:", merged)

Q20:    EmployeeID    Name Department  Salary JoiningDate  TenureInYears  \
0           2    Neha         IT   60000  2022-01-10              3   
1           3    Ravi    Finance   55000  2020-07-23              5   
2           3    Ravi    Finance   55000  2020-07-23              5   
3           5  Vikram         HR   52000  2022-09-30              3   

   ProjectID         ProjectName  HoursAllocated  
0        101          AI Chatbot             120  
1        102          ERP System             200  
2        103  Payroll Automation             150  
3        104     Cloud Migration             100  


In [32]:
# Q21. Left join to find no-project employees
left_join = employees.merge(projects, how='left', on='EmployeeID')
# A left join leaves ProjectID as NaN for employees with no matching project row.
has_no_project = left_join['ProjectID'].isna()
no_project = left_join.loc[has_no_project]
print("Q21:", no_project)

Q21:    EmployeeID  Name Department  Salary JoiningDate  TenureInYears  ProjectID  \
0           1   Ali         HR   50000  2021-03-15              4        NaN   
4           4  Sara         IT   70000  2023-05-19              2        NaN   

  ProjectName  HoursAllocated  
0         NaN             NaN  
4         NaN             NaN  


In [33]:

# Q22. Add TotalCost = HoursAllocated * (Salary / 160)
# Salary / 160 acts as an hourly rate; presumably 160 is the working hours in
# the salary period — confirm against the assignment brief.
hourly_rate = merged['Salary'] / 160
merged['TotalCost'] = merged['HoursAllocated'] * hourly_rate
cost_columns = ['EmployeeID', 'ProjectName', 'TotalCost']
print("Q22:", merged[cost_columns])


Q22:    EmployeeID         ProjectName  TotalCost
0           2          AI Chatbot    45000.0
1           3          ERP System    68750.0
2           3  Payroll Automation    51562.5
3           5     Cloud Migration    32500.0
