# Python Collection for Data Engineering

## File IO using Python

Writing to a file

In [1]:
# Create (or overwrite) example.txt with two lines of text.
# Mode "w" truncates an existing file; the with-block closes the file on exit.
with open("example.txt", mode="w") as out_file:
    out_file.writelines(["Hello, World!\n", "Python File I/O is easy!\n"])

Appending to a File

In [2]:
# Mode "a" keeps whatever the file already holds and writes at its end.
with open("example.txt", mode="a") as out_file:
    appended_line = "This line is appended.\n"
    out_file.write(appended_line)

Reading from a File

In [3]:
# Load the whole file into a single string, then echo it.
with open("example.txt") as in_file:  # mode defaults to "r" (read text)
    content = in_file.read()
print(content)


Hello, World!
Python File I/O is easy!
This line is appended.



Reading Line-by-Line

In [4]:
# Iterating over the file object yields one line at a time.
with open("example.txt") as in_file:
    for raw_line in in_file:
        # strip() trims leading/trailing whitespace, including the trailing newline
        text = raw_line.strip()
        print(text)

Hello, World!
Python File I/O is easy!
This line is appended.


## Reading Data from a CSV File into a Python List

Reading CSV Data into a List of Lists

In [5]:
import csv

# Load every row of data.csv (header included) as a list of lists of strings.
# newline="" leaves newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed correctly.
with open("data.csv", "r", newline="") as file:
    reader = csv.reader(file)
    data = list(reader)  # Materialize the rows before the file is closed

print(data)

[['Name', 'Age', 'City'], ['Alice', '30', 'New York'], ['Bob', '25', 'Los Angeles'], ['Charlie', '35', 'Chicago']]


Skipping the Header Row

In [6]:
import csv

# Load only the data rows of data.csv, discarding the header line.
# newline="" leaves newline handling to the csv module, as its documentation requires.
with open("data.csv", "r", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header; the None default avoids StopIteration on an empty file
    data = list(reader)

print(data)

[['Alice', '30', 'New York'], ['Bob', '25', 'Los Angeles'], ['Charlie', '35', 'Chicago']]


## Processing Python Lists

Creating and Accessing List Elements

In [7]:
# Each inner list is one record; every field is stored as a string.
data = [
    ['Alice', '30', 'New York'],
    ['Bob', '25', 'Los Angeles'],
    ['Charlie', '35', 'Chicago'],
]

# Index once for a whole row, twice for a single field inside a row.
first_row = data[0]
second_row_age = data[1][1]
print(first_row)
print(second_row_age)

['Alice', '30', 'New York']
25


Slicing Lists

In [8]:
# A slice that stops at index 2 yields the rows at positions 0 and 1.
print(data[0:2])


[['Alice', '30', 'New York'], ['Bob', '25', 'Los Angeles']]


Adding Elements to a List

In [9]:
# append() mutates data in place, placing the new record after the last row.
new_row = ['Dave', '40', 'Boston']
data.append(new_row)
print(data)


[['Alice', '30', 'New York'], ['Bob', '25', 'Los Angeles'], ['Charlie', '35', 'Chicago'], ['Dave', '40', 'Boston']]


Removing Elements from a List

In [10]:
# remove() deletes the first row equal to the given list (ValueError if absent).
row_to_drop = ['Bob', '25', 'Los Angeles']
data.remove(row_to_drop)
print(data)


[['Alice', '30', 'New York'], ['Charlie', '35', 'Chicago'], ['Dave', '40', 'Boston']]


List Comprehensions for Processing Data

In [11]:
# Project the first field of every record.
names = [record[0] for record in data]
print(names)

# Keep the records whose second field, converted from string to int, exceeds 30.
older_than_30 = [record for record in data if int(record[1]) > 30]
print(older_than_30)


['Alice', 'Charlie', 'Dave']
[['Charlie', '35', 'Chicago'], ['Dave', '40', 'Boston']]


Aggregating List Data

In [12]:
# The age field is stored as text, so convert it before doing arithmetic.
ages = [int(record[1]) for record in data]
total_age = sum(ages)
print("Total Age:", total_age)                 # Sum of ages
print("Average Age:", total_age / len(ages))   # Average of ages (true division -> float)


Total Age: 105
Average Age: 35.0


### Example Workflow - read CSV data as list
* Let’s combine all of these operations in a simple workflow.

In [13]:
import csv

# Read the data rows of data.csv into a list of lists.
# newline="" leaves newline handling to the csv module, as its documentation requires.
with open("data.csv", "r", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header; the None default avoids StopIteration on an empty file
    data = list(reader)

# Process data
# 1. Add a new entry
data.append(['Eve', '28', 'Miami'])

# 2. Keep only entries whose age is 30 or more (the age field is a string, hence int())
filtered_data = [row for row in data if int(row[1]) >= 30]

# 3. Extract just the names
names = [row[0] for row in filtered_data]

print("Names:", names)
print(data)
# Print the rows one per line
for x in data:
    print(x)

Names: ['Alice', 'Charlie']
[['Alice', '30', 'New York'], ['Bob', '25', 'Los Angeles'], ['Charlie', '35', 'Chicago'], ['Eve', '28', 'Miami']]
['Alice', '30', 'New York']
['Bob', '25', 'Los Angeles']
['Charlie', '35', 'Chicago']
['Eve', '28', 'Miami']


## Lambda Functions in Python

1. Basic Lambda Function


In [14]:
# A lambda is an anonymous single-expression function; binding it to a name
# lets it be called like any other function.
add_lambda = lambda x, y: x + y

total = add_lambda(3, 5)
print(total)

8


2. Lambda Function with One Argument

In [15]:
# One-argument lambda: raises its input to the second power.
square = lambda x: pow(x, 2)
four_squared = square(4)
print(four_squared)

16


3. Lambda Function with No Arguments

In [16]:
# A lambda may take no parameters at all; this one always returns the same text.
greet = lambda: "Hello, World!"
greeting = greet()
print(greeting)

Hello, World!


4. Lambda in List Comprehensions

In [None]:
numbers = [1, 2, 3, 4, 5]
# Apply the squaring lambda to each odd number while the list is being built.
# The lambda has to be *called* with the loop value n: a comprehension that only
# creates lambdas produces a list of functions that never look at n.
squared_odd_numbers = [(lambda x: x**2)(n) for n in numbers if n % 2 != 0]
print(squared_odd_numbers)

5. Lambda for Multiple Arguments

In [22]:
# Three-argument lambda that delegates to the built-in max().
max_of_three = lambda x, y, z: max((x, y, z))
largest = max_of_three(10, 20, 15)
print(largest)

20


## Usage of Lambda Functions
1. Using map() with Lambda
2. Using filter() with Lambda
3. Using sorted() with Lambda
4. Using reduce() with Lambda
5. Lambda Functions Inside Other Functions

In [17]:
# 1. Using map() with Lambda
numbers = [1, 2, 3, 4, 5]
# map() returns a lazy iterator; list() drives it and collects the results.
square_each = map(lambda x: x ** 2, numbers)
squared = list(square_each)
print(squared)

[1, 4, 9, 16, 25]


Filter Data in Python Lists using filter and lambda

In [18]:
# 2. Using filter() with Lambda
numbers = [1, 2, 3, 4, 5, 6]
# filter() keeps only the items for which the predicate returns a true value.
is_even = lambda x: x % 2 == 0
evens = list(filter(is_even, numbers))
print(evens)

[2, 4, 6]


In [19]:
# 3. Using sorted() with Lambda
points = [(2, 3), (1, 2), (3, 1)]
# The key lambda picks element 1 of each tuple, so ordering is by the second value.
second_value = lambda x: x[1]
sorted_points = sorted(points, key=second_value)
print(sorted_points)

[(3, 1), (1, 2), (2, 3)]


In [20]:
# 4. Using reduce() with Lambda
from functools import reduce

numbers = [1, 2, 3, 4]
# reduce() folds the list left to right: ((1 * 2) * 3) * 4
multiply = lambda x, y: x * y
product = reduce(multiply, numbers)
print(product)

24


In [21]:
# 5. Lambda Functions Inside Other Functions
strings = [
    "apple",
    "banana",
    "cherry",
    "date",
]
# sorted() calls the key lambda once per item; equal-length strings keep
# their original relative order because the sort is stable.
sorted_strings = sorted(strings, key=lambda s: len(s))
print(sorted_strings)

['date', 'apple', 'banana', 'cherry']


## Get unique values from a list using set

In [2]:
my_list = [1, 2, 3, 2, 1, 4, 5, 4]
# A set keeps a single copy of each distinct value.
unique_values = set(my_list)
print(unique_values)
# Convert back to a list when list operations are needed; a set has no defined order.
unique_as_list = list(unique_values)
print(unique_as_list)

{1, 2, 3, 4, 5}
[1, 2, 3, 4, 5]


## Sort Python lists using key

In [None]:
# list.sort() reorders the list in place, ascending by default.
numbers = [1, 5, 3, 2, 8, 5, 4, 9, 1, 3]
numbers.sort()
print(numbers)

# reverse=True gives descending order, still in place.
number = [2, 9, 6, 4, 8, 3, 6]
number.sort(reverse=True)
print(number)

# sorted() builds and returns a new list; num itself is left untouched.
num = [4, 6, 4, 3, 9, 7, 3, 1]
ascending = sorted(num)
descending = sorted(num, reverse=True)
print(ascending)
print(descending)

[1, 1, 2, 3, 3, 4, 5, 5, 8, 9]
[9, 8, 6, 6, 4, 3, 2]
[1, 3, 3, 4, 4, 6, 7, 9]
[9, 7, 6, 4, 4, 3, 3, 1]


In [17]:
# Sort with key: the key callable is applied to each item and the results are compared.
my_list = [(1, 3), (4, 2), (2, 1)]

# Order the tuples by their second element.
by_second = lambda x: x[1]
sorted_list = sorted(my_list, key=by_second)
print(sorted_list)

# Built-in len as the key: shortest strings first.
strs = ['a', 'eeee', 'cc', 'dd', 'r']
sorted_l = sorted(strs, key=len)
print(sorted_l)

# str.lower as the key: ordering ignores letter case.
strs = ['a', 'AA', 'cc', 'SSS', 'r', 'AAA', 'Z', 'RER']
sorte_l = sorted(strs, key=str.lower)
print(sorte_l)

[(2, 1), (4, 2), (1, 3)]
['a', 'r', 'cc', 'dd', 'eeee']
['a', 'AA', 'AAA', 'cc', 'r', 'RER', 'SSS', 'Z']


We can pass our own function as the key function.

In [26]:
def myFun(strs):
    """Return the last character of ``strs`` for use as a sort key.

    Args:
        strs: The single string being ranked (one element of the list
            that is being sorted).

    Returns:
        The final character of ``strs``, or "" when ``strs`` is empty.
    """
    # Guard the empty string: indexing it with [-1] would raise IndexError.
    return strs[-1] if strs else ""


# Sorting strings based on their last character. Characters compare by code
# point, so uppercase endings sort before lowercase ones.
strs=['av','Gc','WWa','Sb','Dz','sA','sR','Sr','GSt','gSt']
print(sorted(strs,key=myFun))

['sA', 'sR', 'WWa', 'Sb', 'Gc', 'Sr', 'GSt', 'gSt', 'av', 'Dz']


# <center>  **JSON** </center>

## Read JSON Strings to Python dicts or lists

In [None]:
import json

jsonstring = '{ "id": 121, "name": "Naveen", "course": "MERN Stack"}'
# json.loads parses JSON text; a JSON object comes back as a Python dict.
student_data = json.loads(jsonstring)

print(student_data)
print(type(student_data))
# Individual fields are read with ordinary dict key lookups.
for field in ('name', 'course'):
    print(student_data[field])



{'id': 121, 'name': 'Naveen', 'course': 'MERN Stack'}
<class 'dict'>
Naveen
MERN Stack


In [42]:
# read as dict
import json

# json.load parses the open file; this file's top-level JSON object becomes a dict.
# JSON files are UTF-8 by convention, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('data_list.json', 'r', encoding='utf-8') as data_set:
    data_read = json.load(data_set)
    print(data_read)
    print(type(data_read))
    print(data_read["age"])

{'name': 'Jane Smith', 'age': 18, 'grade': 12, 'subjects': ['Physics', 'Chemistry', 'Biology', 'Computer Science'], 'marks': {'Physics': 95, 'Chemistry': 88, 'Biology': 92, 'Computer Science': 98}}
<class 'dict'>
18


In [46]:
# read as list for multiple data present in json 
# This is also example for nested json
import json

# json.load parses the open file; this file's top-level JSON array becomes a list.
# JSON files are UTF-8 by convention, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('data.json', 'r', encoding='utf-8') as data_set:
    data_read = json.load(data_set)
    print(data_read)
    print(type(data_read))
    # Nested access: second record in the list, then its "subjects" entry.
    print(data_read[1]["subjects"])


[{'name': 'John Doe', 'age': 20, 'grade': 10, 'subjects': ['Math', 'Science', 'English', 'History'], 'marks': {'Math': 85, 'Science': 92, 'English': 78, 'History': 88}}, {'name': 'Jane Smith', 'age': 18, 'grade': 12, 'subjects': ['Physics', 'Chemistry', 'Biology', 'Computer Science'], 'marks': {'Physics': 95, 'Chemistry': 88, 'Biology': 92, 'Computer Science': 98}}]
<class 'list'>
['Physics', 'Chemistry', 'Biology', 'Computer Science']


Convert python dict to JSON

In [51]:
import json

dictionary_value = {"id": 3, "name": "Raja", "dept": ["IT", "ICT"]}
print(type(dictionary_value))

# json.dumps serializes to a str; indent=4 pretty-prints with four-space nesting.
json_obj = json.dumps(dictionary_value, indent=4)
print(json_obj)
print(type(json_obj))

<class 'dict'>
{
    "id": 3,
    "name": "Raja",
    "dept": [
        "IT",
        "ICT"
    ]
}
<class 'str'>


In [52]:
import json
dict_val = {"name": "ravi", "roll": 345, "mail": "ravi@examplr.com"}

# json.dump serializes the dict straight into the open file object.
with open("write_json.json", mode='w') as out:
    json.dump(dict_val, out)

In [59]:
# To make it more readable
import json
# Despite its name, dict_val holds JSON text here, not a dict.
dict_val = '{"name":"ravi", "roll": 345,"mail": "ravi@examplr.com"}'

# Parse the text, then re-serialize it indented and with keys in sorted order.
emp_dict = json.loads(dict_val)
pretty_json = json.dumps(emp_dict, indent=4, sort_keys=True)
print(pretty_json)
# To save this readable form to a file, pass emp_dict (not the raw string
# dict_val) to json.dump with the same indent/sort_keys options.

{
    "mail": "ravi@examplr.com",
    "name": "ravi",
    "roll": 345
}
