In [4]:
from random import randint, choice

# Map Reduce

In the part of the assignment you are requested to use Map Reduce paradigm to solve the following exercises.

**NOTE THAT**: **A solution that does not use map reduce is not valid!**

# Exercise 1

You have a list of dictionaries, each representing a student with the following properties: a name and an array of test scores. Your task is to use map, filter, and reduce to calculate the average test score for each student, and then return a list of dictionaries containing only the students whose average score is above 90.

In [None]:
students = [
    {"name": "Alice", "scores": [95, 92, 88, 100]},
    {"name": "Bob", "scores": [78, 81, 85, 80]},
    {"name": "Charlie", "scores": [99, 91, 94, 96]},
    {"name": "Diana", "scores": [85, 87, 89, 83]}
]

Use `map`, `reduce` and `filter` that produce an output like:

In [None]:
[
    {"name": "Alice", "average_score": 93.75},
    {"name": "Charlie", "average_score": 95.0}
]

[{'name': 'Alice', 'average_score': 93.75},
 {'name': 'Charlie', 'average_score': 95.0}]

### Test
Test your solution using the dataset generated by the following function.

In [56]:
def generate_random_student_dataset(num_students=50):
    names = [f"Student {i}" for i in range(1, num_students + 1)]
    dataset = [
        {
            "name": name,
            "scores": [randint(50, 100) for _ in range(randint(3, 6))]  # Random scores between 50 and 100
        }
        for name in names
    ]
    return dataset

random_student_dataset = generate_random_student_dataset(50)
random_student_dataset[:3]

[{'name': 'Student 1', 'scores': [83, 85, 67]},
 {'name': 'Student 2', 'scores': [83, 100, 85, 100, 94]},
 {'name': 'Student 3', 'scores': [100, 61, 75]}]

In [64]:
# your code goes here

from functools import reduce


def avg(student):
    scores = student['scores']
    avg_score = reduce(lambda x, y: x + y, scores) / len(scores)
    return {"name": student["name"], "average_score": avg_score}

avgs = list(map(avg, random_student_dataset))
up90 = list(filter(lambda student: student['average_score'] > 90, avgs))

print(*up90, sep = "\n")
    

{'name': 'Student 2', 'average_score': 92.4}
{'name': 'Student 16', 'average_score': 96.66666666666667}


## Exercise 2

You have a list of dictionaries, each representing a product with the following properties: name, price, and category. Using the functions `map`, `filter`, and `reduce`, calculate the average price of the products in each category and return a list of dictionaries containing only the categories where the average price exceeds 50.

Example input:

In [None]:
products = [
    {"name": "Product A", "price": 60, "category": "Electronics"},
    {"name": "Product B", "price": 40, "category": "Electronics"},
    {"name": "Product C", "price": 70, "category": "Home"},
    {"name": "Product D", "price": 30, "category": "Home"},
    {"name": "Product E", "price": 90, "category": "Sports"}
]

Use `map`, `reduce` and `filter` that produce an output like:

In [None]:
[
    {"category": "Electronics", "average_price": 50.0},
    {"category": "Sports", "average_price": 90.0}
]

[{'category': 'Electronics', 'average_price': 50.0},
 {'category': 'Sports', 'average_price': 90.0}]

### Test
Test your solution using the dataset generated by the following function.

In [65]:
def generate_random_product_dataset(num_products=100):
    categories = ["Electronics", "Home", "Sports", "Books", "Clothing", "Toys"]
    dataset = [
        {
            "name": f"Product {i}",
            "price": randint(10, 200),  # Random price between 10 and 200
            "category": choice(categories),  # Randomly choose a category
        }
        for i in range(1, num_products + 1)
    ]
    return dataset

# Example of using the function
random_dataset = generate_random_product_dataset(100)
random_dataset[:5]  # Display the first 5 entries to check the dataset structure


[{'name': 'Product 1', 'price': 53, 'category': 'Sports'},
 {'name': 'Product 2', 'price': 73, 'category': 'Clothing'},
 {'name': 'Product 3', 'price': 167, 'category': 'Books'},
 {'name': 'Product 4', 'price': 177, 'category': 'Electronics'},
 {'name': 'Product 5', 'price': 76, 'category': 'Sports'}]

In [69]:
# your code goes here
# hints: 1) Group products by category (you don't need to use map reduce for this part), then 2) use map reduce paradigm to
# calculate the average price for each category and filter categories with an average price > 50

from collections import defaultdict


categ = defaultdict(lambda: {'tot': 0, 'count': 0})
for product in random_dataset:
    categ[product['category']]['tot'] += product['price']
    categ[product['category']]['count'] += 1

avgp = list(map(lambda category: {'category': category,'average_price': round(categ[category]['tot'] / categ[category]['count'], 2) }, categ.keys()))
filtcag = list(filter(lambda x: x['average_price'] > 50, avgp))
print(*filtcag, sep= "\n")


{'category': 'Sports', 'average_price': 94.19}
{'category': 'Clothing', 'average_price': 66.33}
{'category': 'Books', 'average_price': 112.09}
{'category': 'Electronics', 'average_price': 96.48}
{'category': 'Toys', 'average_price': 101.29}
{'category': 'Home', 'average_price': 99.85}


# Exercise 3

You have a list of dictionaries, each representing an employee with the following properties: name, salary, and department. Your task is to use `map`, `filter`, and `reduce` to calculate the average salary for each department and return a list of dictionaries containing only the departments where the average salary is above 65,000.

**Example Input**

In [5]:
employees = [
    {"name": "John", "salary": 70000, "department": "Engineering"},
    {"name": "Jane", "salary": 75000, "department": "Engineering"},
    {"name": "Alice", "salary": 60000, "department": "HR"},
    {"name": "Bob", "salary": 68000, "department": "HR"},
    {"name": "Charlie", "salary": 90000, "department": "Marketing"},
    {"name": "Diana", "salary": 50000, "department": "Marketing"}
]

Use `map`, `reduce` and `filter` that produce an output like:

In [None]:
[
    {"department": "Engineering", "average_salary": 72500.0},
    {"department": "Marketing", "average_salary": 70000.0}
]

[{'department': 'Engineering', 'average_salary': 72500.0},
 {'department': 'Marketing', 'average_salary': 70000.0}]

### Test

Test your solution using the dataset generated by the following function.

In [71]:
def generate_random_employee_dataset(num_employees=50):
    departments = ["Engineering", "HR", "Marketing", "Sales", "Finance", "IT"]
    dataset = [
        {
            "name": f"Employee {i}",
            "salary": randint(40000, 120000),  # Random salary between 40,000 and 120,000
            "department": choice(departments)  # Randomly choose a department
        }
        for i in range(1, num_employees + 1)
    ]
    return dataset

random_employee_dataset = generate_random_employee_dataset(50)

random_employee_dataset[:3]  # Display the first 3 entries of each dataset for checking


[{'name': 'Employee 1', 'salary': 52672, 'department': 'Finance'},
 {'name': 'Employee 2', 'salary': 112732, 'department': 'IT'},
 {'name': 'Employee 3', 'salary': 95609, 'department': 'Marketing'}]

In [74]:
# your code goes here
# hints: 1) Group employees' salaries by department (you don't need to use map reduce for this part), then 2) use map reduce paradigm to
# calculate the average salary for each department and filter departments with an average salary > threshold

from collections import defaultdict


categ = defaultdict(lambda: {'tot': 0, 'count': 0})
for sal in random_employee_dataset:
    categ[sal['department']]['tot'] += sal['salary']
    categ[sal['department']]['count'] += 1

avgp = list(map(lambda department: {'department': department,'average_salary': round(categ[department]['tot'] / categ[department]['count'], 2) }, categ.keys()))
filtcag = list(filter(lambda x: x['average_salary'] > 65000, avgp))
print(*filtcag, sep= "\n")

{'department': 'Finance', 'average_salary': 90905.5}
{'department': 'IT', 'average_salary': 91148.9}
{'department': 'Marketing', 'average_salary': 94724.33}
{'department': 'Engineering', 'average_salary': 83860.0}
{'department': 'HR', 'average_salary': 84892.11}
{'department': 'Sales', 'average_salary': 71395.12}


# Biopython

Write the following five functions to analyze global alignments between two sequences using Biopython's `pairwise2` module:

1. **countMatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment (pairwise2.globalxx) of the same length. It returns the number of positions where the elements of both sequences match.

2. **countMismatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of positions where the elements of the two sequences are different (i.e., they are not gaps, and the characters do not match).

3. **countGapOpens(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap openings in the alignment (a gap is opened when a '-' appears in the sequence).

4. **countGapExtensions(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap extensions (where '-' continues in the alignment after an initial gap is opened).

5. **getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment and returns the alignment score based on the provided scoring scheme: `matchScore` for matches, `mismatchPenalty` for mismatches, `gapOpenPenalty` for opening a gap, and `gapExtensionPenalty` for extending a gap.

In [None]:
# Add your functions here


from Bio import pairwise2

#1
def countMatches(s1, s2):
    matches = sum(1 for a, b in zip(s1, s2) if a == b)
    return matches

#2
def countMismatches(s1, s2):
    mismatches = sum(1 for a, b in zip(s1, s2) if a != b and a!= '-' and b != '-')
    return mismatches

#3
def countGapOpens(s1, s2):
    gapopens = 0
    ingap = False
    for a, b in zip(s1, s2):
        if a == '-' and b == '-':
            if not ingap:
                gapopens += 1
                ingap = True
        else:
            ingap = False
    return gapopens

#4
def countGapExtensions(s1, s2):
    gapextensions = 0
    ingap = False
    for a, b in zip(s1, s2):
        if a == '-' or b == '-':
            if ingap:
                gapextensions += 1
            else:
                ingap = True
        else:
            ingap = False
    return gapextensions

#5
def getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty):
    score = 0
    ingap = False
    for a, b in zip(s1, s2):
        if a == '-' or b == '-':
            if ingap:
                score -= gapExtensionPenalty
            else:
                score -= gapOpenPenalty
                ingap = True
        else:
            ingap = False
            if a == b:
                score += matchScore
            else:
                score -= mismatchPenalty
    return score




### Test
Align the sequences of the [Interleukin-12](https://en.wikipedia.org/wiki/Interleukin_12) chain A (denoted as `s1`) from the file [`IL12A.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12A.fasta) and the Interleukin-12 chain B (denoted as `s2`) from the file [`IL12B.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12B.fasta) and check the score as computed from pairwise2 and from your functions.

In [81]:
# add the output of the test here

from Bio import pairwise2
from Bio import SeqIO

def getSequenceData(filename):
    record = SeqIO.read(filename, "fasta")
    return record.seq
s1 = getSequenceData("IL12A.fasta")
s2 = getSequenceData("IL12B.fasta")

align = pairwise2.align.globalxx(s1, s2)
best = align[0]
aligns1 = best[0]
aligns2 = best[1]

matchScore = 1
mismatchPenalty = -1
gapOpenPenalty = -2
gapExtensionPenalty = -1

pairwise2score = best[2]

score = getScore(aligns1, aligns2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty)
print(aligns1)
print(aligns2)
print(pairwise2score)
print(score)

MCPAR------S---L--L---LVAT---L---VL----LDHLSL----ARNLP---VA--TP-D-P---GMFPC----LHHS-QNLLRAVSNM---LQ---KARQTL-----EF-----YPCTSE-----EID---H--------ED-I--T---KD-KTSTVEACLP-----L--ELT-KNE-S----C--LNSRETSF-I-TN----------GS-------CL-A---S--R-----KT----SFMMAL--CL---S-------S--IYE---D----LKMYQVE-----F------KTMNA----K-L-LMD-P-K--RQIFLDQNMLAVIDELMQALNFN-S-E---TV---PQK-S--SLE------------EP--D--FYKT-K-----IKLCILLHAFR----I--RAVTI-DRVMSYLN--------AS----
MC---HQQLVISWFSLVFLASPLVA-IWELKKDV-YVVELD----WYPDA---PGEMV-VLT-CDTPEEDG----ITWTL---DQ------S--SEVL-GSGK---TLTIQVKEFGDAGQY--T--CHKGGE--VLSHSLLLLHKKEDGIWSTDILKDQK----E---PKNKTFLRCE--AKN-YSGRFTCWWL----T--TIST-DLTFSVKSSRGSSDPQGVTC-GAATLSAERVRGDNK-EYEYS-----VEC-QEDSACPAAEESLPI-EVMVDAVHKLK-Y--ENYTSSFFIRDIIK----PDPPKNLQL--KPLKNSR-----Q----V--E--------VSWEYPDT-WSTP--HSYFSL-TFCVQVQGKSKRE-KKDRVF--TDKTSATVI--C------RKNASISVRA---QDR---Y--YSSSWSEWASVPCS
103.0
532
