**Research for How to Implement Solution**:

On Wikipedia, I found lots of good models to figure out where to start, mainly [Angle of View](https://en.wikipedia.org/wiki/Angle_of_view). 

First, I need to find the distance from the lens to the object. This can be found by dividing half of the Y dimension of the image (in meters) by the tangent of half of the vertical field of view. To find the offsets, I need to translate all of the coordinates in the image to meters. Afterwards, each offset can be found by subtracting the image center coordinate from the object center coordinate. To calculate the yaw and pitch, I take the inverse tangent of the offset divided by the distance to find the angle.

From the model below, I have derived the formulas needed to get the distance, offsets, and yaw and pitch. 

In [2]:
import math
import sys

def pixel_to_meter(p: float, scale: float) -> float:
    ''' Convert a pixel measurement to meters.

    p: length or coordinate in pixels.
    scale: millimeters represented by one pixel.

    The pixel value is first scaled to millimeters, then divided by 1000
    to express it in meters.
    '''
    millimeters = p * scale
    return millimeters / 1000

def calc_dist(vfov_height: float, vfov: float) -> float:
    ''' Distance to the plane whose visible height is vfov_height.

    vfov_height: full height covered by the vertical field of view.
    vfov: vertical field of view, in degrees.

    Returns half of vfov_height divided by the tangent of half of vfov,
    expressed in the same unit as vfov_height.
    '''
    half_angle = math.radians(vfov / 2)
    half_height = vfov_height / 2

    return half_height / math.tan(half_angle)

def angle_of_points(p1: list, p2: list, dist: float) -> tuple:
    ''' Yaw and pitch, in degrees, between two points seen from dist away.

    p1, p2: [x, y] coordinates in the same unit as dist.
    dist: distance to the plane containing both points.

    Each angle is the inverse tangent of an offset over dist. Yaw uses the
    horizontal offset p1 - p2; pitch uses the vertical offset p2 - p1.
    '''
    horizontal_offset = p1[0] - p2[0]
    # The vertical offset is taken in the opposite direction to the
    # horizontal one. NOTE(review): presumably because image Y coordinates
    # grow downward -- confirm.
    vertical_offset = p2[1] - p1[1]

    yaw_d = math.degrees(math.atan(horizontal_offset / dist))
    pitch_d = math.degrees(math.atan(vertical_offset / dist))

    return yaw_d, pitch_d


def align_to_target(align_in: tuple) -> tuple:
    ''' Compute yaw, pitch, distance and offsets for one observed target.

    align_in is a 10-item tuple:
        (imgDimX, imgDimY, centerX, centerY, focalLength, sensorSize,
         objRealHeight, objImgHeight, hfov, vfov)
    focalLength and sensorSize are unpacked but take no part in the
    calculation. hfov and vfov are in degrees.
    NOTE(review): objRealHeight is presumably in millimeters, since
    pixel_to_meter divides by 1000 -- confirm.

    Returns (relativeYaw, relativePitch, dist, relativeYOffset,
    relativeZOffset): the two angles in degrees, followed by the distance
    and the horizontal and vertical offsets of the object center from the
    image center.
    '''
    (imgDimX, imgDimY, centerX, centerY, _focalLength, _sensorSize,
     objRealHeight, objImgHeight, hfov, vfov) = align_in

    # Real-world length per pixel, derived from the object's known height.
    # Only the vertical axis is scaled this way; the horizontal axis is
    # derived from hfov further down.
    per_pixel = objRealHeight / float(objImgHeight)

    # Vertical extent of the frame and the object's vertical position.
    frame_height = pixel_to_meter(imgDimY, per_pixel)
    frame_mid_y = frame_height / 2
    target_y = pixel_to_meter(centerY, per_pixel)

    # Distance follows from the frame height and the vertical field of view.
    dist = calc_dist(frame_height, vfov)

    # Horizontal extent of the frame at that distance, from hfov.
    frame_width = math.tan(math.radians(hfov / 2)) * dist * 2
    frame_mid_x = frame_width / 2
    target_x = centerX / imgDimX * frame_width

    # Angles between the object center and the image center.
    yaw, pitch = angle_of_points(
        [target_x, target_y], [frame_mid_x, frame_mid_y], dist
    )

    # Offsets are object center minus image center on each axis.
    y_offset = target_x - frame_mid_x
    z_offset = target_y - frame_mid_y

    return (yaw, pitch, dist, y_offset, z_offset)


def run():
    ''' Print the expected and the calculated result for each sample input. '''

    # Sample inputs, one tuple per case, in the order align_to_target unpacks:
    # (imgDimX, imgDimY, centerX, centerY, focalLength, sensorSize,
    #  objRealHeight, objImgHeight, hfov, vfov)
    align_in = [
        (512, 341, 300, 150, 8, 8.8, 1524, 80, 77.3, 62),
        (512, 384, 200, 100, 8, 8.8, 250, 100, 135, 122),
        (5472, 3648, 2000, 150, 8, 8.8, 1524, 80, 77.3, 62),
        (5472, 3648, 3583, 4014, 8, 8.8, 1219.2, 1000, 77.3, 62),
    ]

    # Expected outputs, in the same order as the inputs above. They are only
    # printed next to the calculated values, not asserted.
    align_out = [
        (8, 4, 6, 1, 0),
        (-28, 41, 1, 0, -1),
        (-12, 29, 63, -13, -31),
        (14, -36, 4, 1, 2),
    ]

    # Walk inputs and expectations side by side.
    for sample, expected in zip(align_in, align_out):
        print(f'\nExpected output for {sample}: {expected}')
        print(f'Calculated: {align_to_target(sample)}')


# Execute the sample cases so the expected/calculated comparison is printed.
run()


Expected output for (512, 341, 300, 150, 8, 8.8, 1524, 80, 77.3, 62): (8, 4, 6, 1, 0)
Calculated: (7.82636346254104, 4.132111134128282, 5.405621365661543, 0.7430121352548786, -0.39052500000000023)

Expected output for (512, 384, 200, 100, 8, 8.8, 250, 100, 135, 122): (-28, 41, 1, 0, -1)
Calculated: (-27.83894716546775, 40.84143230469546, 0.26606834469732915, -0.14051314512516122, -0.22999999999999998)

Expected output for (5472, 3648, 2000, 150, 8, 8.8, 1524, 80, 77.3, 62): (-12, 29, 63, -13, -31)
Calculated: (-12.140953695817277, 28.874439525681517, 57.82905202912994, -12.440715762568495, -31.889700000000005)

Expected output for (5472, 3648, 3583, 4014, 8, 8.8, 1219.2, 1000, 77.3, 62): (14, -36, 4, 1, 2)
Calculated: (13.905341460603939, -35.80773710248819, 3.7010593298643153, 0.916285760947436, 2.670048)
