In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy import stats

# Sample set 1: paired distance / velocity measurements (20 points each).
# NOTE(review): units are not stated in this file — presumably Mpc and km/s,
# confirm against the data source.
distance_1 = np.array([
    27.33865499, 67.29430297, 41.82178098, 58.39356761,
    92.61319068, 38.61317757, 57.7139236, 87.32529571,
    96.40015232, 66.37223802, 87.51446239, 25.48450751,
    51.01070577, 79.28980395, 35.00420194, 21.94661777,
    80.37865027, 56.41706118, 25.71494299, 92.79656478,
])
velocity_1 = np.array([
    1842.357949, 4737.631086, 2984.032466, 4010.421125,
    6402.961282, 2708.661893, 4068.202844, 6060.924699,
    6792.884249, 4652.245702, 6126.352559, 1766.86552,
    3572.450529, 5566.567632, 2425.987678, 1498.659551,
    5617.102174, 4023.197373, 1794.528052, 6511.055124,
])

# Sample set 2 is an alias of set 1 (the very same array objects, not copies).
distance_2, velocity_2 = distance_1, velocity_1

# Sample set 3: an independent set of 20 paired measurements.
distance_3 = np.array([
    88.7144659, 47.98818881, 56.19079122, 26.50219367,
    35.57116948, 64.89797418, 90.16201494, 56.65837041,
    89.48322569, 82.69194642, 77.49279864, 25.68075188,
    17.94676025, 26.6265791, 61.25172857, 24.16249983,
    85.62747488, 82.49366569, 75.63146929, 90.12016586,
])
velocity_3 = np.array([
    6276.805536, 3379.364009, 3881.284046, 1829.414402,
    2496.882592, 4525.791526, 6289.84343, 3978.733072,
    6403.438857, 5710.467844, 5368.381026, 1782.030891,
    1271.583113, 1801.607553, 4322.664707, 1666.71876,
    5988.714796, 5719.472846, 5326.790587, 6379.074542,
])

def _fit_slope(frame):
    """Return the fitted slope of 'Velocity' regressed on 'Distance' in *frame*."""
    model = LinearRegression()
    # scikit-learn expects a 2-D feature matrix, hence the (-1, 1) reshape.
    model.fit(frame['Distance'].values.reshape(-1, 1), frame['Velocity'].values)
    return model.coef_[0]


def calculate_hubble_constant(distance, velocity, z_threshold=3):
    """Estimate the velocity-vs-distance slope with and without outliers.

    A linear regression of velocity on distance is fitted twice: once on all
    points and once after discarding every point whose absolute z-score
    exceeds ``z_threshold`` in either the distance or the velocity column.
    The regression uses scikit-learn's default ``LinearRegression`` settings,
    so the reported slope comes from a fit that also estimates an intercept.

    Parameters
    ----------
    distance : array-like
        One-dimensional distance values.
    velocity : array-like
        One-dimensional velocity values, paired element-wise with ``distance``.
    z_threshold : float, optional
        Absolute z-score above which a point is treated as an outlier.
        Defaults to 3.

    Returns
    -------
    dict
        Slope, mean velocity and velocity variance (each rounded to two
        decimals) for the full and for the outlier-free data, plus
        "Outliers Indexes": a sorted array of the unique row positions that
        were removed.

    Notes
    -----
    Z-scores are computed separately per column, i.e. against each column's
    own mean and standard deviation; they do not measure deviation from the
    fitted line. ``np.var`` is called with its defaults (population variance).
    """
    data = pd.DataFrame({'Distance': distance, 'Velocity': velocity})

    hubble_constant = _fit_slope(data)
    velocities = data['Velocity'].values
    mean_velocity = np.mean(velocities)
    variance_velocity = np.var(velocities)

    # Column-wise z-scores on a plain ndarray so the result is positional
    # regardless of whatever index the inputs carried.
    z_scores = np.abs(
        stats.zscore(data[['Distance', 'Velocity']].to_numpy(), axis=0)
    )
    # A row is an outlier if *any* of its columns is beyond the threshold.
    # Reducing per row (instead of np.where on the 2-D matrix) guarantees each
    # row is reported once even when both columns are extreme.
    is_outlier = (z_scores > z_threshold).any(axis=1)
    outlier_rows = np.flatnonzero(is_outlier)

    # Boolean masking is positional, so it stays correct even when the inputs
    # are pandas Series with a non-default index.
    cleaned_data = data.loc[~is_outlier]

    hubble_constant_cleaned = _fit_slope(cleaned_data)
    velocities_cleaned = cleaned_data['Velocity'].values
    mean_velocity_cleaned = np.mean(velocities_cleaned)
    variance_velocity_cleaned = np.var(velocities_cleaned)

    return {
        "Hubble Constant (With Outliers)": round(hubble_constant, 2),
        "Hubble Constant (Without Outliers)": round(hubble_constant_cleaned, 2),
        "Mean (With Outliers)": round(mean_velocity, 2),
        "Variance (With Outliers)": round(variance_velocity, 2),
        "Mean (Without Outliers)": round(mean_velocity_cleaned, 2),
        "Variance (Without Outliers)": round(variance_velocity_cleaned, 2),
        "Outliers Indexes": outlier_rows
    }

# Run the analysis once per sample set.
results_1 = calculate_hubble_constant(distance_1, velocity_1)
results_2 = calculate_hubble_constant(distance_2, velocity_2)
results_3 = calculate_hubble_constant(distance_3, velocity_3)

# Print every metric of every section; sections after the first are preceded
# by a blank line (the leading "\n" in their title).
section_reports = (
    ("Section 1 Results:", results_1),
    ("\nSection 2 Results:", results_2),
    ("\nSection 3 Results:", results_3),
)
for title, section in section_reports:
    print(title)
    for key, value in section.items():
        print(f"{key}: {value}")


Section 1 Results:
Hubble Constant (With Outliers): 70.18
Hubble Constant (Without Outliers): 70.18
Mean (With Outliers): 4158.15
Variance (With Outliers): 3015775.26
Mean (Without Outliers): 4158.15
Variance (Without Outliers): 3015775.26
Outliers Indexes: []

Section 2 Results:
Hubble Constant (With Outliers): 70.18
Hubble Constant (Without Outliers): 70.18
Mean (With Outliers): 4158.15
Variance (With Outliers): 3015775.26
Mean (Without Outliers): 4158.15
Variance (Without Outliers): 3015775.26
Outliers Indexes: []

Section 3 Results:
Hubble Constant (With Outliers): 70.56
Hubble Constant (Without Outliers): 70.56
Mean (With Outliers): 4219.95
Variance (With Outliers): 3237907.49
Mean (Without Outliers): 4219.95
Variance (Without Outliers): 3237907.49
Outliers Indexes: []
