In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
# Seed NumPy's legacy global RNG so that any code drawing from np.random
# produces the same values on every Restart & Run All.
np.random.seed(17)

In [6]:
# Path to the raw homework dataset; it is relative, so it resolves against
# the working directory the notebook kernel was started in.
data_path = Path('../data/raw/outliers_homework.csv')
# Read the CSV into the frame that the cells below work on.
df = pd.read_csv(data_path)
# Preview the first rows as a sanity check on the loaded columns.
df.head()

Unnamed: 0,date,daily_return,daily_return_2
0,2022-01-03,0.001263,0.003834
1,2022-01-04,-0.020046,-0.009506
2,2022-01-05,0.004739,-0.000535
3,2022-01-06,0.009953,0.012539
4,2022-01-07,0.008872,0.00984


In [7]:
def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Flag values lying outside the Tukey fences of ``series``.

    The fences sit ``k`` interquartile ranges below the first quartile and
    above the third quartile; a larger ``k`` flags fewer points.

    Parameters
    ----------
    series : pd.Series
        Numeric observations to screen.
    k : float, default 1.5
        Fence multiplier applied to the interquartile range.

    Returns
    -------
    pd.Series
        Boolean mask aligned with ``series``; True marks a value strictly
        below the lower fence or strictly above the upper fence.

    Notes
    -----
    Assumes the quartiles give a reasonable summary of the distribution.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    fence_width = k * (third_quartile - first_quartile)
    too_low = series.lt(first_quartile - fence_width)
    too_high = series.gt(third_quartile + fence_width)
    return too_low | too_high

def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
    """Flag values whose absolute z-score exceeds ``threshold``.

    Values are standardized with the series mean and the population standard
    deviation (``ddof=0``).

    Parameters
    ----------
    series : pd.Series
        Numeric observations to screen.
    threshold : float, default 3.0
        Cut-off on ``|z|``; only values strictly above it are flagged.

    Returns
    -------
    pd.Series
        Boolean mask aligned with ``series``; True marks a flagged value.

    Notes
    -----
    Assumes a roughly normal distribution; the rule is sensitive to heavy
    tails.
    """
    center = series.mean()
    spread = series.std(ddof=0)
    if spread == 0:
        # A constant series has zero spread; divide by 1 so every z-score is
        # 0 instead of triggering a division by zero.
        spread = 1.0
    standardized = (series - center) / spread
    return standardized.abs() > threshold

In [12]:
# Column whose extreme values are screened; later cells reuse `target_col`.
target_col = 'daily_return'

# Attach one boolean flag column per detection rule (True = flagged as outlier).
df['outlier_iqr'] = detect_outliers_iqr(df[target_col])
df['outlier_z'] = detect_outliers_zscore(df[target_col], threshold=3.0)

# The mean of a boolean column is the share of True values, i.e. the fraction
# of rows flagged by each rule. A bare expression is only rendered when it is
# the last statement of a cell, so display it explicitly rather than letting
# the result be computed and silently thrown away.
flagged_fraction = df[['outlier_iqr', 'outlier_z']].mean()
display(flagged_fraction)

# Preview the frame with the new flag columns as the cell's output.
df.head()

Unnamed: 0,date,daily_return,daily_return_2,outlier_iqr,outlier_z
0,2022-01-03,0.001263,0.003834,False,False
1,2022-01-04,-0.020046,-0.009506,False,False
2,2022-01-05,0.004739,-0.000535,False,False
3,2022-01-06,0.009953,0.012539,False,False
4,2022-01-07,0.008872,0.00984,False,False


In [14]:
# Mean / median / std of the target column, once over every row and once over
# only the rows the IQR rule did not flag. describe() labels the median '50%',
# so that label is renamed for readability.
summ_all, summ_filtered = (
    sample.describe().loc[['mean', '50%', 'std']].rename({'50%': 'median'})
    for sample in (df[target_col], df.loc[~df['outlier_iqr'], target_col])
)

# Side-by-side table: one column per sample, one row per statistic.
comp = pd.concat({'all': summ_all, 'filtered_iqr': summ_filtered}, axis=1)
comp

Unnamed: 0,all,filtered_iqr
mean,-0.001434,-3.9e-05
median,-0.000187,-0.0001
std,0.040579,0.009443


In [17]:
# Single predictor used to explain `target_col`.
x = 'daily_return_2'

# Feature matrix (2-D) and target vector (1-D), for every row and for the rows
# that the IQR rule did not flag.
X_all, y_all = df[[x]].to_numpy(), df[target_col].to_numpy()
X_filtered, y_filtered = (
    df.loc[~df['outlier_iqr'], [x]].to_numpy(),
    df.loc[~df['outlier_iqr'], target_col].to_numpy(),
)

# One ordinary-least-squares fit per sample.
model_all = LinearRegression().fit(X_all, y_all)
model_flt = LinearRegression().fit(X_filtered, y_filtered)

# In-sample mean absolute error: each model is scored on the data it was fit on.
mae_all = mean_absolute_error(y_all, model_all.predict(X_all))
mae_flt = mean_absolute_error(y_filtered, model_flt.predict(X_filtered))

# One row per fitted model, one column per reported quantity.
results = pd.DataFrame(
    [
        {
            'slope': model_all.coef_[0],
            'intercept': model_all.intercept_,
            'r2': model_all.score(X_all, y_all),
            'mae': mae_all,
        },
        {
            'slope': model_flt.coef_[0],
            'intercept': model_flt.intercept_,
            'r2': model_flt.score(X_filtered, y_filtered),
            'mae': mae_flt,
        },
    ],
    index=['all', 'filtered_iqr'],
)
results


Unnamed: 0,slope,intercept,r2,mae
all,1.587569,-0.000373,0.961859,0.006333
filtered_iqr,0.972675,3.1e-05,0.573566,0.004903


### Reflection (≤ 1 page)
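- Methods and thresholds used (and why): we used basic statistical methods to flag outliers, namely IQR fences with k = 1.5 and z-scores with a threshold of 3 (the conventional defaults for each rule), and a simple linear regression to measure the effect of removing the IQR-flagged points.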
- Assumptions behind choices: we assume the relationship between the two return series is linear and that the errors are homoskedastic.
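- Observed impact on results: after filtering outliers, $R^2$ decreased (from about 0.96 to about 0.57) and the slope fell from about 1.59 to about 0.97. This means the outliers were influential: they drove much of the fit, and removing them reduced the explanatory power of the independent variable.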
