#### Day 4: Tuesday, April 29, 2025 – Cleaning Outliers in Crypto Data

In [None]:
import pandas as pd

#### Detect and handle outliers in `volume_eth` (1e12 at row 10) using IQR method.

In [None]:
# Load the market data CSV and preview the first five rows.
df_cleaned = pd.read_csv("crypto_market_data.csv")
df_cleaned.head(n=5)

#### Detect outliers in volume_eth. Метод IQR считает потенциальным выбросом любое значение ниже Q1 - 1.5 * IQR или выше Q3 + 1.5 * IQR. Множитель 1.5 — это стандартное значение, используемое в методе IQR для определения порога выбросов.

In [None]:
# First quartile (25th percentile) of the volume_eth column.
Q1 = df_cleaned["volume_eth"].quantile(q=0.25)
Q1

In [None]:
# Third quartile (75th percentile) of the volume_eth column.
Q3 = df_cleaned["volume_eth"].quantile(q=0.75)
Q3

In [None]:
# IQR stands for interquartile range: the difference between the
# third quartile (Q3) and the first quartile (Q1).

IQR = Q3 - Q1
IQR

In [None]:
# Lower bound used to detect outliers.
# The IQR method treats any value below Q1 - 1.5 * IQR as a potential outlier.
# The 1.5 multiplier is the standard value used by the IQR method for the outlier threshold.

lower_bound = Q1 - 1.5 * IQR
lower_bound

In [None]:
# Upper bound used to detect outliers: Q3 + 1.5 * IQR.
upper_bound = Q3 + 1.5 * IQR
upper_bound

In [None]:
# Boolean mask: True where volume_eth lies strictly below lower_bound
# or strictly above upper_bound.
outliers = (
    df_cleaned["volume_eth"].lt(lower_bound)
    | df_cleaned["volume_eth"].gt(upper_bound)
)
print("Outliers in volume_eth:\n", df_cleaned.loc[outliers, ["date", "volume_eth"]])

#### Replace with median

In [None]:
# Work on an independent copy so df_cleaned keeps its current values.
df_cleaned_replaced = df_cleaned.copy(deep=True)
df_cleaned_replaced.head(n=5)

In [None]:
# Replacement value: the median of volume_eth taken from df_cleaned.
median_volume = df_cleaned["volume_eth"].median()

# On every row where the `outliers` mask is True, overwrite volume_eth
# in df_cleaned_replaced with median_volume.
df_cleaned_replaced.loc[outliers, "volume_eth"] = median_volume

print(
    "Volume ETH after replacing:\n",
    df_cleaned_replaced["volume_eth"].describe(),
)

#### Apply IQR method to detect outliers in `close_bnb` (0 at row 15).

In [None]:
# Reload the CSV into `df` and derive a separate working copy for cleaning.
df = pd.read_csv("crypto_market_data.csv")
df_cleaned = df.copy(deep=True)

In [None]:
# First quartile (25th percentile) of the close_bnb column.
Q1 = df_cleaned["close_bnb"].quantile(q=0.25)
Q1

In [None]:
# Third quartile (75th percentile) of the close_bnb column.
Q3 = df_cleaned["close_bnb"].quantile(q=0.75)
Q3

In [None]:
# Interquartile range: third quartile minus first quartile.
IQR = Q3 - Q1
IQR

In [None]:
# Lower bound used to detect outliers: Q1 - 1.5 * IQR.
lower_bound = Q1 - 1.5 * IQR
lower_bound

In [None]:
# Upper bound used to detect outliers: Q3 + 1.5 * IQR.
upper_bound = Q3 + 1.5 * IQR
upper_bound

In [None]:
# Boolean mask: True where close_bnb lies strictly below lower_bound
# or strictly above upper_bound.
outliers = (
    df_cleaned["close_bnb"].lt(lower_bound)
    | df_cleaned["close_bnb"].gt(upper_bound)
)
print("Outliers in close_bnb:\n", df_cleaned.loc[outliers, ["date", "close_bnb"]])

#### Replace outliers in `close_bnb` with the mean.

In [None]:
# Replacement value: mean of close_bnb over the rows NOT flagged as outliers.
# Averaging over the whole column would let the flagged values skew the very
# number that is about to replace them.
mean = df_cleaned.loc[~outliers, "close_bnb"].mean()
mean

In [None]:
# Overwrite close_bnb with `mean` on the flagged rows, then summarise the column.
df_cleaned.loc[outliers, "close_bnb"] = mean
df_cleaned["close_bnb"].describe()

#### Compare standard deviation of `volume_eth` before and after outlier removal.

In [None]:
# Standard deviation of volume_eth before and after the outlier treatment.
# "before" uses the freshly loaded frame `df`; "after" uses `df_cleaned_replaced`,
# where the flagged volume_eth values were overwritten with the median.
pd.Series(
    {
        "before": df["volume_eth"].std(),
        "after": df_cleaned_replaced["volume_eth"].std(),
    },
    name="volume_eth_std",
)

#### **Mini-Project**: Create a DataFrame with no outliers in `volume_eth` and `close_bnb`, save to `crypto_cleaned.csv`.

In [None]:
# Start the mini-project from a fresh read of the CSV.
df = pd.read_csv("crypto_market_data.csv")
df.head(n=5)

In [None]:
# Cleaning is applied to this copy; `df` stays as loaded.
df_cleaned = df.copy(deep=True)
df_cleaned.head(n=5)

In [None]:
# Column-wise medians of the two columns to be cleaned; show the volume_eth one.
medians = df_cleaned[["volume_eth", "close_bnb"]].median(axis=0)
medians.loc["volume_eth"]

#### For volume_eth

In [None]:
# First quartile (25th percentile) of volume_eth in the loaded frame.
Q1 = df["volume_eth"].quantile(q=0.25)
Q1

In [None]:
# Third quartile (75th percentile) of volume_eth in the loaded frame.
Q3 = df["volume_eth"].quantile(q=0.75)
Q3

In [None]:
# Interquartile range: third quartile minus first quartile.
IQR = Q3 - Q1
IQR

In [None]:
# Lower bound used to detect outliers: Q1 - 1.5 * IQR.
lower_bound = Q1 - 1.5 * IQR
lower_bound

In [None]:
# Upper bound used to detect outliers: Q3 + 1.5 * IQR.
upper_bound = Q3 + 1.5 * IQR
upper_bound

In [None]:
# Mask of rows in `df` whose volume_eth is strictly outside the two bounds.
outliers = df["volume_eth"].lt(lower_bound) | df["volume_eth"].gt(upper_bound)

# Put the volume_eth median into the flagged rows of the working copy.
df_cleaned.loc[outliers, "volume_eth"] = medians["volume_eth"]
df_cleaned["volume_eth"].describe()

#### For close_bnb

In [None]:
# IQR bounds for close_bnb, computed on the loaded frame `df`.
Q1 = df['close_bnb'].quantile(.25)
Q3 = df['close_bnb'].quantile(.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# True where close_bnb is strictly below lower_bound or strictly above upper_bound.
outliers = (df['close_bnb'] < lower_bound) | (df['close_bnb'] > upper_bound)

# Replace the flagged values in the working copy with the close_bnb median.
df_cleaned.loc[outliers, ['close_bnb']] = medians['close_bnb']

# Save the cleaned frame, as the mini-project statement requires.
# index=False keeps the row index out of the file, so reloading the CSV
# does not produce an extra unnamed column.
df_cleaned.to_csv('crypto_cleaned.csv', index=False)

df_cleaned['close_bnb'].describe()

In [1]:
import pandas as pd

# Thresholds that define the two events. The values are hard-coded;
# presumably they were derived from the data — TODO confirm their origin.
BTC_CLOSE_THRESHOLD = 86206.417767
ETH_VOLUME_THRESHOLD = 18213370722.92413
# Largest |P(A and B) - P(A) * P(B)| that is still reported as "independent".
TOLERANCE = 0.01

df = pd.read_csv("crypto_market_data.csv")

# Define the events (one boolean per row).
A = df["close_btc"].gt(BTC_CLOSE_THRESHOLD)
B = df["volume_eth"].gt(ETH_VOLUME_THRESHOLD)

# Probabilities, estimated as the share of rows where the event is true.
P_A = A.mean()
P_B = B.mean()
P_A_and_B = (A & B).mean()  # share of rows where both events are true

print(f"P(A) = {P_A}, P(B) = {P_B}, P(A и B) = {P_A_and_B}")
print(f"P(A) × P(B) = {P_A * P_B}")

# Report independence when the joint probability is within TOLERANCE
# of the product of the marginals.
is_independent = abs(P_A_and_B - P_A * P_B) < TOLERANCE
print("События независимы" if is_independent else "События зависимы")

P(A) = 0.2413793103448276, P(B) = 0.3103448275862069, P(A и B) = 0.06896551724137931
P(A) × P(B) = 0.07491082045184305
События независимы
