Author: Shcherbakov Illia

In [None]:
# Imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols
# Use seaborn's white-grid theme for every figure drawn below.
sns.set_theme(style='whitegrid')

# Helpful display options: floats render with thousands separators and four decimals.
pd.set_option('display.float_format', '{:,.4f}'.format)


In [None]:
# Load the Boston dataset (tries fetch_openml, then load_boston, then a local CSV).
# If you have a local 'boston.csv' place it in the same folder as the notebook.
def _standardize_boston_frame(frame):
    """Return a copy of ``frame`` with the column names/dtypes this notebook expects.

    * A lowercase ``medv`` target is renamed to ``MEDV`` (only if ``MEDV`` is absent).
    * ``CHAS`` and ``RAD`` are converted to numbers when they are not already numeric.
      NOTE(review): the OpenML copy is believed to ship these two as categorical
      columns with string categories, in which case ``df['CHAS'] == 0`` matches no
      rows; the conversion is a no-op when they are already numeric.
    """
    out = frame.copy()
    if 'MEDV' not in out.columns and 'medv' in out.columns:
        out = out.rename(columns={'medv': 'MEDV'})
    for col in ('CHAS', 'RAD'):
        if col in out.columns and not pd.api.types.is_numeric_dtype(out[col]):
            # Go through str so categorical codes like '0'/'1' parse as numbers.
            out[col] = pd.to_numeric(out[col].astype(str))
    return out


def load_boston_dataframe(csv_path='boston.csv'):
    """Load the Boston housing data as a DataFrame with a ``MEDV`` column.

    Sources are tried in order; a failure is printed and the next one is tried:

    1. ``sklearn.datasets.fetch_openml(name='Boston', version=1)``
    2. ``sklearn.datasets.load_boston`` (only present in older scikit-learn)
    3. a local CSV file at ``csv_path``

    Parameters
    ----------
    csv_path : str, default 'boston.csv'
        Location of the CSV fallback, relative to the working directory.

    Returns
    -------
    pandas.DataFrame
        The dataset, passed through ``_standardize_boston_frame``.

    Raises
    ------
    RuntimeError
        If none of the three sources could be used.
    """
    try:
        from sklearn.datasets import fetch_openml
        bunch = fetch_openml(name='Boston', version=1, as_frame=True)
        return _standardize_boston_frame(bunch.frame)
    except Exception as e:
        # Deliberately broad: a missing package, no network, or a schema problem
        # should all fall through to the next source rather than abort the notebook.
        print('fetch_openml failed:', e)
    try:
        # scikit-learn older versions
        from sklearn.datasets import load_boston
        bunch = load_boston()
        frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        frame['MEDV'] = bunch.target
        return _standardize_boston_frame(frame)
    except Exception as e:
        print('load_boston failed:', e)
    # fallback to CSV if present
    if os.path.exists(csv_path):
        return _standardize_boston_frame(pd.read_csv(csv_path))
    raise RuntimeError('Could not load Boston dataset. Provide a local boston.csv or install scikit-learn with openml support.')

# Load the dataset once; every later cell reads this frame (the AGE-group cell also adds a column to it).
df = load_boston_dataframe()
# Last expression of the cell, so the preview of the first rows is rendered as the cell output.
df.head()


## Task 1: Familiarize yourself with the dataset

Show basic info and descriptive statistics.

In [None]:
# Column names, dtypes and non-null counts (info() prints its report itself).
df.info()
# Descriptive statistics; as the last expression it is rendered as the cell output.
df.describe()


## Task 2: Visualizations and descriptive summaries

All plots include titles and axis labels.

In [None]:
# 1) Boxplot for MEDV (Median value of owner-occupied homes)
# Drawn on an explicit Axes so the title and label are set on the object being plotted.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=df['MEDV'], ax=ax)
ax.set_title('Boxplot of MEDV (Median value of owner-occupied homes)')
ax.set_ylabel('MEDV ($1000s)')
fig.tight_layout()
plt.show()


In [None]:
# 2) Bar plot for the Charles River variable CHAS (0/1 counts)
if 'CHAS' not in df.columns:
    # Nothing to plot without the indicator column; say so instead of raising.
    print('CHAS variable not found in dataframe.')
else:
    # Tally tracts per CHAS value, ordered by the value itself.
    counts = df['CHAS'].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=counts.index.astype(str), y=counts.values, ax=ax)
    ax.set(
        title='Counts of Tracts Bounding the Charles River (CHAS)',
        xlabel='CHAS (0 = No, 1 = Yes)',
        ylabel='Number of tracts',
    )
    fig.tight_layout()
    plt.show()


In [None]:
# 3) Boxplot MEDV vs AGE groups (discretize AGE into 3 groups)
# Bin edges and their labels; the first bin includes its lower edge (0).
age_bins = [0, 35, 70, 200]
age_labels = ['<=35', '36-70', '>70']

# Store the grouping on df: the ANOVA cell further down groups by 'AGE_grp'.
df['AGE_grp'] = pd.cut(
    df['AGE'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='AGE_grp', y='MEDV', ax=ax)
ax.set(
    title='MEDV by AGE group (proportion built prior to 1940)',
    xlabel='AGE group',
    ylabel='MEDV ($1000s)',
)
fig.tight_layout()
plt.show()


In [None]:
# 4) Scatter plot: NOX vs INDUS
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='INDUS', y='NOX', ax=ax)
ax.set(
    title='Nitric oxides concentration (NOX) vs proportion of non-retail business acres (INDUS)',
    xlabel='INDUS (proportion of non-retail business acres)',
    ylabel='NOX (nitric oxides concentration)',
)
fig.tight_layout()
plt.show()

# Numeric companion to the plot: Series.corr computes the Pearson coefficient by default.
nox_indus_corr = df['NOX'].corr(df['INDUS'])
print('Pearson correlation (NOX, INDUS):', nox_indus_corr)


In [None]:
# 5) Histogram for PTRATIO (pupil-teacher ratio)
# 15 equal-width bins, no density curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['PTRATIO'], bins=15, kde=False, ax=ax)
ax.set(
    title='Histogram of Pupil-Teacher Ratio (PTRATIO)',
    xlabel='PTRATIO',
    ylabel='Count',
)
fig.tight_layout()
plt.show()


## Task 3: Hypothesis tests (α = 0.05)

We will perform:
1. Two-sample t-test for MEDV groups by CHAS
2. One-way ANOVA for MEDV across AGE groups
3. Pearson correlation test for NOX vs INDUS
4. Regression: effect of DIS on MEDV (OLS)


In [None]:
alpha = 0.05  # significance level reused by every test in Task 3

## 1) T-test: MEDV for CHAS = 1 vs CHAS = 0
# H0: mean MEDV is equal for tracts that bound the Charles River (CHAS=1) and those that do not (CHAS=0).
# CHAS can arrive as a categorical/string column (e.g. '0'/'1') depending on the data source;
# comparing such a column with the integers 0/1 matches no rows, so coerce it to numbers first.
chas = pd.to_numeric(df['CHAS'].astype(str), errors='coerce')
group0 = df.loc[chas == 0, 'MEDV'].dropna()
group1 = df.loc[chas == 1, 'MEDV'].dropna()
print('N (CHAS=0) =', len(group0), 'N (CHAS=1) =', len(group1))

if min(len(group0), len(group1)) < 2:
    # Welch's test needs a variance estimate from each group. Without it the result is NaN,
    # and `NaN < alpha` is False, which would be misreported as "Fail to reject H0".
    tstat = pval = float('nan')
    print('\nT-test MEDV by CHAS (Welch): not computed - each CHAS group needs at least 2 observations.')
else:
    tstat, pval = stats.ttest_ind(group0, group1, equal_var=False)  # Welch's t-test
    print('\nT-test MEDV by CHAS (Welch):')
    print('t = %0.4f, p = %0.4f' % (tstat, pval))
    if pval < alpha:
        print('Reject H0: there is a significant difference in MEDV between CHAS groups (alpha=0.05)')
    else:
        print('Fail to reject H0: no significant difference detected (alpha=0.05)')


In [None]:
## 2) ANOVA: MEDV across AGE groups
# H0: mean MEDV is equal across the AGE groups stored in df['AGE_grp'] (built with pd.cut
# in the "MEDV by AGE group" plotting cell, which must have been run first).
# 'AGE_grp' is categorical: observed=True keeps only categories that actually occur, so an
# unused bin cannot hand f_oneway an empty sample (which would turn F and p into NaN), and it
# avoids pandas' FutureWarning about the changing default.
groups = [grp.dropna() for _, grp in df.groupby('AGE_grp', observed=True)['MEDV']]
groups = [grp for grp in groups if len(grp) > 0]

if len(groups) < 2:
    # A one-way ANOVA compares at least two samples; report that rather than print a NaN verdict.
    fstat = pval_anova = float('nan')
    print('ANOVA MEDV by AGE groups: not computed - fewer than 2 non-empty AGE groups.')
else:
    fstat, pval_anova = stats.f_oneway(*groups)
    print('ANOVA MEDV by AGE groups: F = %0.4f, p = %0.4f' % (fstat, pval_anova))
    if pval_anova < alpha:
        print('Reject H0: At least one AGE group has different MEDV (alpha=0.05)')
    elif pval_anova >= alpha:
        print('Fail to reject H0: No evidence of difference across AGE groups (alpha=0.05)')
    else:
        # NaN p-value (e.g. a group with zero variance everywhere): neither comparison holds.
        print('ANOVA MEDV by AGE groups: p-value is undefined; no conclusion drawn.')


In [None]:
## 3) Pearson correlation: NOX vs INDUS
# H0: no linear association between NOX and INDUS.
nox_values, indus_values = df['NOX'], df['INDUS']
r, p_corr = stats.pearsonr(nox_values, indus_values)
print('Pearson r = %0.4f, p = %0.4f' % (r, p_corr))

# Choose the verdict first, then print it once.
corr_verdict = (
    'Reject H0: there is a significant linear relationship between NOX and INDUS (alpha=0.05)'
    if p_corr < alpha
    else 'Fail to reject H0: no evidence of linear relationship (alpha=0.05)'
)
print(corr_verdict)


In [None]:
## 4) Regression: impact of DIS on MEDV
# Simple OLS of MEDV on DIS; add_constant supplies the intercept column.
X = sm.add_constant(df['DIS'])
model = sm.OLS(endog=df['MEDV'], exog=X).fit()
print(model.summary())

# Pull out the slope on DIS and its p-value for the plain-language readout.
coef, p_coef = model.params['DIS'], model.pvalues['DIS']
print('\nInterpretation:')
print('Coefficient for DIS = %0.4f' % coef)
dis_verdict = (
    'This coefficient is statistically significant (p < 0.05).'
    if p_coef < alpha
    else 'This coefficient is NOT statistically significant at alpha = 0.05.'
)
print(dis_verdict)


Author: Shcherbakov Illia