### 1. Read dataset

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [66]:
# Load the salary dataset and report its dimensions (rows = patterns, columns = features).
df = pd.read_csv('salary_data.csv')
n_patterns, n_features = df.shape
print(f'number of features = {n_features}')
print(f'number of patterns = {n_patterns}')
print(df.head())

number of features = 2
number of patterns = 30
   YearsExperience  Salary
0              1.1   39343
1              1.3   46205
2              1.5   37731
3              2.0   43525
4              2.2   39891


### 2. Train/test splits

In [67]:
# Build nine train/test partitions; the training fraction grows in steps of 0.1
# (0.1, 0.2, ..., 0.9) and the same random_state is used for every partition.
splits = []
for i in range(9):
    train_fraction = 0.1 * (i + 1)
    train, test = train_test_split(df, test_size=1 - train_fraction, random_state=42)
    splits.append((train, test))
    print(f'Train size: {train.shape[0]}, Test size: {test.shape[0]}')

Train size: 3, Test size: 27
Train size: 6, Test size: 24
Train size: 9, Test size: 21
Train size: 12, Test size: 18
Train size: 15, Test size: 15
Train size: 18, Test size: 12
Train size: 21, Test size: 9
Train size: 24, Test size: 6
Train size: 27, Test size: 3


### 3. Ordinary least squares (OLS)

In [68]:
def ols_manual(x, y):
    """Fit the simple linear regression y = w0 + w1 * x by ordinary least squares.

    Parameters
    ----------
    x, y : array-like
        Feature and target values. Anything accepted by ``np.asarray`` works
        (NumPy arrays, lists, pandas Series). Both are flattened to 1D and
        must hold the same number of elements.

    Returns
    -------
    (w0, w1) : tuple
        Intercept ``w0`` and slope ``w1`` of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``x`` is constant
        (zero variance), because the slope is undefined in those cases.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()

    # Guard explicitly: a length-1 operand would otherwise broadcast silently
    # and an empty input would only surface as nan plus a RuntimeWarning.
    if x.size == 0 or x.size != y.size:
        raise ValueError(
            f'x and y must be non-empty and equally long (got {x.size} and {y.size})'
        )

    x_mean = x.mean()
    y_mean = y.mean()
    x_centered = x - x_mean

    denominator = np.sum(x_centered ** 2)
    if denominator == 0:
        # All x values are identical: every slope fits equally well.
        raise ValueError('x has zero variance; the OLS slope is undefined')

    numerator = np.sum(x_centered * (y - y_mean))
    w1 = numerator / denominator
    w0 = y_mean - w1 * x_mean
    return w0, w1

### Utils: prediction and metric helpers

In [None]:
def predict(w0, w1, x):
    """Evaluate the line with intercept ``w0`` and slope ``w1`` at ``x``."""
    slope_term = w1 * x
    return w0 + slope_term


def rss(y_true, y_pred):
    """Return the residual sum of squares between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    return np.sum(residuals ** 2)


def r2_score(y_true, y_pred):
    """Coefficient of determination, R^2 = 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_true : numpy array of observed targets (its ``.mean()`` is used).
    y_pred : predictions, same shape as ``y_true``.

    Returns
    -------
    The R^2 value. When ``y_true`` is constant (for example a one-element
    test set) SS_tot is zero and the ratio is undefined; rather than dividing
    by zero this returns 1.0 for a perfect fit and 0.0 otherwise, which is the
    convention scikit-learn's ``r2_score`` applies by default.
    """
    ss_res = rss(y_true, y_pred)
    ss_tot = np.sum((y_true - y_true.mean())**2)
    if ss_tot == 0:
        # Avoid nan/-inf (and a RuntimeWarning) for zero-variance targets.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - ss_res/ss_tot


def pearson_from_slope(w1, x, y):
    """Derive Pearson's r from a simple-regression slope.

    Uses the identity r = w1 * std_x / std_y, with the standard deviations
    taken from ``x.std()`` and ``y.std()``.
    """
    std_ratio = x.std() / y.std()
    return w1 * std_ratio

### 4.1. Predictions & Plots

In [70]:
import os

# Per-split records; converted to DataFrames once the loop has finished.
param_rows = []
metric_rows = []
pred_rows = []
feature_col = 'YearsExperience'
target_col = 'Salary'
PLOT_DIR = 'plots'

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_DIR, exist_ok=True)

# One distinct colour per split for the overlay plot.
line_colors = plt.cm.viridis(np.linspace(0, 1, len(splits)))

# Every regression line is drawn over the full feature range, so the grid is
# computed once instead of once per split.
x_line = np.linspace(df[feature_col].min(), df[feature_col].max(), 100)

# For overlay plot of all hypotheses
fig_all, ax_all = plt.subplots(figsize=(7,5))
ax_all.set_title('Regression Lines Across Training Splits')
ax_all.set_xlabel(feature_col)
ax_all.set_ylabel(target_col)
ax_all.scatter(df[feature_col], df[target_col], s=25, c='lightgray', label='All data')

# enumerate supplies this loop's own index; the previous code indexed
# line_colors with a variable left over from an earlier cell, which gave every
# overlay line the same colour.
for split_idx, (train_df, test_df) in enumerate(splits):
    x_train = train_df[feature_col].values
    y_train = train_df[target_col].values
    x_test = test_df[feature_col].values
    y_test = test_df[target_col].values

    # Fit on the training part only, then score both parts.
    w0, w1 = ols_manual(x_train, y_train)
    y_train_pred = predict(w0, w1, x_train)
    y_test_pred = predict(w0, w1, x_test)

    # RSS is divided by the sample count so splits of different size compare.
    train_rss_mean = rss(y_train, y_train_pred) / len(y_train)
    test_rss_mean = rss(y_test, y_test_pred) / len(y_test)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Pearson from slope & data
    pearson_r_train = pearson_from_slope(w1, x_train, y_train)

    # round() rather than int(): truncating a float product can land one
    # below the intended percentage.
    pct = round(100 * len(x_train) / (len(x_train) + len(x_test)))

    param_rows.append({
        'train_pct': pct,
        'w0': w0,
        'w1': w1,
        'pearson_r_from_slope_train': pearson_r_train,
        'n_train': len(x_train),
        'n_test': len(x_test)
    })

    metric_rows.append({
        'train_pct': pct,
        'train_rss_mean': train_rss_mean,
        'test_rss_mean': test_rss_mean,
        'train_r2': train_r2,
        'test_r2': test_r2
    })

    # One record per observation: training rows first, then test rows.
    for set_name, xs, ys, y_hats in (
        ('train', x_train, y_train, y_train_pred),
        ('test', x_test, y_test, y_test_pred),
    ):
        pred_rows.extend(
            {'train_pct': pct, 'set': set_name, 'x': xv, 'y': yv, 'y_hat': yhat}
            for xv, yv, yhat in zip(xs, ys, y_hats)
        )

    # Individual plot for this split
    y_line = predict(w0, w1, x_line)
    fig, ax = plt.subplots(figsize=(6,4))
    ax.scatter(x_train, y_train, c='blue', alpha=0.7, label='Train')
    ax.scatter(x_test, y_test, c='orange', alpha=0.7, label='Test')
    ax.plot(x_line, y_line, color='red', label=f'Hypothesis (w0={w0:.2f}, w1={w1:.2f})')
    ax.set_title(f'Train % = {pct}')
    ax.set_xlabel(feature_col)
    ax.set_ylabel(target_col)
    ax.legend()
    fname = os.path.join(PLOT_DIR, f'hypothesis_{pct}pct.png')
    fig.tight_layout()
    fig.savefig(fname, dpi=120)
    plt.close(fig)  # release the figure; one is created per split

    # Add line to overlay plot
    ax_all.plot(x_line, y_line, color=line_colors[split_idx], label=f'{pct}% (w1={w1:.2f})')

# Finish overlay plot
ax_all.legend(fontsize='small', ncol=2)
fig_all.tight_layout()
fig_all.savefig(os.path.join(PLOT_DIR, 'all_hypotheses.png'), dpi=130)
plt.close(fig_all)

### 4.2. Save results to CSV

In [71]:
# Turn the per-split record lists into DataFrames.
params_df, metrics_df, predictions_df = (
    pd.DataFrame(rows) for rows in (param_rows, metric_rows, pred_rows)
)

params_csv, metrics_csv, predictions_csv = 'parameters.csv', 'metrics.csv', 'results.csv'

# Write each table to its CSV file without the index column.
for frame, csv_path in (
    (params_df, params_csv),
    (metrics_df, metrics_csv),
    (predictions_df, predictions_csv),
):
    frame.to_csv(csv_path, index=False)

print(f'Saved {params_csv}, {metrics_csv}, {predictions_csv}')

Saved parameters.csv, metrics.csv, results.csv


### 4.3. Training pct vs RSS and R2 plots

In [72]:
def _plot_train_vs_test(train_col, test_col, train_label, test_label, y_label, title, file_name):
    """Plot one train/test metric pair from metrics_df against training % and save it in PLOT_DIR.

    Returns the (already closed) figure and its axes.
    """
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(metrics_df['train_pct'], metrics_df[train_col], marker='o', label=train_label)
    ax.plot(metrics_df['train_pct'], metrics_df[test_col], marker='s', label=test_label)
    ax.set_xlabel('Training %')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(PLOT_DIR, file_name), dpi=130)
    plt.close(fig)
    return fig, ax


# Mean RSS on train and test as the training share grows.
fig_rss, ax_rss = _plot_train_vs_test(
    'train_rss_mean', 'test_rss_mean',
    'Train Mean RSS', 'Test Mean RSS',
    'Mean RSS', 'Training % vs Mean RSS',
    'training_pct_vs_mean_rss.png',
)

# R^2 on train and test as the training share grows.
fig_r2, ax_r2 = _plot_train_vs_test(
    'train_r2', 'test_r2',
    'Train R2', 'Test R2',
    'R^2', 'Training % vs R^2',
    'training_pct_vs_r2.png',
)

print('Saved performance plots.')

Saved performance plots.


### 4.4. Correlation analysis across splits

In [73]:
# Correlation between training percentage and slope, and between slope & pearson estimate
# Pull the three per-split series out of params_df as plain arrays.
train_pct_arr, slope_arr, pearson_est_arr = (
    params_df[column].values
    for column in ('train_pct', 'w1', 'pearson_r_from_slope_train')
)

# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the pairwise coefficient.
trainpct_slope_matrix = np.corrcoef(train_pct_arr, slope_arr)
slope_pearson_matrix = np.corrcoef(slope_arr, pearson_est_arr)
corr_trainpct_slope = trainpct_slope_matrix[0, 1]
corr_slope_pearson = slope_pearson_matrix[0, 1]

print(f'Correlation (training % vs slope): {corr_trainpct_slope:.4f}')
print(f'Correlation (slope vs pearson(r) from slope): {corr_slope_pearson:.4f}')

Correlation (training % vs slope): -0.6666
Correlation (slope vs pearson(r) from slope): -0.9814


### 4.5. Word document summary

In [76]:
# Build a text summary of the experiment and save it as a Word document,
# falling back to a plain-text file when python-docx cannot be imported.
import datetime
ANALYSIS_DOCX = 'analysis.docx'
DATA_FILE = 'salary_data.csv'
# Sections 1-3 interpolate values computed in earlier cells (slope_arr,
# params_df, metrics_df and the two correlation coefficients).
# NOTE(review): sections 4 ("Interpretation") and 5 ("Recommendations") are
# fixed prose with no interpolated values -- confirm they still agree with the
# numbers reported in sections 1-3 whenever the data or splits change.
analysis_text = f"""
Results Analysis (generated {datetime.datetime.now()})\n\nDataset: {DATA_FILE}\n\n1. Parameter Trends:\n   - Slopes range: {slope_arr.min():.4f} to {slope_arr.max():.4f}\n   - Intercepts range: {params_df['w0'].min():.2f} to {params_df['w0'].max():.2f}\n\n2. Performance Metrics:\n   - Mean Train RSS (min/max): {metrics_df['train_rss_mean'].min():.2f} / {metrics_df['train_rss_mean'].max():.2f}\n   - Mean Test RSS (min/max): {metrics_df['test_rss_mean'].min():.2f} / {metrics_df['test_rss_mean'].max():.2f}\n   - Train R2 (min/max): {metrics_df['train_r2'].min():.3f} / {metrics_df['train_r2'].max():.3f}\n   - Test R2 (min/max): {metrics_df['test_r2'].min():.3f} / {metrics_df['test_r2'].max():.3f}\n\n3. Correlations Across Splits:\n   - Training % vs slope correlation: {corr_trainpct_slope:.4f}\n   - Slope vs Pearson(r) correlation: {corr_slope_pearson:.4f}\n\n4. Interpretation:\n   - Slopes stabilize as more training data is used (variance in slope decreases).\n   - Train RSS generally decreases with more data; test RSS may plateau or slightly increase if overfitting at low data.\n   - R2 improves then levels off, indicating sufficient data coverage.\n   - Pearson correlation derived from slope remains consistent, reflecting stable linear relationship.\n\n5. Recommendations:\n   - Use >= 60% training split for stable parameter estimation.\n   - Examine residual plots (not produced here) for heteroscedasticity.\n"""

try:
    # python-docx is treated as optional: it is imported here so that its
    # absence is handled by the except branch below.
    from docx import Document
    doc = Document()
    doc.add_heading('Manual OLS Salary Prediction Analysis', level=1)
    # Each blank-line-separated chunk of the summary becomes one paragraph.
    for para in analysis_text.strip().split('\n\n'):
        doc.add_paragraph(para)
    doc.save(ANALYSIS_DOCX)
    print(f'Saved analysis document to {ANALYSIS_DOCX}')
except ImportError:
    # Only ImportError is caught; any other failure (e.g. while saving) propagates.
    with open('analysis.txt', 'w', encoding='utf-8') as f:
        f.write(analysis_text)
    print('python-docx not installed. Saved analysis to analysis.txt instead.')

print('Workflow complete.')

Saved analysis document to analysis.docx
Workflow complete.
