# Linear Regression Practice

This notebook covers linear regression implementation and practice using Python.

## Import Required Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Apply the seaborn-flavoured matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global RNG so the randomly generated sample data is reproducible.
np.random.seed(42)

## 1. Simple Linear Regression from Scratch

In [None]:
class SimpleLinearRegression:
    """Ordinary least-squares fit of a straight line ``y = slope * x + intercept``.

    ``slope`` and ``intercept`` are ``None`` until :meth:`fit` has been called.
    """

    def __init__(self):
        # Fitted parameters; populated by fit().
        self.slope = None
        self.intercept = None

    def fit(self, X, y):
        """Estimate slope and intercept from paired samples.

        Parameters
        ----------
        X, y : array-like
            Feature and target values. Both are flattened to 1-D, so a
            column vector of shape (n, 1) is accepted for either.

        Returns
        -------
        SimpleLinearRegression
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If the inputs are empty, differ in length, or X has zero
            variance (the slope is undefined for a constant feature).
        """
        # Flatten first: an (n, 1) X against an (n,) y would otherwise
        # broadcast to an (n, n) matrix and silently yield a wrong slope.
        X = np.ravel(np.asarray(X, dtype=float))
        y = np.ravel(np.asarray(y, dtype=float))

        if X.size == 0:
            raise ValueError("X and y must contain at least one sample")
        if X.size != y.size:
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {X.size} and {y.size})"
            )

        X_mean = np.mean(X)
        y_mean = np.mean(y)

        # slope = sum((x - x̄)(y - ȳ)) / sum((x - x̄)²)
        X_centered = X - X_mean
        denominator = np.sum(X_centered ** 2)
        if denominator == 0:
            raise ValueError("X has zero variance; the slope is undefined")
        self.slope = np.sum(X_centered * (y - y_mean)) / denominator

        # The least-squares line passes through the point of means.
        self.intercept = y_mean - self.slope * X_mean
        return self

    def predict(self, X):
        """Evaluate the fitted line element-wise at ``X``."""
        return self.slope * X + self.intercept

    def score(self, X, y):
        """Return the coefficient of determination R² of the predictions on ``X``.

        ``y`` and the predictions are flattened before comparison so that
        column-vector inputs do not broadcast against 1-D inputs.

        For a constant target the usual ratio is 0/0 or x/0; in that case
        1.0 is returned when the predictions are exact and 0.0 otherwise,
        matching the finite values scikit-learn's ``r2_score`` reports by
        default.
        """
        y_true = np.ravel(np.asarray(y, dtype=float))
        y_pred = np.ravel(np.asarray(self.predict(X), dtype=float))

        ss_res = np.sum((y_true - y_pred) ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        if ss_tot == 0:
            return 1.0 if ss_res == 0 else 0.0
        return 1 - (ss_res / ss_tot)

print("Simple Linear Regression class created successfully!")

## 2. Generate Sample Data

In [None]:
# Synthetic sample: the line y = 2.5x + 1.5 on an even grid, plus Gaussian noise
n_samples = 100
X = np.linspace(0, 10, n_samples)
y_noise = np.random.normal(0, 1.5, n_samples)
y = 2.5 * X + 1.5 + y_noise

# Scatter plot of the raw sample
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, alpha=0.6, color='blue')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Generated Sample Data')
ax.grid(True, alpha=0.3)
plt.show()

# Quick numeric summary of what was generated
print(f"Generated {n_samples} data points")
for label, values in (("X", X), ("y", y)):
    print(f"{label} range: [{values.min():.2f}, {values.max():.2f}]")

## 3. Train Custom Linear Regression

In [None]:
# Fit the from-scratch implementation on the full synthetic sample
custom_model = SimpleLinearRegression()
custom_model.fit(X, y)

# In-sample predictions (plotted in the comparison figure further down)
y_pred_custom = custom_model.predict(X)

# Report the fitted line and its goodness of fit
custom_slope = custom_model.slope
custom_intercept = custom_model.intercept
custom_r2 = custom_model.score(X, y)

print("Custom Model Results:")
print(f"Slope: {custom_slope:.4f}")
print(f"Intercept: {custom_intercept:.4f}")
print(f"R² Score: {custom_r2:.4f}")
print(f"Equation: y = {custom_slope:.4f}x + {custom_intercept:.4f}")

## 4. Compare with Scikit-Learn

In [None]:
# scikit-learn estimators take a 2-D feature matrix, so turn X into one column
X_reshaped = X.reshape(-1, 1)

# Fit the reference implementation on the same sample
sklearn_model = LinearRegression()
sklearn_model.fit(X_reshaped, y)

# In-sample predictions (plotted in the comparison figure further down)
y_pred_sklearn = sklearn_model.predict(X_reshaped)

# Report the fitted line and its goodness of fit
sk_slope = sklearn_model.coef_[0]
sk_intercept = sklearn_model.intercept_
sk_r2 = sklearn_model.score(X_reshaped, y)

print("Scikit-Learn Model Results:")
print(f"Slope: {sk_slope:.4f}")
print(f"Intercept: {sk_intercept:.4f}")
print(f"R² Score: {sk_r2:.4f}")
print(f"Equation: y = {sk_slope:.4f}x + {sk_intercept:.4f}")

# Absolute gap between the two implementations' parameters
slope_gap = abs(custom_model.slope - sk_slope)
intercept_gap = abs(custom_model.intercept - sk_intercept)

print("\nComparison:")
print(f"Slope difference: {slope_gap:.6f}")
print(f"Intercept difference: {intercept_gap:.6f}")

## 5. Visualize Results

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Raw observations
ax.scatter(X, y, alpha=0.6, color='blue', label='Data Points')

# Overlay both fitted lines; the dashed one stays visible where they coincide
fitted_lines = (
    (y_pred_custom, 'r-', 'Custom Linear Regression'),
    (y_pred_sklearn, 'g--', 'Scikit-Learn Linear Regression'),
)
for line_values, line_fmt, line_label in fitted_lines:
    ax.plot(X, line_values, line_fmt, linewidth=2, label=line_label)

ax.set_xlabel('X', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title('Linear Regression Comparison', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
plt.show()

## 6. Realistic Dataset Example - Synthetic Housing Data

In [None]:
# Build a reproducible synthetic housing table
np.random.seed(42)
n_houses = 500

# Explanatory variables; the draw order is fixed so the seed reproduces them
rooms = np.random.normal(6, 1, n_houses)        # rooms per house
age = np.random.uniform(0, 100, n_houses)       # age of the house
distance = np.random.exponential(3, n_houses)   # distance to employment centers

# Linear price signal plus Gaussian noise, floored at 5 to keep prices positive
price_noise = np.random.normal(0, 3, n_houses)
price = np.maximum(rooms * 8 - age * 0.1 - distance * 2 + price_noise, 5)

# Assemble everything into one table, target column last
housing_data = pd.DataFrame(
    dict(rooms=rooms, age=age, distance=distance, price=price)
)

print("Housing Dataset:")
print(housing_data.head())
print(f"\nDataset shape: {housing_data.shape}")
print("\nBasic statistics:")
print(housing_data.describe())

## 7. Multiple Linear Regression

In [None]:
# Separate the predictor columns from the target column
feature_cols = ['rooms', 'age', 'distance']
X_multi = housing_data[feature_cols]
y_multi = housing_data['price']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Fit on the training portion only
multi_model = LinearRegression()
multi_model.fit(X_train, y_train)

# Predictions for both portions
y_train_pred = multi_model.predict(X_train)
y_test_pred = multi_model.predict(X_test)

# Goodness of fit (R²) and error in price units (RMSE) for each portion
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Multiple Linear Regression Results:")
print(f"Training R² Score: {train_r2:.4f}")
print(f"Testing R² Score: {test_r2:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Testing RMSE: {test_rmse:.4f}")

# One fitted weight per predictor, then the intercept
print("\nModel Coefficients:")
for column_name, weight in zip(X_multi.columns, multi_model.coef_):
    print(f"{column_name}: {weight:.4f}")
print(f"Intercept: {multi_model.intercept_:.4f}")

## 8. Visualization of Results

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_fit, ax_resid), (ax_coef, ax_hist) = axes

# Top-left: predictions against actual prices, with the y = x reference line
price_limits = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_test_pred, alpha=0.6)
ax_fit.plot(price_limits, price_limits, 'r--', lw=2)
ax_fit.set_xlabel('Actual Price')
ax_fit.set_ylabel('Predicted Price')
ax_fit.set_title('Actual vs Predicted Prices')
ax_fit.grid(True, alpha=0.3)

# Top-right: residuals against predictions; structure here signals a poor fit
residuals = y_test - y_test_pred
ax_resid.scatter(y_test_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Price')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')
ax_resid.grid(True, alpha=0.3)

# Bottom-left: fitted coefficient of each feature (raw, unscaled values)
feature_names = X_multi.columns
coefficients = multi_model.coef_
ax_coef.bar(feature_names, coefficients)
ax_coef.set_title('Feature Coefficients')
ax_coef.set_ylabel('Coefficient Value')
plt.setp(ax_coef.xaxis.get_majorticklabels(), rotation=45)

# Bottom-right: histogram of the test-set residuals
ax_hist.hist(residuals, bins=20, alpha=0.7, edgecolor='black')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Residuals')
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Exercise: Polynomial Regression

In [None]:
# Non-linear sample: a cubic in X plus unit-variance Gaussian noise.
# The sample count lives in one name so the grid and the noise cannot drift apart.
n_poly = 100
X_poly = np.linspace(0, 4, n_poly)
y_poly = 0.5 * X_poly**3 - 2 * X_poly**2 + X_poly + 1 + np.random.normal(0, 1, n_poly)

# scikit-learn expects a 2-D feature matrix
X_poly_reshaped = X_poly.reshape(-1, 1)

# Fit one model per polynomial degree and draw each in its own panel
degrees = [1, 2, 3, 4]
poly_fig, poly_axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, degree in zip(poly_axes.flat, degrees):
    # Expand X into polynomial terms, then fit an ordinary linear model on them
    poly_model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    poly_model.fit(X_poly_reshaped, y_poly)
    y_poly_pred = poly_model.predict(X_poly_reshaped)

    # R² here is measured on the same data the model was fitted to
    poly_r2 = poly_model.score(X_poly_reshaped, y_poly)

    ax.scatter(X_poly, y_poly, alpha=0.6, color='blue')
    ax.plot(X_poly, y_poly_pred, 'r-', linewidth=2)
    ax.set_title(f'Polynomial Degree {degree} (R² = {poly_r2:.3f})')
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 10. Key Takeaways

### What we covered:
1. **Simple Linear Regression from scratch** - Understanding the mathematics
2. **Comparison with Scikit-Learn** - Validating our implementation
3. **Multiple Linear Regression** - Handling multiple features
4. **Model Evaluation** - R², RMSE, residual analysis
5. **Polynomial Regression** - Handling non-linear relationships

### Key Concepts:
- **Least Squares Method**: Minimizes sum of squared residuals
- **R² Score**: Proportion of the variance in the target variable that the model explains (1.0 is a perfect fit)
- **Residual Analysis**: Check for patterns indicating model issues
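- **Overfitting**: Higher-degree polynomials may overfit; R² on the training data never decreases as the degree grows, so judge model complexity on held-out data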
