In [1]:
# Turn off the Jedi-based engine of the IPython tab completer.
%config IPCompleter.use_jedi = False 
# Set the completer's evaluation policy to 'limited'.
%config IPCompleter.evaluation = 'limited' 
import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including deprecation and numerical warnings raised by the libraries used
# below. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## ***Multi-variable Linear Regression using OLS Method***

In [3]:
class LinearRegressionOLS:
    """
    Multi-variable linear regression fitted in closed form with the
    ordinary least squares (OLS) normal equation.

    The class provides a constructor and two methods, `fit` and `predict`.

    Attributes
    ----------
    all_thetas : ndarray of shape (n_features + 1, n_targets) or None
        Intercept row followed by one row per feature; None until `fit` runs.
    intercept_ : ndarray of shape (n_targets,) or None
        First row of `all_thetas`.
    coeffs_ : ndarray of shape (n_features, n_targets) or None
        Remaining rows of `all_thetas`.
    """
    def __init__(self) -> None:
        self.coeffs_ = None # no coefficients until fit() is called
        self.intercept_ = None
        self.all_thetas = None

    def fit(self , X_train , y_train):
        """
        Estimate the intercept and coefficients from the training data.

        Equation used: theta = (X_transpose . X)^-1 . X_transpose . y, where X
        is the training matrix with a leading column of ones.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, n_targets)

        Returns
        -------
        self : LinearRegressionOLS
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If `X_train` is not 2D or its row count differs from `y_train`.
        """
        # Convert to numpy arrays so that vector operations are available
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        if X_train.ndim != 2:
            raise ValueError(
                f"X_train must be 2D (n_samples, n_features), got {X_train.ndim}D"
            )

        # A 1D target becomes a single-column 2D target
        if y_train.ndim == 1:
            y_train = y_train.reshape((-1 , 1))

        rows = X_train.shape[0]
        if y_train.shape[0] != rows:
            raise ValueError(
                f"X_train has {rows} rows but y_train has {y_train.shape[0]}"
            )

        # Prepend the constant (intercept) column as the first column
        X_design = np.hstack((np.ones((rows , 1)) , X_train))
        X_transpose = X_design.T

        # The pseudo-inverse keeps the solve defined when X_transpose . X is
        # singular (e.g. one-hot columns that sum to the intercept column).
        gram_inverse = np.linalg.pinv(X_transpose.dot(X_design))
        moment = X_transpose.dot(y_train)

        self.all_thetas = gram_inverse.dot(moment)

        # Split the solution into intercept (row 0) and coefficients (the rest)
        self.coeffs_ = self.all_thetas[1 : ]
        self.intercept_ = self.all_thetas[0]
        return self

    def predict(self , X_test):
        """
        Predict targets for `X_test` with the fitted parameters.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_targets)

        Raises
        ------
        RuntimeError
            If called before `fit`.
        ValueError
            If `X_test` is not 2D or its feature count differs from training.
        """
        # predict() is meaningless before fit()
        if self.all_thetas is None:
            raise RuntimeError("You have to call fit first")

        X_test = np.asarray(X_test)
        if X_test.ndim != 2:
            raise ValueError(
                f"X_test must be 2D (n_samples, n_features), got {X_test.ndim}D"
            )
        expected_features = self.all_thetas.shape[0] - 1
        if X_test.shape[1] != expected_features:
            raise ValueError(
                f"X_test has {X_test.shape[1]} features but the model was "
                f"fitted with {expected_features}"
            )

        # Prepend the constant (intercept) column, exactly as in fit()
        rows = X_test.shape[0]
        X_design = np.hstack((np.ones((rows , 1)) , X_test))
        return np.dot(X_design , self.all_thetas)

## ***Multivariable Linear Regression Using Gradient Descent***

In [4]:
class LinearRegressionGradientDescent:
    """
    Multi-variable linear regression fitted with batch gradient descent on
    the mean squared error.

    Parameters
    ----------
    epochs : int, default 100
        Number of full passes (parameter updates) over the training data.
    learning_rate : float, default 0.001
        Step size applied to each gradient update.

    Attributes
    ----------
    coeff_ : ndarray or None
        Feature weights, initialised to zeros of shape (n_features, 1) in
        `fit`; None until `fit` runs.
    intercept_ : float or None
        Bias term, initialised to 0 in `fit`; None until `fit` runs.
    """
    def __init__(self , epochs = 100 , learning_rate = 0.001):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.intercept_ = None
        self.coeff_ = None

    def fit(self , X_train , y_train):
        """
        Run `epochs` gradient-descent updates starting from all-zero parameters.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self : LinearRegressionGradientDescent
            The fitted estimator, so calls can be chained.
        """
        # Convert X_train and y_train to numpy arrays so vector operations apply
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        # A 1D target becomes a single-column 2D target
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1 , 1)

        n , cols = X_train.shape
        # Start from zero weights and zero intercept
        self.coeff_ = np.zeros((cols, 1))
        self.intercept_ = 0

        for _ in range(self.epochs):
            # Predict with the current parameters
            y_pred = np.dot(X_train , self.coeff_) + self.intercept_
            # error = y_true - y_pred
            error = y_train - y_pred

            # Gradients of the mean squared error w.r.t. weights and intercept
            gradient_slope = (-2 / n) * np.dot(X_train.T , error)
            gradient_intercept = (-2 / n) * np.sum(error)

            # Step against the gradient
            self.coeff_ = self.coeff_ - self.learning_rate * gradient_slope
            self.intercept_ = self.intercept_ - self.learning_rate * gradient_intercept
        return self

    def predict(self , X_test):
        """
        Predict targets for `X_test` with the fitted parameters.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray with one row per sample.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.coeff_ is None or self.intercept_ is None:
            raise RuntimeError("You must need to call fit() before predict().")

        # Convert first so lists and DataFrames are accepted, as in fit()
        X_test = np.asarray(X_test)
        return np.dot(X_test , self.coeff_) + self.intercept_

## ***Implementing PolyFeature Class of Scikit Learn For Polynomial Regression***

In [None]:
from itertools import combinations_with_replacement
class PolyFeatures:
    """
        Custom implementation of polynomial feature generator (like sklearn's PolynomialFeatures).

        This class expands a feature matrix X into a new one with polynomial combinations
        of features up to a given degree.

        Parameters:
        -----------
        degree : int
            Maximum degree of the polynomial features (e.g., 2 means square and pairwise products).

        include_bias : bool
            If True, includes a bias column (constant feature = 1).

        Attributes:
        -----------
        n_input_features : int or None
            Number of features in the original input data; None until `fit` runs.

        powers : list of tuples or None
            Each tuple describes which feature indices to multiply (with repetition);
            None until `fit` runs.
    """
    def __init__(self , degree = 2 , include_bias = True):
        self.degree = degree
        self.include_bias = include_bias # if True, a constant column of ones is generated
        self.n_input_features = None # learned in fit()
        self.powers = None # will be a list of combinations of features like (0,), (1,), (0,1), (1,1), etc.

    def _generate_combinations(self , n_features):
        """
        Internal method to generate index-based combinations of feature powers
        needed to compute all polynomial features up to the given degree.

        Parameters:
        -----------
        n_features : int
            Number of input columns.

        Returns:
        --------
        combs : list of tuples
            Each tuple contains indices of columns to multiply together,
            ordered by increasing degree.

        Example: degree = 2 , n_features = 2 (columns X1, X2), include_bias = True
        () --> intercept
        (0 ,) --> X1
        (1 , ) --> X2
        (0 , 0) --> X1 * X1
        (0 , 1) --> X1 * X2
        (1 , 1) --> X2 * X2
        """
        combs = []
        # Degree 0 yields the empty tuple, i.e. the bias term; skip it when
        # include_bias is False by starting at degree 1.
        start = 0 if self.include_bias else 1

        for d in range(start , self.degree + 1):
            # combinations_with_replacement(indices, d) yields every sorted
            # d-tuple of column indices, repetition allowed.
            combs.extend(combinations_with_replacement(range(n_features) , d))
        return combs

    def fit(self , X):
        """
        Fit the transformer to X — learns the number of input features and
        generates all combinations of features required for polynomial expansion.

        Parameters:
        -----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns:
        --------
        self : instance of PolyFeatures
            Fitted transformer.

        Raises:
        -------
        ValueError
            If X is not 2D.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2D (n_samples, n_features), got {X.ndim}D"
            )
        self.n_input_features = X.shape[1]
        self.powers = self._generate_combinations(self.n_input_features)
        return self

    def transform(self , X):
        """
        Expand X into its polynomial features, one output column per tuple
        in `powers`.

        Parameters:
        -----------
        X : array-like, shape (n_samples, n_features)
            Data to expand; must have the feature count seen in `fit`.

        Returns:
        --------
        expanded : ndarray of float, shape (n_samples, len(powers))

        Raises:
        -------
        RuntimeError
            If called before `fit`.
        ValueError
            If X is not 2D or its feature count differs from the fitted one.
        """
        if self.powers is None:
            raise RuntimeError("You must call fit() before transform().")

        X = np.asarray(X , dtype = float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2D (n_samples, n_features), got {X.ndim}D"
            )
        if X.shape[1] != self.n_input_features:
            raise ValueError(
                f"X has {X.shape[1]} features but the transformer was "
                f"fitted with {self.n_input_features}"
            )

        expanded = np.empty((X.shape[0] , len(self.powers)))
        for col , indices in enumerate(self.powers):
            # The product over an empty selection is 1, which gives the bias column.
            expanded[: , col] = X[: , list(indices)].prod(axis = 1)
        return expanded

    def fit_transform(self , X):
        """Fit to X, then return the polynomial expansion of X."""
        return self.fit(X).transform(X)

## **Making a Dataset for linear regression**

In [5]:
# Seed the global NumPy RNG so the synthetic data is reproducible.
np.random.seed(42)
n_samples = 1000

# Keyword arguments are evaluated left to right, so the random draws happen in
# column order: age, income, credit_score, gender, ethnicity.
df = pd.DataFrame(
    dict(
        age=np.random.randint(20, 60, size=n_samples),
        income=np.random.normal(50000, 15000, size=n_samples).astype(int),
        credit_score=np.random.randint(300, 850, size=n_samples),
        gender=np.random.choice(["male", "female"], size=n_samples),
        ethnicity=np.random.choice(["group_A", "group_B"], size=n_samples),
    )
)

In [6]:
# Preview the first rows of the generated feature frame.
df.head()

Unnamed: 0,age,income,credit_score,gender,ethnicity
0,58,40588,542,male,group_B
1,48,42569,331,female,group_A
2,34,56282,690,female,group_B
3,27,44160,576,male,group_B
4,40,31475,831,male,group_B


### Add target column

In [7]:
# Target = linear combination of the features plus Gaussian noise, truncated
# to int. The terms are accumulated in a fixed left-to-right order.
df['loan_amount'] = (
    df['income'].mul(1.2)
    .add(df['credit_score'].mul(100))
    .add(df['age'].mul(50))
    .add(np.where(df['gender'] == 'male', 1000, -1000))
    .add(np.where(df['ethnicity'] == 'group_A', 5000, -5000))
    .add(np.random.normal(0, 10000, size=n_samples))  # noise
    .astype(int)
)

In [8]:
# Preview the frame again to check that the loan_amount target column was added.
df.head()

Unnamed: 0,age,income,credit_score,gender,ethnicity,loan_amount
0,58,40588,542,male,group_B,106623
1,48,42569,331,female,group_A,83624
2,34,56282,690,female,group_B,146252
3,27,44160,576,male,group_B,96525
4,40,31475,831,male,group_B,115102


## **Train/test split**

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='loan_amount'),   # features: every column except the target
    df['loan_amount'],                # target
    test_size=0.2,
    random_state=2,
)

In [10]:
# Preview the training features (the index shows the shuffled original row labels).
X_train.head()

Unnamed: 0,age,income,credit_score,gender,ethnicity
175,47,41983,624,male,group_B
818,22,85454,700,female,group_B
677,36,25837,711,male,group_B
952,52,67403,492,female,group_B
200,25,10291,478,female,group_A


### **Use a ColumnTransformer to encode the categorical columns and standardize the numerical columns**

# make a pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OneHotEncoder , OrdinalEncoder
from sklearn.compose import ColumnTransformer 

In [12]:
# List the feature names available for the column groups defined next.
X_train.columns

Index(['age', 'income', 'credit_score', 'gender', 'ethnicity'], dtype='object')

In [13]:
# Define columns
# Numeric features: scaled with StandardScaler in the preprocessor.
numerical_cols = ['age', 'income', 'credit_score']
# Feature that is one-hot encoded in the preprocessor.
categorical_cols = ['gender']
# Feature that is ordinal-encoded, using the category order listed below.
# NOTE(review): ethnicity groups have no inherent order; with only two groups
# the ordinal code is just a 0/1 indicator — confirm this is intended.
ordinal_cols = ['ethnicity']
ethnicity_order = ['group_A', 'group_B']

In [14]:
# Category list for the one-hot encoder, wrapped in an outer list (one inner
# list per encoded column). Order is the order of first appearance in df.
# NOTE(review): the categories are read from the full frame `df`, not from
# X_train — confirm that using test rows here is acceptable.
gender_categories = [df['gender'].unique().tolist()]
gender_categories

[['male', 'female']]

In [15]:
# Column-wise preprocessing: each (name, transformer, columns) entry applies
# one transformer to its own group of columns.
preprocessor = ColumnTransformer(
    transformers = [
        # Standardize the numeric columns.
        ('num' , StandardScaler() , numerical_cols), 
        # One-hot encode gender with the explicit category list.
        # handle_unknown = 'ignore': presumably unseen categories do not raise
        # at transform time — confirm against the scikit-learn docs.
        ('cat' , OneHotEncoder(categories = gender_categories , handle_unknown = 'ignore') , categorical_cols), 
        # Ordinal-encode ethnicity in the given order; unknown values are
        # configured to be encoded as -1.
        ('ord' , OrdinalEncoder(categories = [ethnicity_order] , handle_unknown = 'use_encoded_value' , unknown_value = -1) , ordinal_cols)
    ]
)

In [16]:
# Create the full pipeline
# Step 1 applies the column preprocessing; step 2 is the closed-form OLS
# regressor defined earlier in this notebook.
pipeline1 = Pipeline(steps = [
    ('preprocessing' , preprocessor), 
    ('Linear-Regression' , LinearRegressionOLS())
])

In [17]:
# Fit the preprocessing and the OLS regressor on the training split only.
pipeline1.fit(X_train , y_train)

In [18]:
# Predict on the held-out split with the fitted OLS pipeline.
y_pred_ols = pipeline1.predict(X_test)

In [19]:
from sklearn.metrics import mean_squared_error , r2_score

In [20]:
# Report R^2 and mean squared error of the OLS predictions on the test split.
print(f"r2_score: {r2_score(y_test , y_pred_ols)}")
print(f"mean_squared_error: {mean_squared_error(y_test , y_pred_ols)}")

r2_score: 0.8985228819173744
mean_squared_error: 89550765.2262354


## ***Now reuse the same preprocessing, but fit the gradient-descent linear regression***

In [21]:
# Create the full pipeline
# Pipeline fits its steps in place, so passing the very same `preprocessor`
# object used by pipeline1 would refit pipeline1's transformer when this
# pipeline is fitted. clone() gives this pipeline its own unfitted copy with
# identical parameters.
from sklearn.base import clone

pipeline2 = Pipeline(steps = [
    ('preprocessing' , clone(preprocessor)),
    ('Linear-Regression' , LinearRegressionGradientDescent(learning_rate = 0.05 , epochs = 1000))
])

In [22]:
# Fit the preprocessing and the gradient-descent regressor on the training split.
pipeline2.fit(X_train , y_train)

In [23]:
# Predict on the held-out split with the fitted gradient-descent pipeline.
y_pred_gd = pipeline2.predict(X_test)

In [24]:
# Report R^2 and mean squared error of the gradient-descent predictions on the
# test split, for comparison with the OLS numbers.
print(f"r2_score: {r2_score(y_test , y_pred_gd)}")
print(f"mean_squared_error: {mean_squared_error(y_test , y_pred_gd)}")

r2_score: 0.8985228818771334
mean_squared_error: 89550765.261747
