<a href="https://colab.research.google.com/github/ParinP25/Statistical-Analysis-Predictive-Modeling-ML/blob/main/Google_AI_Studio_%E2%80%93_AI_Assisted_Development_Session.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Google AI Studio – AI-Assisted Development Session

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def preprocess_dataset(data: pd.DataFrame, target_col: str = None) -> pd.DataFrame:
    """Clean a DataFrame for modelling: de-duplicate, impute, scale and encode.

    Steps, in order:
      1. Work on a copy so ``data`` itself is never modified.
      2. Drop exact duplicate rows (the target column takes part in the
         comparison) and print how many were removed.
      3. Set the target column aside so it is neither scaled nor encoded.
      4. Fill missing numeric values with the column mean, then Min-Max
         scale every numeric column.
      5. Fill missing ``object``/``category`` values with the literal
         ``"Unknown"`` and one-hot encode them with ``drop_first=True``.
      6. Re-attach the untouched target as the last column.

    Args:
        data: Raw input frame.
        target_col: Name of the column to leave untouched. When it is
            ``None`` or not a column of ``data``, nothing is set aside and
            every column is treated as a feature (no error is raised).

    Returns:
        A new DataFrame with the processed features, followed by the target
        column when one was set aside. Row labels of the surviving rows are
        kept as they were in ``data`` (the index is not reset).
    """
    # 1. Creating a copy
    df = data.copy()

    # 2. Removing duplicate rows
    # Done first so duplicates don't skew the means and ranges computed below
    initial_rows = len(df)
    df = df.drop_duplicates()
    print(f"Duplicates removed: {initial_rows - len(df)}")

    # Separating the target column so it is not normalized or encoded
    target = None
    if target_col and target_col in df.columns:
        target = df[target_col]
        df = df.drop(columns=[target_col])

    # Column types: integers/floats vs. text/categorical
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    # 3 + 4. Numeric columns: mean imputation, then Min-Max scaling
    if len(numeric_cols) > 0:
        # DataFrame.mean() yields one mean per column; fillna aligns on names
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

        scaler = MinMaxScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # 5. Encoding categorical columns
    # Gaps are filled with "Unknown" first so those rows get their own
    # indicator instead of an all-False encoding.
    if len(categorical_cols) > 0:
        for col in categorical_cols:
            column = df[col]
            if not column.isna().any():
                continue
            # A Categorical rejects values outside its declared categories,
            # so "Unknown" has to be registered before it can be filled in.
            if (
                isinstance(column.dtype, pd.CategoricalDtype)
                and "Unknown" not in column.cat.categories
            ):
                column = column.cat.add_categories("Unknown")
            df[col] = column.fillna("Unknown")

        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # 6. Putting the target back (index-aligned with the surviving rows)
    if target is not None:
        df[target_col] = target

    return df

# Example
if __name__ == "__main__":
    # Toy frame that exercises every preprocessing branch:
    #   * rows 0 and 1 are exact duplicates
    #   * Salary has one missing numeric value
    #   * Department has one missing categorical value
    #   * Purchased plays the role of the target
    data = dict(
        Age=[25, 25, 30, 45, 22],
        Salary=[50000, 50000, 60000, np.nan, 45000],
        Department=['HR', 'HR', 'Engineering', 'Sales', np.nan],
        Purchased=[0, 0, 1, 1, 0],
    )
    raw_df = pd.DataFrame(data)

    print("--- Raw Data ---", raw_df, sep="\n")
    print("\nProcessing...")

    # 'Purchased' is passed as the target so it is left out of normalization
    clean_df = preprocess_dataset(raw_df, target_col='Purchased')

    print("\n--- Processed Data ---", clean_df, sep="\n")

--- Raw Data ---
   Age   Salary   Department  Purchased
0   25  50000.0           HR          0
1   25  50000.0           HR          0
2   30  60000.0  Engineering          1
3   45      NaN        Sales          1
4   22  45000.0          NaN          0

Processing...
Duplicates removed: 1

--- Processed Data ---
        Age    Salary  Department_HR  Department_Sales  Department_Unknown  \
0  0.130435  0.333333           True             False               False   
2  0.347826  1.000000          False             False               False   
3  1.000000  0.444444          False              True               False   
4  0.000000  0.000000          False             False                True   

   Purchased  
0          0  
2          1  
3          1  
4          0  


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_linear_model(df: pd.DataFrame, target_col: str):
    """Fit a LinearRegression on ``df`` and report its hold-out error.

    Every column other than ``target_col`` is used as a feature. The rows are
    split 80/20 with a fixed seed, the model is fitted on the larger part and
    scored on the smaller one. A three-line summary is printed.

    Args:
        df: Frame holding the features and the target column.
        target_col: Name of the column to predict.

    Returns:
        ``(model, mse)``: the fitted estimator and the mean squared error
        measured on the held-out rows.

    Raises:
        ValueError: If ``target_col`` is not a column of ``df``.
    """
    # Guard clause: fail fast when the requested target is absent
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

    # Target vector first, then everything else as the feature matrix
    target = df[target_col]
    features = df.drop(columns=[target_col])

    # Splitting the data: 20% held out, seed pinned so the same rows are
    # chosen on every run
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Fit, then score on the rows the model has not seen
    model = LinearRegression().fit(features_train, target_train)
    mse = mean_squared_error(target_test, model.predict(features_test))

    for line in (
        "Model Training Completed.",
        f"Test Set Size: {len(features_test)} rows",
        f"Mean Squared Error (MSE): {mse:.4f}",
    ):
        print(line)

    return model, mse

# Example Usage
if __name__ == "__main__":
    # Small, already-numeric dataset: two features and a 'Price' target
    data = dict(
        Feature_A=[0.1, 0.5, 0.4, 0.8, 0.2, 0.9, 0.3, 0.6, 0.7, 0.5],
        Feature_B=[1, 0, 1, 0, 1, 0, 1, 0, 0, 1],
        Price=[100, 150, 140, 200, 110, 210, 130, 160, 190, 155],
    )
    clean_df = pd.DataFrame(data)

    # Train with 'Price' as the column to predict
    trained_model, error_score = train_linear_model(clean_df, target_col='Price')

    # Score one unseen observation: Feature_A=0.5, Feature_B=1
    new_data = [[0.5, 1]]
    prediction = trained_model.predict(new_data)
    print(f"\nPredicted Price for inputs {new_data}: {prediction[0]:.2f}")

Model Training Completed.
Test Set Size: 2 rows
Mean Squared Error (MSE): 50.0000

Predicted Price for inputs [[0.5, 1]]: 157.00




In [4]:
def run_linear_regression_pipeline(data: pd.DataFrame, target_col: str):
    """Preprocess ``data`` and fit/evaluate a LinearRegression in one call.

    Preprocessing: duplicate rows are dropped, the target is separated,
    numeric features are mean-imputed and Min-Max scaled, and
    ``object``/``category`` features have gaps filled with ``"Unknown"``
    before being one-hot encoded with ``drop_first=True``.

    Training: the processed rows are split 80/20 with a fixed seed, the model
    is fitted on the training part and scored with mean squared error on the
    held-out part. Progress messages are printed along the way.

    NOTE(review): the imputation means and scaling ranges are computed over
    all rows, before the train/test split, so held-out rows influence them.

    Args:
        data: Raw input frame; it is copied, never modified.
        target_col: Name of the column to predict.

    Returns:
        ``(model, mse, X_test)``: the fitted estimator, the hold-out mean
        squared error, and the processed feature rows that were held out.

    Raises:
        ValueError: If ``target_col`` is not a column of ``data``.
    """
    # PHASE 1: PREPROCESSING
    print("--- Starting Preprocessing ---")

    # 1. Creating a copy and removing duplicates
    df = data.copy().drop_duplicates()

    # 2. Separating the target from the features
    # Done up front so the target is never normalized or encoded
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in dataset.")

    y = df[target_col]
    X = df.drop(columns=[target_col])

    # 3. Identifying the column types
    numeric_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns

    # 4. Handling numeric data
    if len(numeric_cols) > 0:
        # Filling missing values with the column mean
        X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

        # Normalizing with Min-Max scaling
        scaler = MinMaxScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    # 5. Handling categorical data
    if len(categorical_cols) > 0:
        # Filling gaps with "Unknown" so those rows get their own indicator
        for col in categorical_cols:
            column = X[col]
            if not column.isna().any():
                continue
            # A Categorical rejects values outside its declared categories,
            # so "Unknown" has to be registered before it can be filled in.
            if (
                isinstance(column.dtype, pd.CategoricalDtype)
                and "Unknown" not in column.cat.categories
            ):
                column = column.cat.add_categories("Unknown")
            X[col] = column.fillna("Unknown")

        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    print(f"Preprocessing complete. Feature count: {X.shape[1]}")

    # PHASE 2: MODEL TRAINING
    print("--- Starting Training ---")

    # 6. Splitting the data (20% test size, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 7. Training the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # 8. Evaluating on the held-out rows
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print("Model trained successfully.")
    print(f"MSE: {mse:.4f}")

    return model, mse, X_test

# Example
if __name__ == "__main__":
    # Toy housing data: a repeated row, one missing SquareFeet value and one
    # missing Neighborhood label; 'Price' is the target
    data = dict(
        SquareFeet=[1500, 2000, 1500, 2500, 1800, np.nan],
        Bedrooms=[3, 4, 3, 5, 3, 2],
        Neighborhood=['Urban', 'Suburban', 'Urban', 'Rural', np.nan, 'Suburban'],
        Price=[300000, 450000, 300000, 500000, 350000, 280000],
    )
    raw_df = pd.DataFrame(data)

    # Preprocess, train and evaluate in a single call
    trained_model, final_mse, test_data = run_linear_regression_pipeline(raw_df, target_col='Price')

    # Report the hold-out error
    print("\n--- Pipeline Result ---")
    print(f"Final Mean Squared Error: {final_mse:.2f}")

    # Predict for the first held-out row, which is already preprocessed
    print("\nSample Prediction on Test Set:")
    sample_row = test_data.iloc[[0]]
    prediction = trained_model.predict(sample_row)
    print(f"Inputs:\n{sample_row.to_string(index=False)}")
    print(f"Predicted Price: {prediction[0]:.2f}")

--- Starting Preprocessing ---
Preprocessing complete. Feature count: 5
--- Starting Training ---
Model trained successfully.
MSE: 9143525247.0383

--- Pipeline Result ---
Final Mean Squared Error: 9143525247.04

Sample Prediction on Test Set:
Inputs:
 SquareFeet  Bedrooms  Neighborhood_Suburban  Neighborhood_Unknown  Neighborhood_Urban
        0.5  0.666667                   True                 False               False
Predicted Price: 354378.22
