In [1]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
warnings.filterwarnings("ignore")


### Data Collection Pipeline

In [None]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)

# Read the raw dataset; point the path at your own CSV before running.
data = load_data(file_path="data/path_to_your_data.csv")
# Leave the preview as the cell's last expression so the notebook renders it.
data.head()


### EDA Pipeline 

In [None]:
def plot_numerical_features(df):
    """Plot a histogram (with KDE) and a box plot for every numeric column.

    Two figures are shown: one grid of histograms followed by one grid of
    box plots. The grid is three columns wide and grows by whole rows, so
    any number of numeric columns can be plotted; with nine or fewer
    columns the layout is the familiar 3x3 grid on a 12x10 inch figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to explore. Only columns selected by
        ``select_dtypes(include=[np.number])`` are plotted.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

    # Nothing to draw: skip instead of rendering two blank figures.
    if not numeric_features:
        print("No numerical features to plot.")
        return

    # A fixed 3x3 grid fails on the 10th feature, so size the grid from the
    # feature count (never fewer than three rows, to keep the usual layout).
    n_cols = 3
    n_rows = max(3, (len(numeric_features) + n_cols - 1) // n_cols)
    # Scale the height with the row count so extra rows are not squashed.
    figsize = (12, 10 * n_rows / 3)

    plt.figure(figsize=figsize)
    for i, feature in enumerate(numeric_features, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df[feature], bins=30, kde=True)
        plt.title(f"Distribution of {feature}")
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=figsize)
    for i, feature in enumerate(numeric_features, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.boxplot(y=df[feature])
        plt.title(f"Boxplot of {feature}")
    plt.tight_layout()
    plt.show()

def plot_categorical_features(df):
    """Plot a horizontal count plot for every object-dtype column.

    The grid is three columns wide and grows by whole rows, so any number
    of categorical columns can be plotted; with nine or fewer columns the
    layout is a 3x3 grid on a 12x10 inch figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to explore. Only columns selected by
        ``select_dtypes(include=[object])`` are plotted.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    categorical_features = df.select_dtypes(include=[object]).columns.tolist()

    # Nothing to draw: skip instead of rendering a blank figure.
    if not categorical_features:
        print("No categorical features to plot.")
        return

    # A fixed 3x3 grid fails on the 10th feature, so size the grid from the
    # feature count (never fewer than three rows, to keep the usual layout).
    n_cols = 3
    n_rows = max(3, (len(categorical_features) + n_cols - 1) // n_cols)

    plt.figure(figsize=(12, 10 * n_rows / 3))
    for i, feature in enumerate(categorical_features, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.countplot(y=df[feature])
        plt.title(f"Count of {feature}")
    plt.tight_layout()
    plt.show()

# Exploratory plots: numeric distributions first, then categorical counts.
plot_numerical_features(df=data)
plot_categorical_features(df=data)


### Data Preprocessing Pipeline

In [None]:
def preprocess_data(df, target_column, variance_threshold=0.1):
    """Build, fit, and persist the preprocessing pipeline; return model-ready data.

    Steps, in order:

    1. Separate ``target_column`` from the feature columns.
    2. Drop numeric features whose raw (unscaled) variance is below
       ``variance_threshold``.
    3. Mean-impute and standardize the remaining numeric features, and
       mode-impute and one-hot encode the object-dtype features.
    4. Fit the combined transformer on the features and save it to
       ``../models/preprocessor.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the features and ``target_column``. It is not
        modified.
    target_column : str
        Name of the column to predict.
    variance_threshold : float, default 0.1
        Numeric features with variance strictly below this value are removed.

    Returns
    -------
    X_preprocessed
        Whatever ``ColumnTransformer.fit_transform`` returns for the selected
        features (a dense array or a sparse matrix).
    y : pandas.Series
        The untouched target column.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.

    Notes
    -----
    Columns that are neither numeric nor object dtype are listed in no
    transformer, so the ``ColumnTransformer`` (default ``remainder='drop'``)
    leaves them out of the result.

    The transformer is fitted on every row passed in. If the data is split
    into train/validation/test afterwards, imputation and scaling statistics
    from the held-out rows leak into the transform.
    """
    # Split off the target first so it can never end up in a feature list or
    # be removed by the variance filter below.
    features = df.drop(columns=[target_column])
    y = df[target_column]

    # Identify numeric and categorical features
    numeric_features = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = features.select_dtypes(include=[object]).columns.tolist()

    # Feature selection: variance is only defined for the numeric columns, so
    # compute it on that subset. A NaN variance compares False and is kept.
    variances = features[numeric_features].var()
    features_to_drop = variances[variances < variance_threshold].index.tolist()

    # Keep the frame and the column list in sync; the transformer selects
    # columns by name and fails if a listed column is missing.
    X = features.drop(columns=features_to_drop)
    numeric_features = [col for col in numeric_features if col not in features_to_drop]

    # Handle missing values, then scale / encode
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Fit and transform the data
    X_preprocessed = preprocessor.fit_transform(X)

    # Save the fitted preprocessor so new data can be transformed identically
    joblib.dump(preprocessor, '../models/preprocessor.pkl')

    # Return processed data
    return X_preprocessed, y


### Data Splitting Pipeline

In [None]:
def split_data(X, y):
    """Partition ``X`` and ``y`` into training, validation, and test subsets.

    Thirty percent of the rows are held out first, and that hold-out is then
    halved, giving roughly a 70/15/15 split. Both cuts use
    ``random_state=42`` so the partition is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # First cut: training rows versus everything held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    # Second cut: divide the hold-out evenly into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Transform the raw frame; replace 'target_column' with the real target name.
X, y = preprocess_data(df=data, target_column='target_column')
# Carve the transformed data into train / validation / test partitions.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = split_data(X=X, y=y)


### Model Training and Preliminary Evaluation Pipeline

In [None]:
def train_models(X_train, y_train, cv=3):
    """Compare baseline regressors and return the one that generalizes best.

    Each candidate is scored by K-fold cross-validated mean squared error on
    the training data. Scoring a model on the same rows it was fitted on
    rewards memorization (an unconstrained random forest nearly always wins),
    so held-out folds are used for the comparison. The winner is then
    refitted on the full training set and saved to
    ``../models/best_model.pkl``.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training targets.
    cv : int, default 3
        Number of cross-validation folds used to score each candidate.

    Returns
    -------
    estimator
        The fitted regressor with the lowest cross-validated MSE.

    Raises
    ------
    ValueError
        If no candidate produced a finite cross-validated score.
    """
    models = {
        'Linear Regression': LinearRegression(),
        # Seeded so repeated runs of the notebook pick the same model.
        'Random Forest': RandomForestRegressor(random_state=42),
        'Support Vector Regressor': SVR()
    }

    best_model = None
    best_score = float('inf')

    for model_name, model in models.items():
        # The scorer returns negated MSE (higher is better); flip the sign
        # back so the reported number is an ordinary MSE.
        fold_scores = cross_val_score(
            model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'
        )
        mse = -fold_scores.mean()

        print(f"{model_name} - CV Mean Squared Error: {mse:.2f}")

        if mse < best_score:
            best_score = mse
            best_model = model

    if best_model is None:
        raise ValueError("No model produced a finite cross-validated score.")

    # cross_val_score fits clones, so the chosen estimator is still unfitted.
    best_model.fit(X_train, y_train)

    joblib.dump(best_model, '../models/best_model.pkl')
    return best_model

# Fit the baseline candidates and keep the winner for the later cells.
best_model = train_models(X_train=X_train, y_train=y_train)


### Hyperparameter Tuning

In [None]:
def hyperparameter_tuning(X_train, y_train):
    """Grid-search the tunable regressors and return the best tuned estimator.

    A random forest and a support vector regressor are each tuned with
    3-fold ``GridSearchCV``. The estimator whose best cross-validated score
    is highest (that is, whose MSE is lowest) is saved to
    ``../models/best_model.pkl`` and returned.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    estimator
        The ``best_estimator_`` of the winning search, refitted on all of
        ``X_train`` by ``GridSearchCV``.
    """
    # Built here rather than borrowed from train_models, whose ``models``
    # dict is local to that function and not visible from this one.
    estimators = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'Support Vector Regressor': SVR()
    }

    param_grid = {
        'Random Forest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, 30]
        },
        'Support Vector Regressor': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }

    best_model = None
    # 'neg_mean_squared_error' is a higher-is-better score (values <= 0), so
    # start from -inf and keep the largest one.
    best_score = float('-inf')

    for model_name, estimator in estimators.items():
        grid = GridSearchCV(estimator, param_grid[model_name], cv=3, scoring='neg_mean_squared_error')
        grid.fit(X_train, y_train)

        print(f"{model_name} - Best Parameters: {grid.best_params_}")
        # Printed value is the negated MSE reported by the scorer.
        print(f"{model_name} - Best Score: {grid.best_score_:.2f}")

        if grid.best_score_ > best_score:
            best_score = grid.best_score_
            best_model = grid.best_estimator_

    joblib.dump(best_model, '../models/best_model.pkl')
    return best_model

# Replace the baseline winner with the best tuned estimator.
best_model = hyperparameter_tuning(X_train=X_train, y_train=y_train)


### Best Model Evaluation Pipeline

In [None]:
def evaluate_model(model, X, y, set_name="Validation"):
    """Print the mean squared error and R² of ``model`` on ``X`` / ``y``.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X, y : array-like
        Features and true targets to score against.
    set_name : str, default "Validation"
        Label used in the printed heading.
    """
    y_pred = model.predict(X)
    # Compute both metrics before printing anything.
    report = [
        f"{set_name} Set Evaluation:",
        f"Mean Squared Error: {mean_squared_error(y, y_pred):.2f}",
        f"R² Score: {r2_score(y, y_pred):.2f}",
    ]
    print("\n".join(report))

# Score the tuned model on the validation split, then on the test split.
evaluate_model(model=best_model, X=X_val, y=y_val, set_name="Validation")
evaluate_model(model=best_model, X=X_test, y=y_test, set_name="Test")


### Model Summary and Saving

In [None]:
def save_model_summary(model, X=None, y=None, path='../models/model_summary.txt'):
    """Write the model's class name, MSE, and R² score to a text file.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X, y : array-like, optional
        Features and true targets to score against. When omitted, the
        notebook-level ``X_val`` and ``y_val`` are used, so those must
        already be defined.
    path : str, default '../models/model_summary.txt'
        Destination file; it is overwritten if it exists.

    Returns
    -------
    None
    """
    if X is None:
        X = X_val
    if y is None:
        y = y_val

    # Compute everything before opening the file: opening in 'w' mode
    # truncates it, so a failure here must not leave an empty summary.
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    r2 = r2_score(y, predictions)

    # Explicit UTF-8 because the text contains a non-ASCII character.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"Best Model: {model.__class__.__name__}\n")
        f.write(f"Mean Squared Error: {mse:.2f}\n")
        f.write(f"R² Score: {r2:.2f}\n")

# Persist a short text report for the selected model.
save_model_summary(model=best_model)


### Trial Deployment Prior to Production

In [None]:
import joblib
import pandas as pd

def deploy_model(new_data_path, model_dir='../models'):
    """Load the saved preprocessor and model, then predict on new data.

    Parameters
    ----------
    new_data_path : str or path-like
        CSV file holding the new observations. It must contain the feature
        columns the preprocessor was fitted on.
    model_dir : str, default '../models'
        Directory holding ``preprocessor.pkl`` and ``best_model.pkl``. The
        default is the directory the rest of this notebook saves to.

    Returns
    -------
    Predictions returned by the model's ``predict`` for each row of the file.
    """
    # joblib files are pickles: only load artifacts produced by a trusted run.
    preprocessor = joblib.load(f'{model_dir}/preprocessor.pkl')
    model = joblib.load(f'{model_dir}/best_model.pkl')

    # Load new data
    new_data = pd.read_csv(new_data_path)

    # Apply the already-fitted transform; never refit on incoming data.
    X_new = preprocessor.transform(new_data)

    # Make predictions
    return model.predict(X_new)

# Usage example: score a CSV of unseen rows with the saved artifacts.
predictions = deploy_model(new_data_path='path_to_new_data.csv')
print(predictions)
