In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from scipy import stats
from flask import Flask, jsonify, request
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np
import category_encoders as ce
from scipy import stats
import os
from flask import request, jsonify
from flask import Flask, request, jsonify
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import json


In [2]:
def load_and_combine_data(file_paths):
    """Read every Excel workbook in ``file_paths`` and stack them row-wise.

    The combined DataFrame gets a fresh positional index because the
    concatenation is done with ``ignore_index=True``.
    """
    frames = []
    for path in file_paths:
        frames.append(pd.read_excel(path))
    combined = pd.concat(frames, ignore_index=True)
    return combined

def clean_columns(data):
    """Return a new DataFrame without the Commodity, Grade and Sex columns."""
    unwanted = ['Commodity', 'Grade', 'Sex']
    trimmed = data.drop(columns=unwanted)
    return trimmed

def replace_missing_values(data):
    """Turn hyphen placeholders into NaN, modifying ``data`` in place.

    Only cells equal to '-', ' - ', '- ' or ' -' are replaced. The same
    (mutated) DataFrame object is returned for convenient chaining.
    """
    hyphen_markers = ['-', ' - ', '- ', ' -']
    data.replace(to_replace=hyphen_markers, value=np.nan, inplace=True)
    return data

def _normalize_price_text(value):
    """Strip the '/kg' unit, any 's' characters and outer whitespace from a string price."""
    if isinstance(value, str):
        return value.lower().replace("/kg", "").replace("s", "").strip()
    # Numbers and NaN are already usable; leave them for the float cast.
    return value


def convert_price_columns(data, price_columns):
    """Convert price columns to numerical values.

    String cells are lower-cased, stripped of the '/kg' suffix, of every 's'
    character and of surrounding whitespace, then cast to float. Cells that
    are already numeric (or NaN) are kept as they are: the ``.str`` accessor
    used previously turned numeric cells in a mixed column into NaN and
    raised on a fully numeric column.

    Parameters:
    - data (pd.DataFrame): frame holding the price columns; modified in place.
    - price_columns (iterable of str): names of the columns to convert.

    Returns:
    - pd.DataFrame: the same ``data`` object with float price columns.

    Raises:
    - ValueError: if a cleaned cell still cannot be parsed as a float.
    """
    for col in price_columns:
        data[col] = data[col].map(_normalize_price_text).astype(float)
    return data

def impute_missing_values(data, columns, n_neighbors=5):
    """Fill gaps in ``columns`` with a KNN imputer fitted on those same columns.

    The imputed values are written back into ``data`` (in place) and the
    frame is returned. ``n_neighbors`` is forwarded to ``KNNImputer``.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    filled = imputer.fit_transform(data[columns])
    data[columns] = filled
    return data

def filter_markets(data, threshold=10):
    """Keep only the rows whose market has at least ``threshold`` records."""
    records_per_market = data["Market"].value_counts()
    frequent_markets = records_per_market[records_per_market.ge(threshold)].index
    keep_mask = data['Market'].isin(frequent_markets)
    return data[keep_mask]

def remove_outliers(data, columns, z_threshold=3):
    """Remove rows with outliers based on z-score thresholding.

    A row is dropped when the absolute z-score of any listed column exceeds
    ``z_threshold``. Missing values are ignored when computing the mean and
    standard deviation and never mark a row as an outlier, so rows holding
    NaN are kept.

    Parameters:
    - data (pd.DataFrame): input frame (not modified).
    - columns (iterable of str): numeric columns to screen.
    - z_threshold (float): absolute z-score above which a value is an outlier.

    Returns:
    - pd.DataFrame: the rows of ``data`` that are not outliers.
    """
    # Positional mask with one entry per row, independent of the frame's index.
    outliers = np.zeros(len(data), dtype=bool)
    for col in columns:
        values = data[col].to_numpy(dtype=float)
        # nan_policy="omit" keeps the result the same length as the column;
        # dropping NaNs first would misalign the mask with the rows.
        z_scores = np.asarray(stats.zscore(values, nan_policy="omit"), dtype=float)
        with np.errstate(invalid="ignore"):
            # Comparisons with NaN are False, so missing values are not flagged.
            outliers |= np.abs(z_scores) > z_threshold
    return data[~outliers]

def export_data(data, file_name="clean_data.csv"):
    """Write ``data`` to ``file_name`` as CSV, leaving out the index column."""
    data.to_csv(path_or_buf=file_name, index=False)

def main():
    """Run the whole cleaning workflow and write the result to ``clean_data2.csv``.

    Steps, in order: load and merge the workbooks, drop unused columns,
    convert hyphen placeholders to NaN, parse the price columns, KNN-impute
    the numeric columns, drop remaining incomplete rows, sort, discard
    sparsely covered markets, remove outliers and drop duplicate rows.
    """
    # Source workbooks (paths are relative to the working directory).
    workbook_paths = [
        "raw data/Market Prices.xls", "raw data/Market Prices 2.xls",
        "raw data/Market Prices 3.xls", "raw data/Market Prices 4.xls",
        "raw data/Market Prices 5.xls", "Raw Data/Market Prices 6.xls",
        "raw data/Market Prices 7.xls", "Raw Data/Market Prices 8.xls"
    ]
    price_cols = ["Wholesale", "Retail"]
    impute_cols = ["Supply Volume", "Retail", "Wholesale"]
    numeric_cols = ["Retail", "Wholesale", "Supply Volume"]
    sort_keys = ['County', 'Market', 'Classification', 'Date']

    # Cleaning stages
    frame = load_and_combine_data(workbook_paths)
    frame = clean_columns(frame)
    frame = replace_missing_values(frame)
    frame = convert_price_columns(frame, price_cols)
    frame = impute_missing_values(frame, impute_cols)
    frame = frame.dropna()
    frame = frame.sort_values(by=sort_keys)
    frame = filter_markets(frame, threshold=10)
    frame = remove_outliers(frame, numeric_cols)
    frame = frame.drop_duplicates()

    # Persist the cleaned dataset
    export_data(frame, "clean_data2.csv")
    print("Data cleaning and export complete.")


# Only run the workflow when this file is executed directly
if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'raw data/Market Prices.xls'

In [None]:
def load_data(file_path):
    """Read the cleaned dataset from the CSV at ``file_path`` into a DataFrame."""
    cleaned = pd.read_csv(file_path)
    return cleaned

def extract_time_features(data):
    """Parse ``Date`` and derive calendar columns from it, in place.

    Adds Year, Month, Day, DayOfWeek and Quarter (in that order) and returns
    the same DataFrame.
    """
    data['Date'] = pd.to_datetime(data['Date'])
    date_parts = data['Date'].dt
    calendar_fields = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('DayOfWeek', 'dayofweek'),
        ('Quarter', 'quarter'),
    )
    for column_name, attribute in calendar_fields:
        data[column_name] = getattr(date_parts, attribute)
    return data

def add_cyclic_features(data):
    """Add cyclic features for month, day, day of the week, quarter, and year.

    Each calendar field is mapped to a sine/cosine pair using the periods
    12, 31, 7 and 4. The year is scaled over the span between the earliest
    and latest year present. When every row falls in the same year that span
    is zero; the year angle is then fixed at 0 (``Year_sin`` = 0,
    ``Year_cos`` = 1) instead of dividing by zero and producing NaN columns.

    Parameters:
    - data (pd.DataFrame): frame with Year, Month, Day, DayOfWeek and Quarter
      columns; modified in place.

    Returns:
    - pd.DataFrame: the same ``data`` object with the ``*_sin``/``*_cos`` columns.
    """
    two_pi = 2 * np.pi
    for name, period in (('Month', 12), ('Day', 31), ('DayOfWeek', 7), ('Quarter', 4)):
        angle = two_pi * data[name] / period
        data[f'{name}_sin'] = np.sin(angle)
        data[f'{name}_cos'] = np.cos(angle)

    year_offset = data['Year'] - data['Year'].min()
    year_range = data['Year'].max() - data['Year'].min()
    if year_range > 0:
        year_angle = two_pi * year_offset / year_range
    else:
        # Single-year (or empty) data: use a constant angle of 0.
        year_angle = year_offset * 0.0
    data['Year_sin'] = np.sin(year_angle)
    data['Year_cos'] = np.cos(year_angle)
    return data

def add_lagged_features(data, lag_days=7):
    """Add lagged copies of Wholesale, Retail and Supply Volume, in place.

    Values are shifted by ``lag_days`` rows within each
    (County, Market, Classification) group, so the first ``lag_days`` rows of
    every group are NaN. New columns are named ``<name>_lag_<lag_days>``.
    """
    group_keys = ['County', 'Market', 'Classification']
    lag_sources = (
        ('Wholesale', 'Wholesale'),
        ('Retail', 'Retail'),
        ('Supply Volume', 'Supply_Volume'),
    )
    for source_column, prefix in lag_sources:
        data[f'{prefix}_lag_{lag_days}'] = data.groupby(group_keys)[source_column].shift(lag_days)
    return data

def add_rolling_features(data, rolling_windows=None):
    """Add rolling mean and std features for wholesale, retail, and supply volume.

    Rows are sorted by Market, Classification and Date, and the statistics
    are computed over row windows within each (Market, Classification) group.
    Remaining NaNs in the whole frame are back-filled and then forward-filled.

    Parameters:
    - data (pd.DataFrame): input frame; a sorted copy is returned and the
      caller's frame is left untouched.
    - rolling_windows (dict | None): maps a label used in the column names to
      a window size in rows. ``None`` means ``{'7d': 7}``; the default is
      created per call rather than shared as a mutable default argument.

    Returns:
    - pd.DataFrame: sorted frame with ``<column>_rolling_mean_<label>`` and
      ``<column>_rolling_std_<label>`` columns added.
    """
    if rolling_windows is None:
        rolling_windows = {'7d': 7}

    group_keys = ['Market', 'Classification']
    data = data.sort_values(by=['Market', 'Classification', 'Date'])
    for window_name, window_size in rolling_windows.items():
        for column in ['Wholesale', 'Retail', 'Supply Volume']:
            grouped = data.groupby(group_keys)[column]
            data[f'{column}_rolling_mean_{window_name}'] = grouped.transform(
                lambda x: x.rolling(window=window_size, min_periods=1).mean())
            # With min_periods=1 the std of a one-row window is NaN; the fill below covers it.
            data[f'{column}_rolling_std_{window_name}'] = grouped.transform(
                lambda x: x.rolling(window=window_size, min_periods=1).std())
    return data.bfill().ffill()

def encode_categorical_features(data, columns_to_encode):
    """Binary-encode ``columns_to_encode`` and return the encoded DataFrame.

    A new ``BinaryEncoder`` is fitted on ``data`` each time this is called.
    """
    encoder = ce.BinaryEncoder(cols=columns_to_encode, return_df=True)
    encoded = encoder.fit_transform(data)
    return encoded

def filter_columns_by_correlation(data, target_columns, threshold=0.1):
    """Filter columns based on correlation with target columns.

    A feature is kept when its absolute Pearson correlation with at least one
    target exceeds ``threshold``. Correlations are computed on numeric and
    boolean columns only, so non-numeric columns (for example a datetime
    ``Date``) no longer make ``corr()`` raise; they are simply not selected.

    Parameters:
    - data (pd.DataFrame): frame holding the features and the targets.
    - target_columns (list of str): target column names. All of them are
      used; they were previously ignored in favour of hard-coded
      'Retail'/'Wholesale'.
    - threshold (float): minimum absolute correlation for a feature to be kept.

    Returns:
    - pd.DataFrame: the selected feature columns followed by the target columns.
    """
    target_columns = list(target_columns)
    numeric = data.select_dtypes(include=["number", "bool"])
    correlation_with_target = numeric.corr()[target_columns]
    strong = (correlation_with_target.abs() > threshold).any(axis=1)
    filtered_column_names = [
        col for col in correlation_with_target.index[strong.to_numpy()]
        if col not in target_columns
    ]
    return data[filtered_column_names + target_columns]

def export_modeling_data(data, file_name="modeling_data.csv"):
    """Save the modeling dataset to ``file_name`` as CSV without the index."""
    data.to_csv(path_or_buf=file_name, index=False)

def feature_engineering_pipeline():
    """Build the modeling dataset from ``clean_data2.csv``.

    Loads the cleaned data, adds calendar, cyclic, lagged and rolling
    features, binary-encodes the categorical columns, keeps the features
    correlated with the targets and writes ``modeling_data_2.csv``.
    """
    categorical_columns = ['County', 'Market', 'Classification']
    targets = ['Retail', 'Wholesale']

    # Feature construction
    frame = load_data("clean_data2.csv")
    for step in (extract_time_features, add_cyclic_features, add_lagged_features):
        frame = step(frame)
    frame = add_rolling_features(frame, rolling_windows={'7d': 7})

    # Encoding and feature selection
    encoded = encode_categorical_features(frame, columns_to_encode=categorical_columns)
    selected = filter_columns_by_correlation(encoded, target_columns=targets, threshold=0.1)

    # Persist the result
    export_modeling_data(selected, "modeling_data_2.csv")
    print("Feature engineering and export complete.")


# Only run the pipeline when this file is executed directly
if __name__ == "__main__":
    feature_engineering_pipeline()


In [None]:
# Flask application object; the @app.route decorator further down in this cell needs it.
app = Flask(__name__)

# Step 1: Define each function for the pipeline

def load_data(file_path="clean_data2.csv"):
    """Load the cleaned CSV data.

    Parameters:
    - file_path (str | file-like | None): CSV location. ``None`` falls back
      to ``"clean_data2.csv"``: the pipeline in this cell is invoked with
      ``fit_transform(None)``, which hands ``None`` to this function and
      would otherwise bypass the default and fail inside ``read_csv``.

    Returns:
    - pd.DataFrame: the parsed CSV contents.
    """
    if file_path is None:
        file_path = "clean_data2.csv"
    return pd.read_csv(file_path)

def extract_time_features(data):
    """Convert ``Date`` to datetime and add Year, Month, Day, DayOfWeek and Quarter.

    The columns are written into ``data`` itself, which is also returned.
    """
    data['Date'] = pd.to_datetime(data['Date'])
    dates = data['Date']
    data['Year'] = dates.dt.year
    data['Month'] = dates.dt.month
    data['Day'] = dates.dt.day
    data['DayOfWeek'] = dates.dt.dayofweek
    data['Quarter'] = dates.dt.quarter
    return data

def add_cyclic_features(data):
    """Add sine/cosine encodings of Month, Day, DayOfWeek, Quarter and Year, in place.

    Month, Day, DayOfWeek and Quarter use the fixed periods 12, 31, 7 and 4.
    Year is scaled by the span between the earliest and latest year present.
    If all rows share one year that span is zero, so the year angle is set
    to 0 (``Year_sin`` = 0, ``Year_cos`` = 1) rather than dividing by zero,
    which used to fill both columns with NaN.

    Returns the same DataFrame that was passed in.
    """
    periods = {'Month': 12, 'Day': 31, 'DayOfWeek': 7, 'Quarter': 4}
    for field, period in periods.items():
        angle = 2 * np.pi * data[field] / period
        data[f'{field}_sin'] = np.sin(angle)
        data[f'{field}_cos'] = np.cos(angle)

    first_year = data['Year'].min()
    year_range = data['Year'].max() - first_year
    years_elapsed = data['Year'] - first_year
    if year_range > 0:
        year_angle = 2 * np.pi * years_elapsed / year_range
    else:
        # Zero span (single year or no rows): constant angle of 0.
        year_angle = years_elapsed * 0.0
    data['Year_sin'] = np.sin(year_angle)
    data['Year_cos'] = np.cos(year_angle)
    return data

def add_lagged_features(data, lag_days=7):
    """Append ``lag_days``-row lags of Wholesale, Retail and Supply Volume, in place.

    The shift is applied separately inside every
    (County, Market, Classification) group; rows without enough history in
    their group get NaN.
    """
    keys = ['County', 'Market', 'Classification']
    lag_targets = {
        f'Wholesale_lag_{lag_days}': 'Wholesale',
        f'Retail_lag_{lag_days}': 'Retail',
        f'Supply_Volume_lag_{lag_days}': 'Supply Volume',
    }
    for new_column, source_column in lag_targets.items():
        shifted = data.groupby(keys)[source_column].shift(lag_days)
        data[new_column] = shifted
    return data

def add_rolling_features(data, rolling_windows=None):
    """Add per-group rolling mean/std columns for Wholesale, Retail and Supply Volume.

    The frame is sorted by Market, Classification and Date (a sorted copy is
    used, so the caller's frame is not modified). For each window the
    statistics are computed over rows within each (Market, Classification)
    group; leftover NaNs anywhere in the frame are back-filled and then
    forward-filled.

    ``rolling_windows`` maps a column-name label to a window size in rows.
    ``None`` selects ``{'7d': 7}``; the dict is created per call so no
    mutable default is shared between calls.
    """
    windows = {'7d': 7} if rolling_windows is None else rolling_windows
    group_keys = ['Market', 'Classification']
    value_columns = ['Wholesale', 'Retail', 'Supply Volume']

    data = data.sort_values(by=['Market', 'Classification', 'Date'])
    for window_name, window_size in windows.items():
        for column in value_columns:
            per_group = data.groupby(group_keys)[column]
            data[f'{column}_rolling_mean_{window_name}'] = per_group.transform(
                lambda x: x.rolling(window=window_size, min_periods=1).mean())
            # The first row of a group has no std (single observation) and is filled below.
            data[f'{column}_rolling_std_{window_name}'] = per_group.transform(
                lambda x: x.rolling(window=window_size, min_periods=1).std())
    return data.bfill().ffill()

def encode_categorical_features(data):
    """Binary-encode County, Market and Classification with a freshly fitted encoder."""
    return ce.BinaryEncoder(
        cols=['County', 'Market', 'Classification'],
        return_df=True,
    ).fit_transform(data)

def filter_columns_by_correlation(data, target_columns=None, threshold=0.1):
    """Keep the features whose correlation with any target exceeds ``threshold``.

    Parameters:
    - data (pd.DataFrame): frame holding the features and the targets.
    - target_columns (list of str | None): target column names; ``None``
      means ``['Retail', 'Wholesale']`` (built per call instead of sharing a
      mutable default list).
    - threshold (float): minimum absolute Pearson correlation with at least
      one target for a feature to be kept.

    Returns:
    - pd.DataFrame: a copy containing the selected features followed by the
      target columns.

    Correlations use numeric and boolean columns only, so a non-numeric
    column such as a datetime ``Date`` cannot make ``corr()`` raise. Columns
    are selected directly rather than joined back on the index, which could
    duplicate rows when the index is not unique.
    """
    if target_columns is None:
        target_columns = ['Retail', 'Wholesale']
    else:
        target_columns = list(target_columns)

    numeric = data.select_dtypes(include=["number", "bool"])
    correlation_with_target = numeric.corr()[target_columns]
    is_relevant = (correlation_with_target.abs() > threshold).any(axis=1)
    filtered_column_names = [
        col for col in correlation_with_target.index[is_relevant.to_numpy()]
        if col not in target_columns
    ]
    final_data = data[filtered_column_names + target_columns].copy()
    return final_data

def export_modeling_data(data, output_file="modeling_data_2.csv"):
    """Write ``data`` to ``output_file`` as CSV (no index) and return ``data``.

    The frame is returned so that this function behaves like the other
    transformer steps in this cell and the pipeline yields the exported
    data instead of ``None``.
    """
    data.to_csv(output_file, index=False)
    print(f"Data exported to {output_file}")
    return data

# Step 2: Assemble the pipeline. Each step wraps one of the functions above
# in a FunctionTransformer; all steps except the last are named after the
# function they wrap.
feature_engineering_pipeline = Pipeline(steps=[
    (func.__name__, FunctionTransformer(func))
    for func in (
        load_data,
        extract_time_features,
        add_cyclic_features,
        add_lagged_features,
        add_rolling_features,
        encode_categorical_features,
        filter_columns_by_correlation,
    )
] + [
    ('export_data', FunctionTransformer(export_modeling_data)),
])

# Step 3: HTTP endpoint that runs the whole pipeline on demand
@app.route('/process_data', methods=['POST'])
def process_data():
    """Run the feature-engineering pipeline and report the outcome as JSON.

    Responds with HTTP 200 and a confirmation message on success, or with
    HTTP 500 and the exception text when any step raises.
    """
    success_payload = {"message": "Data processing completed and saved as 'modeling_data_2.csv'."}
    try:
        # The pipeline is invoked with None as its input.
        feature_engineering_pipeline.fit_transform(None)
        response = jsonify(success_payload), 200
    except Exception as e:
        # Any failure is reported back to the client as an error message.
        response = jsonify({"error": str(e)}), 500
    return response


# Start the Flask development server when executed as a script
if __name__ == '__main__':
    app.run(debug=True)


In [None]:
# Flask application object; the route decorator below registers its handler on it.
app = Flask(__name__)

# Load the pre-trained model once, when this cell/module is executed,
# so the /predict handler can reuse the same object for every request.
model_path = '../models/final_model.h5'  # relative path, resolved against the working directory
model = load_model(model_path)

# Define data preparation functions
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame (no extra cleaning is done here)."""
    raw_frame = pd.read_csv(filepath_or_buffer=file_path)
    return raw_frame

def extract_time_features(df):
    """Parse ``Date`` and add Year, Month, Day, DayOfWeek and Quarter to ``df`` in place."""
    df['Date'] = pd.to_datetime(df['Date'])
    stamp = df['Date'].dt
    df['Year'], df['Month'], df['Day'] = stamp.year, stamp.month, stamp.day
    df['DayOfWeek'], df['Quarter'] = stamp.dayofweek, stamp.quarter
    return df

def add_cyclic_features(df):
    """Add sine/cosine encodings of Month, Day, DayOfWeek and Quarter to ``df`` in place.

    The periods used are 12, 31, 7 and 4 respectively.
    """
    periods = {'Month': 12, 'Day': 31, 'DayOfWeek': 7, 'Quarter': 4}
    for field, period in periods.items():
        angle = 2 * np.pi * df[field] / period
        df[f'{field}_sin'] = np.sin(angle)
        df[f'{field}_cos'] = np.cos(angle)
    return df

def encode_categorical_features(df):
    """Binary-encode County, Market and Classification.

    The encoder is created and fitted on ``df`` inside this call.
    """
    categorical_columns = ['County', 'Market', 'Classification']
    binary_encoder = ce.BinaryEncoder(cols=categorical_columns, return_df=True)
    encoded_frame = binary_encoder.fit_transform(df)
    return encoded_frame

# Data-preparation pipeline: every step wraps one of the functions above in a
# FunctionTransformer and is named after the function it wraps.
data_preparation_pipeline = Pipeline(steps=[
    (func.__name__, FunctionTransformer(func))
    for func in (
        load_data,
        extract_time_features,
        add_cyclic_features,
        encode_categorical_features,
    )
])

@app.route('/predict', methods=['POST'])
def predict():
    """Predict from the last 7 rows of a CSV file named in the request.

    Expects a JSON object with a ``file_path`` key. The file is run through
    ``data_preparation_pipeline`` and the final 7 prepared rows are reshaped
    to ``(1, 7, n_columns)`` and passed to the loaded model.

    Responses:
    - 200 ``{"prediction": <float>}`` on success.
    - 400 when ``file_path`` is missing or fewer than 7 rows are available.
    - 500 with the exception text for any other failure.
    """
    try:
        # silent=True yields None instead of raising on a missing/invalid JSON body.
        payload = request.get_json(silent=True)
        file_path = payload.get('file_path') if isinstance(payload, dict) else None
        if not file_path:
            return jsonify({"error": "Request body must be JSON with a 'file_path' key"}), 400

        # NOTE(review): file_path comes straight from the client and is read
        # from the server's filesystem; restrict it if this is ever exposed.
        prepared_data = data_preparation_pipeline.fit_transform(file_path)

        # Select the last 7 rows of features for prediction
        n_timesteps = 7
        if len(prepared_data) < n_timesteps:
            return jsonify({"error": "Not enough data for the specified time steps"}), 400
        # NOTE(review): prepared_data still includes the parsed Date column and
        # every other CSV column; confirm this matches the model's input layout.
        prediction_data = prepared_data[-n_timesteps:].values.reshape((1, n_timesteps, prepared_data.shape[1]))

        # Make predictions using the loaded model
        prediction = model.predict(prediction_data)

        # Convert the NumPy scalar to a built-in float; jsonify cannot
        # serialise NumPy float32 values.
        return jsonify({"prediction": float(prediction[0][0])}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == '__main__':
    app.run(debug=True)


In [None]:

# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Location of the saved Keras model: both endpoints load it from here and
# /retrain overwrites it after fitting.
MODEL_PATH = '../models/final_model.h5'

# Helper that turns the request payload into model input
def prepare_data(data):
    """
    Build a 3-D numpy array from a JSON-style list of lists.

    Each inner list is the feature set for one timestep. The result has shape
    (1, timesteps, features), where ``features`` is the length of the first
    inner list.
    """
    feature_count = len(data[0])
    sequence = np.array(data)
    # Single-sample batch in the (1, timesteps, features) layout used for the LSTM.
    return sequence.reshape(1, -1, feature_count)


@app.route('/retrain', methods=['POST'])
def retrain_lstm_model():
    """
    Continue training the saved model on data sent in the request.

    The JSON body must provide 'train_X', 'train_y', 'validation_X' and
    'validation_y' as nested lists. The model is loaded from MODEL_PATH,
    fitted for up to 50 epochs with early stopping and learning-rate
    reduction, and saved back to MODEL_PATH.

    Returns HTTP 200 with a confirmation message, or HTTP 400 with the
    exception text if anything fails.
    """
    try:
        payload = request.get_json()

        def as_column(key):
            # Targets are reshaped into a single column: (n_samples, 1).
            return np.array(payload[key]).reshape(-1, 1)

        # JSON lists -> numpy arrays
        train_X = np.array(payload['train_X'])
        train_y = as_column('train_y')
        validation_X = np.array(payload['validation_X'])
        validation_y = as_column('validation_y')

        # Start from the model currently on disk
        network = load_model(MODEL_PATH)

        # Stop early on a stalled val_loss and halve the learning rate on plateaus
        training_callbacks = [
            EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1),
        ]

        network.fit(
            train_X, train_y,
            validation_data=(validation_X, validation_y),
            epochs=50,
            batch_size=32,
            callbacks=training_callbacks,
            verbose=2
        )

        # Overwrite the saved model with the retrained weights
        network.save(MODEL_PATH)
        return jsonify({"message": "Model retrained and saved successfully."}), 200

    except Exception as e:
        return jsonify({"error": str(e)}), 400


@app.route('/predict', methods=['POST'])
def predict_with_saved_model():
    """
    Return a single prediction from the model stored at MODEL_PATH.

    The JSON body must contain 'input_data': a nested list describing one
    input sequence. Responds with HTTP 200 and the prediction as a float, or
    HTTP 400 with the exception text if anything fails.
    """
    try:
        payload = request.get_json()
        # Shape the sequence as a one-sample batch
        model_input = prepare_data(payload['input_data'])

        # The model is read from disk on every request
        network = load_model(MODEL_PATH)

        output = network.predict(model_input)
        first_value = output[0][0]

        return jsonify({"prediction": float(first_value)}), 200

    except Exception as e:
        return jsonify({"error": str(e)}), 400


if __name__ == '__main__':
    app.run(debug=True)


In [None]:
# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Load the pre-trained model once at start-up. ``load_model`` is the function
# imported from tensorflow.keras.models; the name ``keras`` itself is never
# imported in this file, so ``keras.models.load_model`` raised NameError.
model_path = '../models/final_model.h5'
model = load_model(model_path)

# Early stopping used by /retrain: halt when val_loss has not improved for 5 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

def reshape_data(features: pd.DataFrame, n_timesteps: int = 7):
    """
    Cut the feature table into overlapping windows of consecutive rows.

    Parameters:
    - features (pd.DataFrame): The input features, one row per timestep.
    - n_timesteps (int): Number of rows in each window.

    Returns:
    - reshaped_X (np.ndarray): ``len(features) - n_timesteps`` windows, each
      holding ``n_timesteps`` rows; window ``i`` starts at row ``i``.
    """
    values = features.values
    window_count = len(features) - n_timesteps
    windows = []
    for start in range(window_count):
        windows.append(values[start:start + n_timesteps])
    reshaped_X = np.array(windows)
    return reshaped_X

@app.route('/predict', methods=['POST'])
def predict():
    """
    Endpoint to make predictions based on incoming data.

    The JSON body is converted to a DataFrame, cut into sliding windows of
    7 rows by ``reshape_data`` and passed to the loaded model.

    Responses:
    - 200 with the predictions as a nested JSON list.
    - 400 when the payload has too few rows to form even one window.
    """
    # Get the JSON data from the request
    data = request.get_json()

    # Convert JSON data to DataFrame
    features = pd.DataFrame(data)

    n_timesteps = 7  # Adjust based on your model's requirements

    # reshape_data produces len(features) - n_timesteps windows, so anything
    # up to n_timesteps rows would hand the model an empty batch.
    if len(features) <= n_timesteps:
        return jsonify({"error": "Not enough data for the specified time steps"}), 400

    # Reshape the data
    reshaped_X = reshape_data(features, n_timesteps)

    # Make predictions
    predictions = model.predict(reshaped_X)

    # Return predictions as JSON
    return jsonify(predictions.tolist())

@app.route('/retrain', methods=['POST'])
def retrain():
    """
    Endpoint to retrain the model with new incoming data.

    Expects a JSON object with 'features' (tabular data) and 'target' (one
    value per feature row). Features are cut into 7-row sliding windows,
    the module-level model is fitted on them and saved back to ``model_path``.

    Responses:
    - 200 with a confirmation message after the model has been saved.
    - 400 when features and target differ in length or there are too few rows.
    """
    # Get the JSON data from the request
    data = request.get_json()

    # Convert JSON data to DataFrame
    features = pd.DataFrame(data['features'])
    target = pd.Series(data['target'])

    n_timesteps = 7  # Adjust based on your model's requirements

    if len(features) != len(target):
        return jsonify({"error": "'features' and 'target' must have the same number of rows"}), 400
    if len(features) <= n_timesteps:
        return jsonify({"error": "Not enough data for the specified time steps"}), 400

    # reshape_data takes (features, n_timesteps) and returns only the windows;
    # passing the target as its second argument made it the window length.
    train_X = reshape_data(features, n_timesteps)
    # Window i covers rows i..i+n_timesteps-1, so it is paired with the target
    # of the following row. NOTE(review): assumes the model was originally
    # trained with this next-step alignment - confirm against the training code.
    train_y = target.to_numpy()[n_timesteps:]

    # Retrain the module-level model (only read here, so no ``global`` is needed)
    model.fit(
        train_X, train_y,
        validation_split=0.2,
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=2
    )

    # Save the retrained model
    model.save(model_path)

    return jsonify({"message": "Model retrained successfully."})


if __name__ == '__main__':
    app.run(debug=True)