Imports

In [1]:
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error
import numpy as np
from scipy.stats import chi2_contingency 


2024-04-05 11:42:13.707938: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 11:42:13.710357: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 11:42:13.738026: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC 
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
def load_and_explore_data(file_path):
    """Loads the sales data and performs basic EDA.

    Args:
        file_path (str): Path to the CSV file containing sales data.

    Returns:
        pandas.DataFrame: The loaded and preprocessed DataFrame.
    """
    data = pd.read_csv(path)
    
    # Basic EDA
    print(data.head())  # View first few rows
    print(data.info())  # Data types, missing values
    print(data.describe())  # Statistical summary

    return data 

# Example usage
path = '/home/calibraint/dynamic-dashboard/back-end/supermarket_sales.csv'
data = load_and_explore_data(path)


    Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Member    Male   
4  373-73-7910      A     Yangon        Normal    Male   

             Product line  Unit price  Quantity   Tax 5%     Total       Date  \
0       Health and beauty       74.69         7  26.1415  548.9715   1/5/2019   
1  Electronic accessories       15.28         5   3.8200   80.2200   3/8/2019   
2      Home and lifestyle       46.33         7  16.2155  340.5255   3/3/2019   
3       Health and beauty       58.22         8  23.2880  489.0480  1/27/2019   
4       Sports and travel       86.31         7  30.2085  634.3785   2/8/2019   

    Time      Payment    cogs  gross margin percentage  gross income  Rating  
0  13:08      Ewallet  522.83                 4.761905       26.1415     9.1  
1  10:

In [4]:
def identify_categorical_columns(data):
    """Identifies columns that are likely categorical.

    Args:
        data (pd.DataFrame): Input DataFrame.

    Returns:
        list: A list of column names considered categorical.
    """
    categorical_cols = data.select_dtypes(include='object').columns.tolist()
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        if data[col].nunique() < 30:  # Threshold for identifying potential categorical 
            categorical_cols.append(col)
    return categorical_cols


In [5]:
def preprocess_data(data):
    """Preprocesses data, including categorical feature encoding.

    Args:
        data (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: Preprocessed DataFrame.
    """
    # Identify categorical columns
    categorical_cols = identify_categorical_columns(data)

    # One-hot encoding
    data = pd.get_dummies(data, columns=categorical_cols)
    '''
    # Date/Time feature engineering (Example)
    data['month'] = pd.to_datetime(data['Date']).dt.month  # Extract month
    '''
    return data

# Example Usage
data = pd.read_csv(path)
data = preprocess_data(data)  # Data is now preprocessed 
data.head()

Unnamed: 0,Unit price,Tax 5%,Total,cogs,gross income,Rating,Invoice ID_101-17-6199,Invoice ID_101-81-4070,Invoice ID_102-06-2002,Invoice ID_102-77-2261,...,Quantity_2,Quantity_3,Quantity_4,Quantity_5,Quantity_6,Quantity_7,Quantity_8,Quantity_9,Quantity_10,gross margin percentage_4.761904762
0,74.69,26.1415,548.9715,522.83,26.1415,9.1,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
1,15.28,3.82,80.22,76.4,3.82,9.6,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,46.33,16.2155,340.5255,324.31,16.2155,7.4,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
3,58.22,23.288,489.048,465.76,23.288,8.4,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,86.31,30.2085,634.3785,604.17,30.2085,5.3,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True


In [None]:
def find_potential_targets(data):
    """Identifies columns that are likely candidates for the target variable.

    Args:
        data (pd.DataFrame): Input DataFrame (preprocessed).

    Returns:
        list: A list of potential target column names.
    """

    num_cols = data.select_dtypes(include='number').columns

    potential_targets = []
    for col in num_cols:
        # Heuristic 1: Spread of values
        if data[col].std() > 0.2:  # Threshold for meaningful standard deviation
            potential_targets.append(col)

        # Heuristic 2: Correlations with other features
        correlations = abs(data.corr()[col]) 
        num_moderate_correlations = (correlations > 0.3).sum() 
        if num_moderate_correlations >= 3:  
            potential_targets.append(col)           

    return list(set(potential_targets))  # Return unique targets

In [6]:
find_potential_targets(data)

NameError: name 'find_potential_targets' is not defined

In [None]:
def select_best_target(data, potential_targets):
    scores = {}
    for col in potential_targets:
        scores[col] = 0

        # Target: Continuous 
        if data[col].nunique() > 20 and 0.4 < data[col].std() < 1.5: 
            scores[col] += 2  

        # Target: Categorical (many levels)
        if data[col].dtype == 'object' and data[col].nunique() / len(data) > 0.3: 
            scores[col] += 1 

    # Secondary Heuristics (Refined)
    for col in potential_targets.copy():  
        if data[col].isna().sum() > 0.1 * len(data):  # Missing value threshold
            potential_targets.remove(col)

        if data[col].dtype == 'object' and 'known_related_feature' in data.columns:  
            for group in data['known_related_feature'].unique(): 
                contingency_table = pd.crosstab(data[group], data[col])
                chi2, p, _, _ = chi2_contingency(contingency_table)
                if p > 0.05:  
                    potential_targets.remove(col)
                    break  

    if potential_targets:
        best_target = max(scores, key=scores.get) 
        return best_target
    else:
        return None 

In [None]:
best_target = select_best_target(data, potential_targets)
best_target

'Unit price'

In [None]:
if best_target:
    selected_features = select_features_correlation(data, best_target)
    print("Target Column:", best_target)
    print("Selected Features:", selected_features)

    # ... (Proceed with your modeling using 'best_target' and 'selected_features') ...
else:
    print("Automated target selection unsure. Please review or use a fallback strategy.")

Target Column: Unit price
Selected Features: ['Unit price', 'Tax 5%', 'Total', 'cogs', 'gross income']


In [None]:
def determine_target_type(target_column):
    """Determines the type of a target column (continuous, categorical, etc.)"""

    if target_column.dtype == 'float' or target_column.dtype == 'int':
        # Potential Continuous Target
        if target_column.nunique() >= 20:  # Threshold for 'many' unique values
            return "continuous"
        else:
            return "categorical_few" 

    elif target_column.dtype == 'object':  # Likely Categorical
        if target_column.nunique() / len(target_column) <= 0.05:  #  Few unique values
            return "categorical_binary"  
        else:
            return "categorical_many"

    else:
        return "unsupported_type"  


In [None]:
def select_model(data, best_target):
    target_type = determine_target_type(data[best_target])  

    model_map = {
        "continuous": "linear_regression",  # Single model
        "categorical_binary": "logistic_regression", 
        "categorical_many": "xgboost" 
    }

    return model_map.get(target_type, "default_model")  


In [None]:
select_model(data, best_target)

['linear_regression', 'random_forest']

In [None]:
def create_model(model_type):
    if model_type == "linear_regression":
        return LinearRegression()
    elif model_type == "logistic_regression":
        return LogisticRegression()
    elif model_type == "decision_tree":
        return DecisionTreeClassifier()  # Assuming classification
    elif model_type == "xgboost":
        return XGBRegressor()  # Or XGBClassifier for classification
    else:
        raise ValueError("Unsupported model type")


In [None]:
def select_and_evaluate_models(data, target_column, cv=5):
    results = {} 
    best_params = {}  # Initialize here

    target_type = determine_target_type(data[best_target]) 
    selected_model = select_model(data, target_column)  
    
    model = create_model(selected_model) 
    param_grid = define_param_grid(selected_model)  

    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error')  
    grid_search.fit(data.drop(target_column, axis=1), data[target_column])

    results[selected_model] = grid_search.best_score_
    best_params[selected_model] = grid_search.best_params_ # Store here

    return results, best_params    

In [None]:
def define_param_grid(model_type):
    param_grid = {}
    if model_type == "linear_regression":
        param_grid = {"fit_intercept": [True, False]} 
    elif model_type == "random_forest":
        param_grid = {
            "n_estimators": [50, 100, 200], 
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10] 
        }
    elif model_type == "logistic_regression":
        param_grid = {
            "penalty": ['l1', 'l2'],
            "C": [0.01, 0.1, 1, 10], 
            "solver": ['liblinear', 'saga']
        }
    elif model_type == "xgboost":
        param_grid = {
            "learning_rate": [0.01, 0.1, 0.3],
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 8]
        } 
    return param_grid

In [None]:
target_type = determine_target_type(data[best_target])
print(target_type)

continuous


In [None]:
results, best_params = select_and_evaluate_models(data, best_target)
best_model_type = max(results, key=results.get)

In [None]:
print("Best Model:", best_model_type) 

Best Model: linear_regression


In [None]:
print("Best Parameters:", best_params[best_model_type])

Best Parameters: {'fit_intercept': False}


LOADING THE BEST MODEL

In [None]:
def load_best_model(best_model_type, best_params):
    if best_model_type == "linear_regression":
        return LinearRegression(fit_intercept=best_params['fit_intercept'])  
    elif best_model_type == "logistic_regression":
        return LogisticRegression(**best_params) # Potentially adjust parameters 
    elif best_model_type == "xgboost": 
        return XGBRegressor(**best_params)  # Or XGBClassifier, adjust parameters
    else:
        raise ValueError("Unsupported model type")


SAVING THE MODEL INFO AS A PICKLE

In [None]:
import pickle
with open('model_info.pkl', 'wb') as f:
        pickle.dump({'model_type': best_model_type, 'params': best_params}, f)

LOADING MODEL INFORMATION

In [None]:
with open('model_info.pkl', 'rb') as f:
        model_info = pickle.load(f)
best_model_type = model_info['model_type']
best_params = model_info['params']

In [None]:
best_model = load_best_model(best_model_type, best_params)

KeyError: 'fit_intercept'

In [None]:

predictions = predict_on_new_data(best_model, new_data, best_target) 
print("Predictions:", predictions)