In [206]:
# NOTE: This notebook was used to train and save LR.pkl for Streamlit app deployment.

In [207]:
import pickle
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


In [208]:
# Load the customer dataset. The file name ends in .xls, but it is parsed
# here with read_csv (delimited text), not as an Excel workbook.
customer = pd.read_csv('Ecommerce_LinReg.xls')
# Display the column labels so the feature and target names can be checked.
customer.columns

Index(['Email', 'Address', 'Avatar', 'Avg. Session Length', 'Time on App',
       'Time on Website', 'Length of Membership', 'Yearly Amount Spent'],
      dtype='object')

In [209]:
# Load the previously saved model object from LR.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open("LR.pkl", "rb") as f:
    model = pickle.load(f)

# Column names stored on the loaded model; used to see which input columns
# the model expects.
expected_features = model.feature_names_in_
expected_features

array(['Avg. Session Length', 'Time on App', 'Time on Website',
       'Length of Membership'], dtype=object)

In [210]:
# Name of the column to predict; it is removed from the features and read
# back as the target in the cells that follow.
target_column = 'Yearly Amount Spent'
target_column

'Yearly Amount Spent'

In [211]:
# Feature frame: every column of `customer` except the target.
# drop() returns a new frame, so `customer` itself is left unchanged.
x = customer.drop(columns = [target_column] )
x

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308
...,...,...,...,...,...,...,...
495,lewisjessica@craig-evans.com,"4483 Jones Motorway Suite 872\nLake Jamiefurt,...",Tan,33.237660,13.566160,36.417985,3.746573
496,katrina56@gmail.com,"172 Owen Divide Suite 497\nWest Richard, CA 19320",PaleVioletRed,34.702529,11.695736,37.190268,3.576526
497,dale88@hotmail.com,"0787 Andrews Ranch Apt. 633\nSouth Chadburgh, ...",Cornsilk,32.646777,11.499409,38.332576,4.958264
498,cwilson@hotmail.com,"680 Jennifer Lodge Apt. 808\nBrendachester, TX...",Teal,33.322501,12.391423,36.840086,2.336485


In [212]:
# Remove the text columns so only the numeric features remain.
# errors='ignore' skips labels that are already absent, which keeps this
# cell safe to run more than once.
x = x.drop(columns=['Email','Address','Avatar'], errors = 'ignore')
x

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
0,34.497268,12.655651,39.577668,4.082621
1,31.926272,11.109461,37.268959,2.664034
2,33.000915,11.330278,37.110597,4.104543
3,34.305557,13.717514,36.721283,3.120179
4,33.330673,12.795189,37.536653,4.446308
...,...,...,...,...
495,33.237660,13.566160,36.417985,3.746573
496,34.702529,11.695736,37.190268,3.576526
497,32.646777,11.499409,38.332576,4.958264
498,33.322501,12.391423,36.840086,2.336485


In [213]:
# Target values: the `target_column` column of the original dataset.
y = customer[target_column]
y

0      587.951054
1      392.204933
2      487.547505
3      581.852344
4      599.406092
          ...    
495    573.847438
496    529.049004
497    551.620145
498    456.469510
499    497.778642
Name: Yearly Amount Spent, Length: 500, dtype: float64

In [214]:
# Take the first row of the feature frame. The list index [[0]] keeps the
# result a one-row DataFrame (with column names) rather than a Series.
test_sample = x.iloc[[0]]
test_sample

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
0,34.497268,12.655651,39.577668,4.082621


In [215]:
def load_model_predict(input_data, file_name='LR.pkl'):
    """Load a pickled model from disk and predict on ``input_data``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain every column named in the model's ``feature_names_in_``.
        Extra columns are ignored, and the columns are passed to the model in
        the order the model lists them.
    file_name : str, default 'LR.pkl'
        Path of the pickle file holding the model.

    Returns
    -------
    The value returned by the model's ``predict`` call, or ``None`` when the
    file is missing, cannot be unpickled, a required column is absent, or the
    prediction itself fails. A message is printed in each failure case.
    """
    # --- Load the model -----------------------------------------------------
    try:
        # NOTE(review): pickle.load can execute arbitrary code while
        # deserialising; only point file_name at trusted files.
        with open(file_name, 'rb') as f:
            loaded_model = pickle.load(f)
    except FileNotFoundError:
        print(f'❌ File {file_name} you are looking for is not found')
        return None
    except Exception as e:
        print(f' ⚠️ The file is not loaded properly and hence an error occurred:{e}')
        return None

    # --- Align features and predict ------------------------------------------
    try:
        expected = list(loaded_model.feature_names_in_)

        # Name the missing columns explicitly instead of surfacing a bare KeyError.
        missing = [col for col in expected if col not in input_data.columns]
        if missing:
            print(f' ⚠️ Input data is missing required feature column(s): {missing}')
            return None

        # Select (and order) exactly the columns the model expects.
        prediction = loaded_model.predict(input_data[expected])
        return prediction
    except Exception as e:
        print(f' ⚠️ Prediction failed and hence an error occurred:{e}')
        return None

In [216]:
# Predict for the single-row sample.
# NOTE(review): load_model_predict returns None on failure, in which case
# result[0] below raises TypeError instead of showing a prediction.
result = load_model_predict(test_sample)
print(f"Predicted Yearly Amount Spent: ${result[0]:,.2f}")

Predicted Yearly Amount Spent: $594.20


In [217]:
# Save the one-row sample to disk; index=False leaves the row index out of the file.
test_sample.to_csv("test_sample.csv", index=False)

In [218]:
# Read the file back and show its first rows to confirm what was written.
pd.read_csv('test_sample.csv' ).head()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
0,34.497268,12.655651,39.577668,4.082621


In [219]:
# Reload the same sample
saved_sample = pd.read_csv("test_sample.csv")

# Predict again using your function
result2 = load_model_predict(saved_sample)
# Use the same money format (thousands separator, 2 decimals) as the first
# prediction printout so the two lines can be compared directly.
print(f"Predicted Yearly Amount Spent (reloaded): ${result2[0]:,.2f}")

Predicted Yearly Amount Spent (reloaded): $594.20


In [220]:
# Pick 5 random customers from your dataset. (Batch Prediction)
# Sample from the feature frame `x` built above: the name `df` is never
# defined in this notebook, so sampling from it fails on a fresh kernel.
# random_state pins the draw so the same 5 rows are picked on every run.
test_samples1 = x.sample(5, random_state =33)
test_samples1

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
153,31.945396,12.965761,36.966389,6.076654
483,33.606851,12.214074,37.198428,2.905238
27,32.33599,13.007819,37.851779,2.996365
50,33.256335,13.858062,37.780265,5.976768
442,34.083663,8.66835,35.906756,2.252446


In [245]:
# Batch Prediction
result3 = load_model_predict(test_samples1)
# Keep an independent copy of the inputs. A plain `batch_input = test_samples1`
# is only a second name for the same frame, so the 'Pred_results' column added
# below would also appear in the saved input file. drop() returns a new frame,
# and errors='ignore' makes this safe when the cell is re-run after the
# column already exists.
batch_input = test_samples1.drop(columns=['Pred_results'], errors='ignore')
# Attach the predictions next to the rows they were computed from.
test_samples1['Pred_results'] = result3
print(test_samples1)

     Avg. Session Length  Time on App  Time on Website  Length of Membership  \
153            31.945396    12.965761        36.966389              6.076654   
483            33.606851    12.214074        37.198428              2.905238   
27             32.335990    13.007819        37.851779              2.996365   
50             33.256335    13.858062        37.780265              5.976768   
442            34.083663     8.668350        35.906756              2.252446   

     Pred_results  
153    663.359750  
483    481.004186  
27     484.870137  
50     725.549590  
442    315.392617  


In [249]:
## Save the Batch_input and Batch_Predictions_sample
# The input file must hold features only: strip the prediction column in case
# `batch_input` still shares its data with `test_samples1`.
batch_input.drop(columns=['Pred_results'], errors='ignore').to_csv('Batch_Input.csv', index=False)
print('✅Saved to Batch_Input.csv')
test_samples1.to_csv('Batch_predictions.csv', index=False)
# Report the exact file name written above (case matters on some filesystems).
print('✅Saved to Batch_predictions.csv')

✅Saved to Batch_Input.csv
✅Saved to Batch_Predictions.csv


In [223]:
## Evaluate the model on the full dataset: MSE, MAE and R^2
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score
# Features come from `x`, the numeric feature frame built earlier in the
# notebook; the name `df` used here before is not defined anywhere.
# NOTE(review): every row of the dataset is scored, including any rows the
# model was fitted on - confirm whether a held-out split is intended.
y = customer["Yearly Amount Spent"]
predicted_values  = load_model_predict(x)
# scikit-learn metrics take (y_true, y_pred). MSE and MAE give the same value
# either way, but R^2 does not, so the true values must come first.
print(f'MSE ={mean_squared_error( y , predicted_values ): .2f}')
print(f'MAE ={mean_absolute_error( y , predicted_values ): .2f}')
print(f'R^2 ={r2_score( y , predicted_values ): .2f}')

MSE = 98.58
MAE = 7.89
R^2 = 0.98
