In [10]:
# train_models.py

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd



In [11]:

# Load the preprocessed data.
# invoice_num is an identifier column that mixes purely numeric values with
# alphanumeric ones (e.g. 'Z003000'), so it is read as text up front instead of
# leaving the type to the parser's inference.
data = pd.read_csv('../../data/processed_data.csv', dtype={'invoice_num': str})

In [12]:
# Inspect the column names available in the loaded frame
print(data.columns)

Index(['item_code', 'invoice_num', 'invoice_time', 'item_qty',
       'item_sub_segment', 'item_segment', 'item_category',
       'item_sub_department', 'item_department', 'Primary Key'],
      dtype='object')


In [13]:
# Preview the first rows of the frame
data.head()

Unnamed: 0,item_code,invoice_num,invoice_time,item_qty,item_sub_segment,item_segment,item_category,item_sub_department,item_department,Primary Key
0,1080988,4989509,2022-03-11 09:37:02+00:00,1.0,Packets,Hen Eggs,Eggs,Eggs,Grocery,1080988_2022-03-11_9
1,1080988,2125081,2022-03-11 18:56:29+00:00,1.0,Packets,Hen Eggs,Eggs,Eggs,Grocery,1080988_2022-03-11_18
2,1080988,9370252,2022-03-11 18:21:58+00:00,1.0,Packets,Hen Eggs,Eggs,Eggs,Grocery,1080988_2022-03-11_18
3,1080988,Z003000,2022-03-11 13:41:34+00:00,4.0,Packets,Hen Eggs,Eggs,Eggs,Grocery,1080988_2022-03-11_13
4,1080988,5830278,2022-03-11 19:11:50+00:00,1.0,Packets,Hen Eggs,Eggs,Eggs,Grocery,1080988_2022-03-11_19


In [14]:
# Peek at the item_code column to check its values and dtype
data['item_code'].head()

0    1080988
1    1080988
2    1080988
3    1080988
4    1080988
Name: item_code, dtype: int64

In [15]:
# Try to parse every column as numeric; entries that cannot be parsed (and any
# values that were already missing) become NaN. Count those NaNs per column.
non_numeric_values = (
    data.apply(pd.to_numeric, errors='coerce')
    .isna()
    .sum()
)
print(non_numeric_values)


item_code                   0
invoice_num             11302
invoice_time           352751
item_qty                    0
item_sub_segment       352751
item_segment           352751
item_category          352751
item_sub_department    352751
item_department        352751
Primary Key            352751
dtype: int64


In [16]:
# Encode each categorical column with its own LabelEncoder and keep the fitted
# encoders in `label_encoders`, so the integer codes can be mapped back to the
# original labels later. A single shared encoder is re-fitted on every column
# and therefore only remembers the last one.
categorical_columns = ['item_sub_segment', 'item_segment', 'item_category', 'item_sub_department', 'item_department','invoice_num']
label_encoders = {}
for column in categorical_columns:
    # `label_encoder` stays bound to the most recently fitted encoder, as before.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Preprocess the 'invoice_time' column.
# utc=True converts every timestamp to UTC so the result is always one tz-aware
# datetime column (and `.dt` works) even if the file mixes UTC offsets.
data['invoice_time'] = pd.to_datetime(data['invoice_time'], utc=True)

# Calendar features derived from the parsed timestamp
data['year'] = data['invoice_time'].dt.year
data['month'] = data['invoice_time'].dt.month
data['day'] = data['invoice_time'].dt.day
data['hour'] = data['invoice_time'].dt.hour
data['day_of_week'] = data['invoice_time'].dt.dayofweek  # 0 = Monday, 6 = Sunday


In [17]:
# Encode item_code as integer labels with a fresh encoder
# (this rebinds `label_encoder` to the item_code mapping).
label_encoder = LabelEncoder()
item_code_labels = label_encoder.fit_transform(data['item_code'])
data['item_code'] = item_code_labels

# Take a smaller subset of the data, for example, the first 5000 rows
# NOTE(review): these are the leading rows of the file, not a random sample —
# confirm that this is the intended subset.
data = data.head(5000)

def split_data(data, target_column, test_size=0.2, random_state=42):
    """Split a frame into train/test features and train/test targets.

    Every column except ``target_column`` is used as a feature.

    Args:
        data: DataFrame holding both the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    predictors = data.drop(columns=target_column)
    target = data[target_column]
    train_X, test_X, train_y, test_y = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return train_X, test_X, train_y, test_y

# Define the target variable and the calendar features used as predictors
target_variable = 'item_qty'
features = ['year', 'month', 'day', 'hour', 'day_of_week']  # Include other features as needed

X = data[features]
y = data[target_variable]

# Split through the split_data helper instead of repeating its logic inline.
# The helper treats every non-target column as a feature, so the frame is first
# narrowed to the modelling columns; this yields the same split as calling
# train_test_split(X, y, test_size=0.2, random_state=42) directly.
X_train, X_test, y_train, y_test = split_data(
    data[features + [target_variable]],
    target_variable,
    test_size=0.2,
    random_state=42,
)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)


In [18]:
# Evaluate the model: mean squared error of the predictions on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.2832420019771496


In [19]:
# Show the fitted linear model's parameters.
# NOTE(review): the coefficients presumably follow the column order of the
# training features — verify against the `features` list before interpreting.
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)


Intercept: 1.1457590405500124
Coefficients: [ 0.         -0.00775383 -0.00079061 -0.00137423 -0.0139252 ]
