In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [3]:
data = pd.read_csv('/mnt/home/kodumuru/ITM891_Parimal/ITM891_PROJECT/SAP/DL/GData.csv')


In [4]:
#show me the null values
print(data.isnull().sum())


X_CASE_KEY                            0
EBELN                                 0
EBELP                                 0
createtime                            0
firstreceivetime                      0
changeconfirmeddeliverydate           0
changecontract                        0
changecurrency                        0
changedeliveryindicator               0
changefinalinvoiceindicator           0
changeoutwarddeliveryindicator        0
changeprice                           0
changequantity                        0
changerequesteddeliverydate           0
changestoragelocation                 0
numdelivery                           0
GDdays                                4
BUKRS                                 0
MATKL                                 0
MATNR                             56390
NETPR                                 0
PSTYP                                 0
WERKS                                 0
ERNAM                             39073
dtype: int64


In [5]:
#drop the null value column of MATNR
data = data.dropna(subset=['MATNR'])


In [6]:
#fill ERNAM with nan
data['ERNAM'] = data['ERNAM'].fillna('Unknown')


In [7]:
#show me the columns with null values
print(data.isnull().sum())



X_CASE_KEY                        0
EBELN                             0
EBELP                             0
createtime                        0
firstreceivetime                  0
changeconfirmeddeliverydate       0
changecontract                    0
changecurrency                    0
changedeliveryindicator           0
changefinalinvoiceindicator       0
changeoutwarddeliveryindicator    0
changeprice                       0
changequantity                    0
changerequesteddeliverydate       0
changestoragelocation             0
numdelivery                       0
GDdays                            4
BUKRS                             0
MATKL                             0
MATNR                             0
NETPR                             0
PSTYP                             0
WERKS                             0
ERNAM                             0
dtype: int64


In [8]:
#fill GDdays missing values with mean
data['GDdays'] = data['GDdays'].fillna(data['GDdays'].mean())


In [9]:
#show me the null values
print(data.isnull().sum())

X_CASE_KEY                        0
EBELN                             0
EBELP                             0
createtime                        0
firstreceivetime                  0
changeconfirmeddeliverydate       0
changecontract                    0
changecurrency                    0
changedeliveryindicator           0
changefinalinvoiceindicator       0
changeoutwarddeliveryindicator    0
changeprice                       0
changequantity                    0
changerequesteddeliverydate       0
changestoragelocation             0
numdelivery                       0
GDdays                            0
BUKRS                             0
MATKL                             0
MATNR                             0
NETPR                             0
PSTYP                             0
WERKS                             0
ERNAM                             0
dtype: int64


# Step 3: Convert Date-Time Columns

In [10]:
date_columns = ['createtime', 'firstreceivetime', 'changeconfirmeddeliverydate', 'changerequesteddeliverydate']
for col in date_columns:
    data[col] = pd.to_datetime(data[col])
    data[col] = data[col].astype(int) / 10**9  # convert to seconds since epoch


In [11]:
# Setup transformers for numerical and categorical data
numeric_features = ['EBELP', 'numdelivery', 'NETPR'] + date_columns  # add other numeric columns as necessary
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['EBELN', 'changecontract', 'changecurrency', 'changedeliveryindicator', 
                        'changefinalinvoiceindicator', 'changeoutwarddeliveryindicator', 'changeprice', 
                        'changequantity', 'changestoragelocation', 'BUKRS', 'MATKL', 'MATNR', 'PSTYP', 'WERKS', 'ERNAM']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [12]:
# Define features and target
X = data.drop('GDdays', axis=1)  # assuming 'GDdays' is the target
y = data['GDdays']

# Apply the preprocessing pipeline to the features
X_preprocessed = preprocessor.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.utils import to_categorical



# Assuming 'GDdays' is your target variable
features = ['EBELN', 'EBELP', 'createtime', 'firstreceivetime', 'changeconfirmeddeliverydate', 
            'changecontract', 'changecurrency', 'changedeliveryindicator', 'changefinalinvoiceindicator', 
            'changeoutwarddeliveryindicator', 'changeprice', 'changequantity', 'changerequesteddeliverydate', 
            'changestoragelocation', 'numdelivery', 'BUKRS', 'MATKL', 'MATNR', 'NETPR', 'PSTYP', 'WERKS', 'ERNAM']
target = 'GDdays'

# Convert date-time columns to numerical by extracting UNIX timestamp
date_cols = ['createtime', 'firstreceivetime', 'changeconfirmeddeliverydate', 'changerequesteddeliverydate']
for col in date_cols:
    data[col] = pd.to_datetime(data[col]).astype(int) / 10**9

# Preprocessing for numerical data
numeric_features = ['EBELP', 'NETPR', 'numdelivery'] + date_cols
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_features = ['EBELN', 'changecontract', 'changecurrency', 'changedeliveryindicator', 
                        'changefinalinvoiceindicator', 'changeoutwarddeliveryindicator', 'changeprice', 
                        'changequantity', 'changestoragelocation', 'BUKRS', 'MATKL', 'MATNR', 'PSTYP', 'WERKS', 'ERNAM']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split data into train and test sets
X = data[features]
y = data[target].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply transformations
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Ensure input is 3D for LSTM
X_train = X_train.toarray().reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.toarray().reshape((X_test.shape[0], 1, X_test.shape[1]))


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  Xt = transform.transform(Xt)


# Step 2: Building and Training the LSTM Model

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(30, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)


Train on 16017 samples, validate on 4005 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
import shap

# It's good practice to use a background dataset to approximate the expected value
background = X_train[np.random.choice(X_train.shape[0], 100, replace=False)]

# SHAP Deep Explainer
explainer = shap.DeepExplainer(model, background)
shap_values = explainer.shap_values(X_test[:10])  # Explain a subset of predictions

# Plot the SHAP values for the first prediction
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features=X_test[0], feature_names=features)



AttributeError: module 'numba' has no attribute 'core'