# ARIMA

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
data_path = "/content/df_PREPROCESSED.csv"
df = pd.read_csv(data_path)



In [None]:
df.columns

Index(['Unnamed: 0', 'ID', 'date', 'Item Id', 'Item Name', 'ad_spend',
       'anarix_id', 'units', 'unit_price', 'orderedrevenueamount', 'ROAS',
       'day_of_week', 'week', 'month', 'quarter', 'cost_category'],
      dtype='object')

In [None]:
item_name_imputer = SimpleImputer(strategy='most_frequent')
df['Item Name'] = item_name_imputer.fit_transform(df['Item Name'].values.reshape(-1, 1)).flatten()
ad_spend_imputer = SimpleImputer(strategy='median')
df['ad_spend'] = ad_spend_imputer.fit_transform(df['ad_spend'].values.reshape(-1, 1)).flatten()

knn_imputer = KNNImputer(n_neighbors=5)
df[['ad_spend', 'unit_price']] = knn_imputer.fit_transform(df[['ad_spend', 'unit_price']]) # KNNImputer handles multiple columns correctly

In [None]:

numeric_cols = ['ad_spend', 'unit_price',  'units']
df[numeric_cols] = df[numeric_cols].astype(float)



In [None]:
df.columns

Index(['Unnamed: 0', 'ID', 'date', 'Item Id', 'Item Name', 'ad_spend',
       'anarix_id', 'units', 'unit_price', 'orderedrevenueamount', 'ROAS',
       'day_of_week', 'week', 'month', 'quarter', 'cost_category'],
      dtype='object')

In [None]:

train_size = int(0.8 * len(df))
train, test = df.iloc[:train_size], df.iloc[train_size:]


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings("ignore")
p = 1
d = 1
q = 1


arima_model = SARIMAX(train['units'], order=(p, d, q))
arima_result = arima_model.fit()


print(arima_result.summary())


                               SARIMAX Results                                
Dep. Variable:                  units   No. Observations:                80373
Model:               SARIMAX(1, 1, 1)   Log Likelihood             -452814.345
Date:                Fri, 02 Aug 2024   AIC                         905634.690
Time:                        15:27:14   BIC                         905662.573
Sample:                             0   HQIC                        905643.236
                              - 80373                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0880      0.000    213.068      0.000       0.087       0.089
ma.L1         -0.9962      0.000  -5379.225      0.000      -0.997      -0.996
sigma2      4582.6631      0.779   5879.535      0.0

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


start = len(train)
end = len(train) + len(test) - 1
predictions = arima_result.predict(start=start, end=end, dynamic=False)

mse = mean_squared_error(test['units'], predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test['units'], predictions)
r2 = r2_score(test['units'], predictions)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R2 Score: {r2}')


Root Mean Squared Error (RMSE): 35.32235463277331
Mean Squared Error (MSE): 1247.668736803402
Mean Absolute Error (MAE): 9.521394898266374
R2 Score: -0.009068256063791802


In [None]:

test_data_path = "/content/test.csv"
test_df = pd.read_csv(test_data_path)


test_df['Item Name'] = item_name_imputer.transform(test_df[['Item Name']].values.reshape(-1, 1)).flatten()
test_df['ad_spend'] = ad_spend_imputer.transform(test_df[['ad_spend']].values.reshape(-1, 1)).flatten()
test_df[['ad_spend', 'unit_price']] = knn_imputer.transform(test_df[['ad_spend', 'unit_price']])





test_df = test_df.sort_values('date')
test_df.set_index('date', inplace=True)

future_predictions = arima_result.predict(start=len(df), end=len(df) + len(test_df) - 1)
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'TARGET': future_predictions.values
})

submission.to_csv('submission_arima.csv', index=False)


# LSTM & GRU

## LSTM

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

data_path = "/content/df_PREPROCESSED.csv"
df = pd.read_csv(data_path)

item_name_imputer = SimpleImputer(strategy='most_frequent')
df['Item Name'] = item_name_imputer.fit_transform(df[['Item Name']].values.reshape(-1, 1)).flatten()

ad_spend_imputer = SimpleImputer(strategy='median')
df['ad_spend'] = ad_spend_imputer.fit_transform(df[['ad_spend']].values.reshape(-1, 1)).flatten()

knn_imputer = KNNImputer(n_neighbors=5)
df[['ad_spend', 'unit_price']] = knn_imputer.fit_transform(df[['ad_spend', 'unit_price']])

df['ad_spend_per_unit'] = df['ad_spend'] / df['units']
df['revenue_per_unit'] = df['orderedrevenueamount'] / df['units']

numeric_cols = ['ad_spend', 'unit_price', 'orderedrevenueamount', 'ROAS', 'ad_spend_per_unit', 'revenue_per_unit', 'units']
df[numeric_cols] = df[numeric_cols].astype(float)

df = df.sort_values('date')
df.set_index('date', inplace=True)

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)


In [None]:
from sklearn.preprocessing import MinMaxScaler
features = df[['ad_spend', 'unit_price']].values
target = df['units'].values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(features)
def create_sequences(data, target, seq_length):
    X = []
    y = []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(target[i + seq_length])
    return np.array(X), np.array(y)
seq_length = 10
X, y = create_sequences(scaled_features, target, seq_length)
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)


Epoch 1/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - loss: 8332.7764 - val_loss: 10590.3623
Epoch 2/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 11673.6777 - val_loss: 10594.5020
Epoch 3/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - loss: 6922.0645 - val_loss: 10589.5254
Epoch 4/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 7127.6045 - val_loss: 10590.5361
Epoch 5/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 12321.7002 - val_loss: 10598.5967
Epoch 6/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - loss: 12066.7363 - val_loss: 10589.4316
Epoch 7/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14ms/step - loss: 10219.9150 - val_loss: 10597.1387
Epoch 8/20
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m

In [None]:

loss = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on test data: {loss}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2684.2102
Mean Squared Error on test data: 2003.3486328125


In [None]:

predictions = model.predict(X_test)

predictions = scaler.inverse_transform(np.concatenate((predictions, np.zeros((predictions.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]
y_test_inverse = scaler.inverse_transform(np.concatenate((y_test.reshape(-1, 1), np.zeros((y_test.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_inverse, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, predictions)
r2 = r2_score(y_test_inverse, predictions)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R2 Score: {r2}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Root Mean Squared Error (RMSE): 2145511.414096443
Mean Squared Error (MSE): 4603219228018.119
Mean Absolute Error (MAE): 897936.2242565965
R2 Score: -0.0026469109638107557


## Bi LSTM

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np
data_path = "/content/df_PREPROCESSED.csv"
df = pd.read_csv(data_path)


item_name_imputer = SimpleImputer(strategy='most_frequent')
df['Item Name'] = item_name_imputer.fit_transform(df[['Item Name']].values.reshape(-1, 1)).flatten()
ad_spend_imputer = SimpleImputer(strategy='median')
df['ad_spend'] = ad_spend_imputer.fit_transform(df[['ad_spend']].values.reshape(-1, 1)).flatten()
knn_imputer = KNNImputer(n_neighbors=5)
df[['ad_spend', 'unit_price']] = knn_imputer.fit_transform(df[['ad_spend', 'unit_price']])


numeric_cols = ['ad_spend', 'unit_price', 'orderedrevenueamount', 'ROAS']
df[numeric_cols] = df[numeric_cols].astype(float)
df = df.sort_values('date')
df.set_index('date', inplace=True)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np


features = df[['ad_spend', 'unit_price']].values
target = df['units'].values


scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(features)


def create_sequences(data, target, seq_length):
    X = []
    y = []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(target[i + seq_length])
    return np.array(X), np.array(y)

seq_length = 10
X, y = create_sequences(scaled_features, target, seq_length)


train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional


model = Sequential()
model.add(Bidirectional(LSTM(units=50, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50)))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')


history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)


Epoch 1/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 24ms/step - loss: 7289.6768 - val_loss: 8444.4609
Epoch 2/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 23ms/step - loss: 2962.8770 - val_loss: 8396.1777
Epoch 3/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 25ms/step - loss: 5744.2231 - val_loss: 8455.6191
Epoch 4/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 23ms/step - loss: 3391.2961 - val_loss: 8429.2939
Epoch 5/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 23ms/step - loss: 4821.7031 - val_loss: 8437.2686
Epoch 6/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 23ms/step - loss: 7344.6113 - val_loss: 8436.6689
Epoch 7/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - loss: 5074.8281 - val_loss: 8444.2246
Epoch 8/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step 

In [None]:

loss = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on test data: {loss}')


predictions = model.predict(X_test)


predictions = scaler.inverse_transform(np.concatenate((predictions, np.zeros((predictions.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]
y_test_inverse = scaler.inverse_transform(np.concatenate((y_test.reshape(-1, 1), np.zeros((y_test.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_inverse, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, predictions)
r2 = r2_score(y_test_inverse, predictions)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R2 Score: {r2}')


[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 1669.5763
Mean Squared Error on test data: 1576.6368408203125
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
Root Mean Squared Error (RMSE): 1903348.2061188722
Mean Squared Error (MSE): 3622734393735.9287
Mean Absolute Error (MAE): 604193.8949234871
R2 Score: -0.001642189040704789


## Bi GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Bidirectional


model = Sequential()
model.add(Bidirectional(GRU(units=50, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(GRU(units=50)))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')


history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)


Epoch 1/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 29ms/step - loss: 6653.3125 - val_loss: 8437.5811
Epoch 2/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 27ms/step - loss: 4157.7788 - val_loss: 8446.1152
Epoch 3/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 26ms/step - loss: 4667.1235 - val_loss: 8433.4326
Epoch 4/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 27ms/step - loss: 5459.6128 - val_loss: 8437.9717
Epoch 5/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 27ms/step - loss: 6985.6895 - val_loss: 8435.7949
Epoch 6/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 27ms/step - loss: 5649.3086 - val_loss: 8447.9102
Epoch 7/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 29ms/step - loss: 4295.7715 - val_loss: 8446.3721
Epoch 8/20
[1m1554/1554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 28ms/step 

In [None]:

loss = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on test data: {loss}')


predictions = model.predict(X_test)


predictions = scaler.inverse_transform(np.concatenate((predictions, np.zeros((predictions.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]
y_test_inverse = scaler.inverse_transform(np.concatenate((y_test.reshape(-1, 1), np.zeros((y_test.shape[0], scaled_features.shape[1] - 1))), axis=1))[:, 0]


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_inverse, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, predictions)
r2 = r2_score(y_test_inverse, predictions)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R2 Score: {r2}')



[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 1671.4817
Mean Squared Error on test data: 1579.9788818359375
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step
Root Mean Squared Error (RMSE): 1905364.41845312
Mean Squared Error (MSE): 3630413567107.1963
Mean Absolute Error (MAE): 597320.3522312896
R2 Score: -0.003765387484101268


# Hyper Parameter

In [None]:
pip install keras_tuner


Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras_tuner import Hyperband


In [None]:
import pandas as pd

data_path = "/content/df_PREPROCESSED.csv"
try:
    df = pd.read_csv(data_path, on_bad_lines='warn')  # Warn on problematic lines
except pd.errors.ParserError as e:
    print(f"Error occurred: {e}")
    problematic_row_index = int(str(e).split('row ')[-1])  # Extract row number from error message
    with open(data_path, 'r') as file:
        for i, line in enumerate(file):
            if i == problematic_row_index:
                print(f"Problematic row ({i}): {line}")
                break

In [None]:



numeric_cols = ['ad_spend', 'unit_price', 'units']
df[numeric_cols] = df[numeric_cols].astype(float)

# Normalize
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[numeric_cols])

def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


In [25]:
def build_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=256, step=32),
                   input_shape=(X_train.shape[1], X_train.shape[2]),
                   return_sequences=True))
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=256, step=32)))
    model.add(Dense(1))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                  loss='mean_squared_error')
    return model

tuner = Hyperband(build_model,
                  objective='val_loss',
                  max_epochs=10,
                  hyperband_iterations=2,
                  directory='my_dir',
                  project_name='helloworld')

tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
best_model = tuner.get_best_models(num_models=1)[0]

mse = best_model.evaluate(X_test, y_test)
print(f'Mean Squared Error: {mse}')


Trial 42 Complete [00h 04m 25s]
val_loss: 7.490970892831683e-05

Best val_loss So Far: 7.431471021845937e-05
Total elapsed time: 04h 46m 43s

Search: Running Trial #43

Value             |Best Value So Far |Hyperparameter
256               |224               |units
0.1               |0.1               |dropout_rate
0.0003254         |0.00010393        |learning_rate
4                 |10                |tuner/epochs
2                 |4                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
1                 |2                 |tuner/round
0035              |0013              |tuner/trial_id

Epoch 3/4
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 109ms/step - loss: 2.8435e-05 - val_loss: 7.4748e-05
Epoch 4/4
[1m 527/2010[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m2:10[0m 88ms/step - loss: 1.8659e-04

KeyboardInterrupt: 

In [None]:
def build_model(hp):
    model = Sequential()
    units = hp.Int('units', min_value=32, max_value=128, step=32) # Reduced max value
    model.add(LSTM(units=units,
                   input_shape=(X_train.shape[1], X_train.shape[2]),
                   return_sequences=True))
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.3, step=0.1) # Reduced max value
    model.add(Dropout(rate=dropout_rate))
    model.add(LSTM(units=units))
    model.add(Dense(1))

    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-3, sampling='LOG') # Reduced max value
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    print("Chosen hyperparameters:")
    print(f"Units: {units}, Dropout Rate: {dropout_rate}, Learning Rate: {learning_rate}")

    return model
tuner = Hyperband(build_model,
                  objective='val_loss',
                  max_epochs=10,
                  hyperband_iterations=2,
                  directory='my_dir',
                  project_name='helloworld')

tuner.search(X_train, y_train, epochs=10, validation_split=0.2)


# Py Spark Stacking

In [1]:
pip install pyspark



In [52]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StringType
from pyspark.sql.functions import to_date

spark = SparkSession.builder \
    .appName("StackingModel") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()


data_path = "/content/df.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)


df = df.withColumn("units", when(col("units") < 0, lit(0)).otherwise(col("units")))
df = df.withColumn("date", to_date(col("date")))  # Assuming 'date' is in a format that can be parsed to date
df = df.withColumn("date", col("date").cast(StringType()))

numeric_columns = ["ad_spend", "unit_price", "units"]
for column in numeric_columns:
    df = df.withColumn(column, col(column).cast("double"))


train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# ID|      date|   Item Id|           Item Name|ad_spend|anarix_id|unit_price
categorical_features = ['ID','date','Item Id', 'Item Name', 'anarix_id']
numeric_features = ['ad_spend', 'unit_price']



indexers = [StringIndexer(inputCol=feature, outputCol=feature + "_index", handleInvalid='keep') for feature in categorical_features]
encoders = [OneHotEncoder(inputCol=feature + "_index", outputCol=feature + "_ohe") for feature in categorical_features]


assembler = VectorAssembler(
    inputCols=numeric_features + [feature + "_ohe" for feature in categorical_features],
    outputCol="features",
    handleInvalid="skip"
)


rf_regressor = RandomForestRegressor(featuresCol="features", labelCol="units")
gb_regressor = GBTRegressor(featuresCol="features", labelCol="units")
dt_regressor = DecisionTreeRegressor(featuresCol="features", labelCol="units")


pipeline_rf = Pipeline(stages=indexers + encoders + [assembler, rf_regressor])
pipeline_gb = Pipeline(stages=indexers + encoders + [assembler, gb_regressor])
pipeline_dt = Pipeline(stages=indexers + encoders + [assembler, dt_regressor])

print(f"Rows in test_data beofre model : {test_data.count()}")


rf_model = pipeline_rf.fit(train_data)
gb_model = pipeline_gb.fit(train_data)
dt_model = pipeline_dt.fit(train_data)


rf_predictions = rf_model.transform(test_data).select("features", "prediction", "units").withColumnRenamed("prediction", "rf_prediction")
gb_predictions = gb_model.transform(test_data).select("features", "prediction", "units").withColumnRenamed("prediction", "gb_prediction")
dt_predictions = dt_model.transform(test_data).select("features", "prediction", "units").withColumnRenamed("prediction", "dt_prediction")


meta_features = rf_predictions.join(gb_predictions, on=["features", "units"], how="inner")
meta_features = meta_features.join(dt_predictions, on=["features", "units"], how="inner")
meta_features = meta_features.select("features", "rf_prediction", "gb_prediction", "dt_prediction", "units")


assembler_meta = VectorAssembler(inputCols=["rf_prediction", "gb_prediction", "dt_prediction"], outputCol="meta_features")
meta_features_assembled = assembler_meta.transform(meta_features)


meta_features_assembled = meta_features_assembled.withColumn("units", when(col("units") < 0, lit(0)).otherwise(col("units")))



Rows in test_data: 1184


In [54]:
print("RandomForestRegressor details:")
print(rf_model.stages[-1])  # RandomForestRegressor details

print("GBTRegressor details:")
print(gb_model.stages[-1])  # GBTRegressor details

print("DecisionTreeRegressor details:")
print(dt_model.stages[-1])  # DecisionTreeRegressor details


RandomForestRegressor details:
RandomForestRegressionModel: uid=RandomForestRegressor_bbcd5c5c20d2, numTrees=20, numFeatures=5278
GBTRegressor details:
GBTRegressionModel: uid=GBTRegressor_2301eff558be, numTrees=20, numFeatures=5278
DecisionTreeRegressor details:
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_2fc639d4f874, depth=5, numNodes=17, numFeatures=5278


In [55]:
# Check if meta_features_assembled has data and valid 'units' values
print("Number of rows in meta_features_assembled:", meta_features_assembled.count())
print("Summary of 'units' column:")
meta_features_assembled.select("units").summary().show()

Number of rows in meta_features_assembled: 93
Summary of 'units' column:
+-------+------------------+
|summary|             units|
+-------+------------------+
|  count|                93|
|   mean|1.7526881720430108|
| stddev|  5.51018013194569|
|    min|               0.0|
|    25%|               0.0|
|    50%|               0.0|
|    75%|               1.0|
|    max|              46.0|
+-------+------------------+



In [56]:
# Check the number of rows in each prediction DataFrame
print("Number of rows in rf_predictions:", rf_predictions.count())
print("Number of rows in gb_predictions:", gb_predictions.count())
print("Number of rows in dt_predictions:", dt_predictions.count())

Number of rows in rf_predictions: 93
Number of rows in gb_predictions: 93
Number of rows in dt_predictions: 93


In [22]:


meta_learner = LogisticRegression(featuresCol="meta_features", labelCol="units")
stacking_model = meta_learner.fit(meta_features_assembled)


final_predictions = stacking_model.transform(meta_features_assembled)


evaluator = RegressionEvaluator(labelCol="units", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(final_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

mse = evaluator.setMetricName("mse").evaluate(final_predictions)
print(f"Mean Squared Error (MSE) on test data = {mse}")

mae = evaluator.setMetricName("mae").evaluate(final_predictions)
print(f"Mean Absolute Error (MAE) on test data = {mae}")

r2 = evaluator.setMetricName("r2").evaluate(final_predictions)
print(f"R2 Score on test data = {r2}")

final_predictions.show(20)

Root Mean Squared Error (RMSE) on test data = 5.753914048864514
Mean Squared Error (MSE) on test data = 33.10752688172043
Mean Absolute Error (MAE) on test data = 1.7526881720430108
R2 Score on test data = -0.10227578932781056
+--------------------+-------------------+------------------+------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|      rf_prediction|     gb_prediction|     dt_prediction|units|       meta_features|       rawPrediction|         probability|prediction|
+--------------------+-------------------+------------------+------------------+-----+--------------------+--------------------+--------------------+----------+
|(5278,[0,5127,513...|  5.282654889728369| 4.557603682205905|5.7592592592592595|  0.0|[5.28265488972836...|[49.0816044392780...|[0.34322318652488...|       0.0|
|(5278,[0,5117,513...|  5.282654889728369| 4.557603682205905|5.7592592592592595|  0.0|[5.28265488972836...|[49.0816044392780...|[

In [51]:
print(f"Schema of rf_predictions:")
rf_predictions.printSchema()
rf_predictions.show(5)

print(f"Schema of gb_predictions:")
gb_predictions.printSchema()
gb_predictions.show(5)

print(f"Schema of dt_predictions:")
dt_predictions.printSchema()
dt_predictions.show(5)


Schema of rf_predictions:
root
 |-- features: vector (nullable = true)
 |-- rf_prediction: double (nullable = false)
 |-- units: double (nullable = true)

+--------------------+-----------------+-----+
|            features|    rf_prediction|units|
+--------------------+-----------------+-----+
|(5278,[0,5127,513...|5.282654889728369|  0.0|
|(5278,[0,5117,513...|5.282654889728369|  0.0|
|(5278,[0,5106,513...|5.282654889728369|  0.0|
|(5278,[0,5112,513...|5.282654889728369|  0.0|
|(5278,[0,5097,513...|5.282654889728369|  0.0|
+--------------------+-----------------+-----+
only showing top 5 rows

Schema of gb_predictions:
root
 |-- features: vector (nullable = true)
 |-- gb_prediction: double (nullable = false)
 |-- units: double (nullable = true)

+--------------------+-----------------+-----+
|            features|    gb_prediction|units|
+--------------------+-----------------+-----+
|(5278,[0,5127,513...|4.557603682205905|  0.0|
|(5278,[0,5117,513...|4.557603682205905|  0.0|
|(5278,

In [23]:
final_predictions.show(800)

+--------------------+-------------------+-------------------+------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|      rf_prediction|      gb_prediction|     dt_prediction|units|       meta_features|       rawPrediction|         probability|prediction|
+--------------------+-------------------+-------------------+------------------+-----+--------------------+--------------------+--------------------+----------+
|(5278,[0,5127,513...|  5.282654889728369|  4.557603682205905|5.7592592592592595|  0.0|[5.28265488972836...|[49.0816044392780...|[0.34322318652488...|       0.0|
|(5278,[0,5117,513...|  5.282654889728369|  4.557603682205905|5.7592592592592595|  0.0|[5.28265488972836...|[49.0816044392780...|[0.34322318652488...|       0.0|
|(5278,[0,5106,513...|  5.282654889728369|  4.557603682205905|5.7592592592592595|  0.0|[5.28265488972836...|[49.0816044392780...|[0.34322318652488...|       0.0|
|(5278,[0,5112,513...|  5.28

In [24]:
def evaluate_model(predictions, label_col="units", prediction_col="prediction"):
    evaluator = RegressionEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    evaluator.setMetricName("mse")
    mse = evaluator.evaluate(predictions)

    evaluator.setMetricName("mae")
    mae = evaluator.evaluate(predictions)

    evaluator.setMetricName("r2")
    r2 = evaluator.evaluate(predictions)

    return {"RMSE": rmse, "MSE": mse, "MAE": mae, "R2": r2}

# Evaluate each model
rf_metrics = evaluate_model(rf_predictions, label_col="units", prediction_col="rf_prediction")
gb_metrics = evaluate_model(gb_predictions, label_col="units", prediction_col="gb_prediction")
dt_metrics = evaluate_model(dt_predictions, label_col="units", prediction_col="dt_prediction")
stacking_metrics = evaluate_model(final_predictions, label_col="units", prediction_col="prediction")

# Print the metrics
print(f"Random Forest Metrics: {rf_metrics}")
print(f"Gradient Boosting Metrics: {gb_metrics}")
print(f"Decision Tree Metrics: {dt_metrics}")
print(f"Stacking Model Metrics: {stacking_metrics}")

# Show final predictions
#final_predictions.show(20)

Random Forest Metrics: {'RMSE': 5.150906253391567, 'MSE': 26.531835231228353, 'MAE': 1.9788381571827973, 'R2': 0.1166540549434748}
Gradient Boosting Metrics: {'RMSE': 5.171599405548487, 'MSE': 26.745440411469467, 'MAE': 1.8507877825539305, 'R2': 0.10954232414292431}
Decision Tree Metrics: {'RMSE': 5.15461333203553, 'MSE': 26.570038602798427, 'MAE': 1.9873149053256574, 'R2': 0.11538211905702722}
Stacking Model Metrics: {'RMSE': 5.753914048864514, 'MSE': 33.10752688172043, 'MAE': 1.7526881720430108, 'R2': -0.10227578932781056}


In [25]:
# Inspect the schema of the test dataset
test_df.printSchema()


NameError: name 'test_df' is not defined

In [48]:
from pyspark.sql.functions import monotonically_increasing_id, to_date
from pyspark.sql.types import StringType

# Load the test data
test_data_path = "/content/test.csv"
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)

# Convert numeric columns from string to double
numeric_columns = ["ad_spend", "unit_price"]
for column in numeric_columns:
    test_df = test_df.withColumn(column, col(column).cast("double"))

# Add unique identifier to the original test data
test_df_with_id = test_df.withColumn("unique_id", monotonically_increasing_id())

# Handle date conversion, assuming 'date' needs to be parsed and converted
test_df_with_id = test_df_with_id.withColumn("date", to_date(col("date"), "MM-dd-yyyy"))  # Adjust date format if needed
test_df_with_id = test_df_with_id.withColumn("date", col("date").cast(StringType()))


unique_ids_count = test_df_with_id.select("unique_id").distinct().count()
print(f"Unique IDs in test_df_with_id: {unique_ids_count}")

print(f"Rows in test_df_with_id DataFrame: {test_df_with_id.count()}")
# Generate predictions
rf_test_predictions = rf_model.transform(test_df_with_id).select("unique_id", "features", "prediction").withColumnRenamed("prediction", "rf_prediction")
gb_test_predictions = gb_model.transform(test_df_with_id).select("unique_id", "features", "prediction").withColumnRenamed("prediction", "gb_prediction")
dt_test_predictions = dt_model.transform(test_df_with_id).select("unique_id", "features", "prediction").withColumnRenamed("prediction", "dt_prediction")

# Combine predictions
meta_test_features = rf_test_predictions.join(gb_test_predictions, on="unique_id", how="inner")
meta_test_features = meta_test_features.join(dt_test_predictions, on="unique_id", how="inner")

print(f"Rows in rf_test_predictions DataFrame: {rf_test_predictions.count()}")


# Prepare meta-features for the meta-learner
assembler_meta = VectorAssembler(inputCols=["rf_prediction", "gb_prediction", "dt_prediction"], outputCol="meta_features")
meta_test_features_assembled = assembler_meta.transform(meta_test_features)

# Generate final predictions
final_test_predictions = stacking_model.transform(meta_test_features_assembled)
print(f"Rows in before join  DataFrame: {final_test_predictions.count()}")
# Include original columns with the final predictions
final_test_predictions_with_originals = final_test_predictions.join(test_df_with_id, on="unique_id", how="inner")
final_test_predictions_with_originals = final_test_predictions_with_originals.select(
    "ID", "date", "Item Id", "Item Name", "ad_spend", "anarix_id", "unit_price", "prediction"
)

print(f"Rows in test DataFrame: {test_df.count()}")
print(f"Rows in predictions DataFrame: {final_test_predictions_with_originals.count()}")
output_path = "/content/final_predictions2.csv"
final_test_predictions_with_originals.write.mode("overwrite").csv(output_path, header=True)

# Show the first few rows
final_test_predictions_with_originals.show(20)


Unique IDs in test_df_with_id: 2833
Rows in test_df_with_id DataFrame: 2833
Rows in rf_test_predictions DataFrame: 850
Rows in before join  DataFrame: 850
Rows in test DataFrame: 2833
Rows in predictions DataFrame: 850
+--------------------+----------+----------+--------------------+--------+---------+------------------+----------+
|                  ID|      date|   Item Id|           Item Name|ad_spend|anarix_id|        unit_price|prediction|
+--------------------+----------+----------+--------------------+--------+---------+------------------+----------+
|2024-07-01_B0BDRT...|2024-07-01|B0BDRTZTGX|Parent asin - rai...|     0.0| NAPQUEEN|               0.0|       0.0|
|2024-07-01_B0B699...|2024-07-01|B0B699PLXD|NapQueen Victoria...|     0.0| NAPQUEEN|               0.0|       0.0|
|2024-07-01_B0BDRQ...|2024-07-01|B0BDRQWBK9|NapQueen 5 Inch R...|     0.0| NAPQUEEN|               0.0|       0.0|
|2024-07-01_B0CY5Q...|2024-07-01|B0CY5QQ49F|                NULL|   12.43| NAPQUEEN|       

In [49]:
print(rf_model.stages)
print(gb_model.stages)
print(dt_model.stages)


[StringIndexerModel: uid=StringIndexer_0847847726a0, handleInvalid=keep, StringIndexerModel: uid=StringIndexer_096a0020ef4e, handleInvalid=keep, StringIndexerModel: uid=StringIndexer_ca7a31d3d8ab, handleInvalid=keep, StringIndexerModel: uid=StringIndexer_f7be8c49fd9f, handleInvalid=keep, StringIndexerModel: uid=StringIndexer_b2304c2a589a, handleInvalid=keep, OneHotEncoderModel: uid=OneHotEncoder_cd21173c7f42, dropLast=true, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_429f585777ef, dropLast=true, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_03c9b623da3b, dropLast=true, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_4cb882ce841c, dropLast=true, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_baf221a4ed48, dropLast=true, handleInvalid=error, VectorAssembler_fb0ef21e4646, RandomForestRegressionModel: uid=RandomForestRegressor_38686f9a8461, numTrees=20, numFeatures=5278]
[StringIndexerModel: uid=StringIndexer_0847847726a0, handleInvalid=ke

In [40]:
import pyspark.sql.functions as F


submission = final_test_predictions_with_originals.toPandas()

In [41]:
submission.to_csv('submission.csv', index=False)

In [42]:
print(f"Rows in test DataFrame: {test_df.count()}")
print(f"Rows in predictions DataFrame: {final_test_predictions_with_originals.count()}")

Rows in test DataFrame: 2833
Rows in predictions DataFrame: 850


# FINAL GBT

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder \
    .appName("GBTModel") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

data_path = "/content/df.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)


df = df.withColumn("units", when(col("units") < 0, lit(0)).otherwise(col("units")))

numeric_columns = ["ad_spend", "unit_price", "units"]
for column in numeric_columns:
    df = df.withColumn(column, col(column).cast("double"))


train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)


categorical_features = ['Item Id', 'Item Name', 'anarix_id']
numeric_features = ['ad_spend', 'unit_price']


indexers = [StringIndexer(inputCol=feature, outputCol=feature + "_index", handleInvalid='skip') for feature in categorical_features]
encoders = [OneHotEncoder(inputCol=feature + "_index", outputCol=feature + "_ohe") for feature in categorical_features]


assembler = VectorAssembler(
    inputCols=numeric_features + [feature + "_ohe" for feature in categorical_features],
    outputCol="features",
    handleInvalid="skip"
)


gbt_regressor = GBTRegressor(featuresCol="features", labelCol="units")

pipeline_gbt = Pipeline(stages=indexers + encoders + [assembler, gbt_regressor])


gbt_model = pipeline_gbt.fit(train_data)

gbt_predictions = gbt_model.transform(test_data).select("features", "prediction", "units")

gbt_metrics = evaluate_model(gbt_predictions, label_col="units", prediction_col="prediction")

#  metrics
print(f"Gradient Boosted Trees Metrics: {gbt_metrics}")

gbt_predictions.show(20)


Gradient Boosted Trees Metrics: {'RMSE': 5.256739369268825, 'MSE': 27.633308796420803, 'MAE': 2.369182635836352, 'R2': 0.07998180069042227}
+--------------------+------------------+-----+
|            features|        prediction|units|
+--------------------+------------------+-----+
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  4.0|
|(149,[0,41,80,92]...|1.1722295017657396|  0.0|
|(149,[0,41,80,92]...|1.1722295017657396|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  2.0|
|(149,[0,6,79,92],...| 7.076991406527646|  8.0|
|(149,[0,41,80,92]...|1.1722295017657396|  0.0|
|(149,[0,41,80,92]...|1.1722295017657396|  0.0|
|(149,[0,41,80,92]...|1.1722295017657396|  0.0|
|(149,[0,6,79,92],...| 7.076991406527646|  6

In [6]:
gbt_model

PipelineModel_c082dab27be3

In [7]:
import pandas as pd
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)
test_df.head()

NameError: name 'test_data_path' is not defined

In [None]:
from pyspark.sql.functions import col, round

# Load test data
test_data_path = "/content/test.csv"  # Update this path to the actual location of your test data
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)

# Convert columns to the same numeric format
for column in numeric_features:
    test_df = test_df.withColumn(column, col(column).cast("double"))

# Transform the test data using the trained GBT model pipeline
test_predictions = gbt_model.transform(test_df)

# Round off the TARGET values to the nearest integer and cast to integer type
formatted_predictions = test_predictions.select(
    col("ID"),
    col("prediction").cast("int").alias("TARGET")  # Cast to integer
)

# Show the formatted predictions
formatted_predictions.show(truncate=False)

# Coalesce the DataFrame into a single partition before saving
formatted_predictions = formatted_predictions.coalesce(1)

# Save the formatted predictions to a CSV file
formatted_predictions_path = "/content/predictions.csv"  # Update this path as needed
formatted_predictions.write.csv(formatted_predictions_path, header=True, mode='overwrite')

print(f"Predictions saved to {formatted_predictions_path}")


In [None]:
import pandas as pd
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)

# Get number of rows
num_rows = test_df.count()
# Get number of columns
num_cols = len(test_df.columns)

print("Shape: ({}, {})".format(num_rows, num_cols))

# To view the first few rows of the DataFrame
test_df.show(5)

In [None]:
test_df

In [None]:
formatted_predictions.printSchema()


In [None]:
formatted_predictions_path = "/content/formatted_predictions.csv"
formatted_predictions.write.csv(formatted_predictions_path, header=True, mode='overwrite', sep=',')


In [None]:
import pyspark.sql.functions as F


submission = formatted_predictions.toPandas()

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
print(f"Rows in test DataFrame: {test_df.count()}")
print(f"Rows in predictions DataFrame: {test_predictions.count()}")


In [None]:
# Load test data
test_data_path = "/content/test.csv"  # Path to your test data file
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)

# Get number of rows and columns in the test data
num_rows = test_df.count()
num_cols = len(test_df.columns)
print(f"Rows in test DataFrame: {num_rows}")
print(f"Columns in test DataFrame: {num_cols}")

# Show the first few rows of the test DataFrame
test_df.show(5)


In [None]:
# Convert columns to double and check for any potential issues
for column in numeric_features:
    test_df = test_df.withColumn(column, col(column).cast("double"))

# Check number of rows after transformations
print(f"Rows after transformations: {test_df.count()}")
test_df.show(5)


In [None]:
# List columns in the test data and the features used in the model
print(f"Test Data Columns: {test_df.columns}")
print(f"Model Features: {assembler.getInputCols()}")


In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.sql.functions import col, round
# Start a Spark session
spark = SparkSession.builder.appName("StackingModel").getOrCreate()



# Load test data
test_data_path = "/content/test.csv"  # Update this path to the actual location of your test data
test_df = spark.read.csv(test_data_path, header=True, inferSchema=True)

# Convert columns to the same numeric format
for column in numeric_features:
    test_df = test_df.withColumn(column, col(column).cast("double"))

#
# Define the features
categorical_features = ['Item Id', 'Item Name']
numeric_features = ['ad_spend', 'unit_price']

# Define indexers and encoders
indexers = [StringIndexer(inputCol=feature, outputCol=feature + "_index", handleInvalid='skip') for feature in categorical_features]
encoders = [OneHotEncoder(inputCol=feature + "_index", outputCol=feature + "_ohe") for feature in categorical_features]

# Define assembler
assembler = VectorAssembler(
    inputCols=numeric_features + [feature + "_ohe" for feature in categorical_features],
    outputCol="features",
    handleInvalid="skip"
)

# Combine all stages into a single pipeline
stages = indexers + encoders + [assembler]
pipeline = Pipeline(stages=stages)

# Fit and transform the training data
pipeline_model = pipeline.fit(train_data)
assembled_train_data = pipeline_model.transform(train_data)
assembled_train_data.printSchema()

# Transform the test data
assembled_test_data = pipeline_model.transform(test_data)
assembled_test_data.printSchema()

# Perform predictions
test_predictions = gbt_model.transform(assembled_test_data)
formatted_predictions = test_predictions.select(
    col("ID"),
    col("prediction").cast("int").alias("TARGET")  # Round to integer
)

# Save predictions
formatted_predictions_path = "/content/formatted_predictions.csv"
formatted_predictions.coalesce(1).write.csv(formatted_predictions_path, header=True, mode='overwrite', sep=',')

print(f"Predictions saved to {formatted_predictions_path}")

# Verify the number of rows in test data and predictions
print(f"Rows in test DataFrame: {test_data.count()}")
print(f"Rows in predictions DataFrame: {test_predictions.count()}")


LR

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Start a new Spark session
spark = SparkSession.builder \
    .appName("LinearRegressionModel") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Load data
data_path = "/content/df.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Replace negative values in 'units' with 0
df = df.withColumn("units", when(col("units") < 0, lit(0)).otherwise(col("units")))

# Convert columns to numeric
numeric_columns = ["ad_spend", "unit_price", "units"]
for column in numeric_columns:
    df = df.withColumn(column, col(column).cast("double"))

# Split data into training and testing sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Define categorical and numeric features
categorical_features = ['Item Id', 'Item Name', 'anarix_id']
numeric_features = ['ad_spend', 'unit_price']

# Index and encode categorical features
indexers = [StringIndexer(inputCol=feature, outputCol=feature + "_index", handleInvalid='skip') for feature in categorical_features]
encoders = [OneHotEncoder(inputCol=feature + "_index", outputCol=feature + "_ohe") for feature in categorical_features]

# Combine all features into a feature vector
assembler = VectorAssembler(
    inputCols=numeric_features + [feature + "_ohe" for feature in categorical_features],
    outputCol="features",
    handleInvalid="skip"
)

# Define the Linear Regression model
lr_regressor = LinearRegression(featuresCol="features", labelCol="units")

# Create pipeline for Linear Regression model
pipeline_lr = Pipeline(stages=indexers + encoders + [assembler, lr_regressor])

# Train the Linear Regression model
lr_model = pipeline_lr.fit(train_data)

# Generate predictions from Linear Regression model
lr_predictions = lr_model.transform(test_data).select("features", "prediction", "units")

# Function to evaluate model predictions
def evaluate_model(predictions, label_col="units", prediction_col="prediction"):
    evaluator = RegressionEvaluator(labelCol=label_col, predictionCol=prediction_col, metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    evaluator.setMetricName("mse")
    mse = evaluator.evaluate(predictions)

    evaluator.setMetricName("mae")
    mae = evaluator.evaluate(predictions)

    evaluator.setMetricName("r2")
    r2 = evaluator.evaluate(predictions)

    return {"RMSE": rmse, "MSE": mse, "MAE": mae, "R2": r2}

# Evaluate the Linear Regression model
lr_metrics = evaluate_model(lr_predictions, label_col="units", prediction_col="prediction")

# Print the metrics
print(f"Linear Regression Metrics: {lr_metrics}")

# Show final predictions
lr_predictions.show(20)
