In [152]:
import pandas as pd
final_train = pd.read_csv("final_merge_datasets_5/final_merge_train_dataset.csv",parse_dates=['date'])
final_test = pd.read_csv("final_merge_datasets_5/final_merge_test_dataset.csv",parse_dates=['date'])

In [153]:
print(final_test.dtypes)

id                      int64
date           datetime64[ns]
store_nbr               int64
item_nbr                int64
onpromotion              bool
family                 object
class                   int64
perishable              int64
dcoilwtico            float64
city                   object
state                  object
store_grade            object
cluster                 int64
dtype: object


In [154]:
final_train['onpromotion'].unique()

array([False])

## Checking for Missing (Null) Values if yes then remove it

In [155]:
final_train.isnull().sum()

id              0
date            0
store_nbr       0
item_nbr        0
unit_sales      0
onpromotion     0
Day_type        0
locale          0
locale_name     0
description     0
transferred     0
family          0
class           0
perishable      0
dcoilwtico      0
city            0
state           0
store_grade     0
cluster         0
transactions    0
city_state      0
day_of_week     0
day_name        0
dtype: int64

## Handling Categorical Variables

In [156]:
final_test.dtypes

id                      int64
date           datetime64[ns]
store_nbr               int64
item_nbr                int64
onpromotion              bool
family                 object
class                   int64
perishable              int64
dcoilwtico            float64
city                   object
state                  object
store_grade            object
cluster                 int64
dtype: object

In [157]:
final_train['day_name'].unique()

array(['Tuesday', 'Wednessday', 'Thursday', 'Friday', 'Satureday',
       'Sunday', 'Monday'], dtype=object)

In [None]:
categorical_column = final_train.select_dtypes(include=['object']).columns
print(categorical_column)

categorical_columns = final_test.select_dtypes(include=['object']).columns
print(categorical_columns)

Index(['Day_type', 'locale', 'locale_name', 'description', 'family', 'city',
       'state', 'store_grade', 'city_state', 'day_name'],
      dtype='object')
Index(['family', 'city', 'state', 'store_grade'], dtype='object')


In [159]:
final_train = pd.get_dummies(final_train,columns = categorical_column)
final_train.dtypes
final_test = pd.get_dummies(final_test, columns = categorical_columns)
final_test.dtypes

id                        int64
date             datetime64[ns]
store_nbr                 int64
item_nbr                  int64
onpromotion                bool
                      ...      
store_grade_A              bool
store_grade_B              bool
store_grade_C              bool
store_grade_D              bool
store_grade_E              bool
Length: 85, dtype: object

In [160]:
print(final_train.head())
print(final_test.head())

   id       date  store_nbr  item_nbr  unit_sales  onpromotion  transferred  \
0   0 2013-01-01         25    103665         7.0        False        False   
1   1 2013-01-01         25    105574         1.0        False        False   
2   2 2013-01-01         25    105575         2.0        False        False   
3   3 2013-01-01         25    108079         1.0        False        False   
4   4 2013-01-01         25    108701         1.0        False        False   

   class  perishable  dcoilwtico  ...  city_state_Riobamba, Chimborazo  \
0   2712           1   67.714366  ...                            False   
1   1045           0   67.714366  ...                            False   
2   1045           0   67.714366  ...                            False   
3   1030           0   67.714366  ...                            False   
4   2644           1   67.714366  ...                            False   

   city_state_Salinas, Santa Elena  \
0                             True   
1   

In [161]:
print(final_train.dtypes.value_counts())
print(final_test.dtypes.value_counts())

bool              99
int64              8
float64            2
datetime64[ns]     1
Name: count, dtype: int64
bool              77
int64              6
datetime64[ns]     1
float64            1
Name: count, dtype: int64


Checking data is normally distributed or 

In [162]:
#import matplotlib.pyplot as plt
#import seaborn as sns

# Example for one feature, like 'unit_sales'
#sns.histplot(final_train['unit_sales'], kde=True)
#plt.show()

In [163]:
# First, align columns
final_test_df = final_test.reindex(columns=final_train.drop('unit_sales', axis=1).columns, fill_value=0)


## Feature Scaling

### Normilization (MinMaxScaler) Works
### Scaled Value = (X - X min)/(X max - X min)

### Where:
### X = current value
### X min = minimum value in column
### X max = maximum value in column

In [164]:
from sklearn.preprocessing import MinMaxScaler

In [165]:
train_scale = final_train.copy()

In [166]:
y = train_scale['unit_sales']
x = train_scale.drop(['unit_sales','date'], axis=1)

x_test = final_test.copy()

### 2. Align final_test columns to match x columns

In [167]:
# Align columns
x_test = x_test.reindex(columns=x.columns, fill_value=0)


y = What we are trying to predict = unit_sales

X = All other columns (except unit_sales and date) that help predict y

.drop() = Remove columns

axis=1 = Means remove column, not row

## 3. Create and Fit MinMaxScaler

In [168]:
scaler = MinMaxScaler()

In [169]:
x_scaled = scaler.fit_transform(x)
x_test_scaler = scaler.transform(x_test)

## 4. Convert Scaled Data back to DataFrame

In [170]:
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
x_test_scaler = pd.DataFrame(x_test_scaler, columns=x_test.columns)

In [171]:
print(x_scaled.head())

         id  store_nbr  item_nbr  onpromotion  transferred     class  \
0  0.000000    0.45283  0.006528          0.0          0.0  0.283488   
1  0.000001    0.45283  0.008397          0.0          0.0  0.007129   
2  0.000002    0.45283  0.008398          0.0          0.0  0.007129   
3  0.000003    0.45283  0.010849          0.0          0.0  0.004642   
4  0.000004    0.45283  0.011458          0.0          0.0  0.272215   

   perishable  dcoilwtico  cluster  transactions  ...  \
0         1.0         0.0      0.0      0.053739  ...   
1         0.0         0.0      0.0      0.053739  ...   
2         0.0         0.0      0.0      0.053739  ...   
3         0.0         0.0      0.0      0.053739  ...   
4         1.0         0.0      0.0      0.053739  ...   

   city_state_Riobamba, Chimborazo  city_state_Salinas, Santa Elena  \
0                              0.0                              1.0   
1                              0.0                              1.0   
2          

In [172]:
print(x_test_scaler.head())

           id  store_nbr  item_nbr  onpromotion  transferred     class  \
0  125.497165        0.0  0.000000          0.0          0.0  0.015086   
1  125.497166        0.0  0.002155          0.0          0.0  0.010776   
2  125.497167        0.0  0.006368          0.0          0.0  0.332560   
3  125.497168        0.0  0.006386          0.0          0.0  0.004310   
4  125.497169        0.0  0.006528          0.0          0.0  0.283488   

   perishable  dcoilwtico  cluster  transactions  ...  \
0         0.0   -0.737054     0.75     -0.103596  ...   
1         0.0   -0.737054     0.75     -0.103596  ...   
2         0.0   -0.737054     0.75     -0.103596  ...   
3         0.0   -0.737054     0.75     -0.103596  ...   
4         1.0   -0.737054     0.75     -0.103596  ...   

   city_state_Riobamba, Chimborazo  city_state_Salinas, Santa Elena  \
0                              0.0                              0.0   
1                              0.0                              0.0   

In [173]:
x_test_scaler.shape

(3370464, 108)

## Prepare Validation

In [174]:
from sklearn.model_selection import train_test_split

In [None]:


# Assume you already have x_scaled and y ready

# Split into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(
    x_scaled, y, test_size=0.2, random_state=42,shuffle=True)

# Checking the shape of the splits
print("x_train shape:", x_train.shape)
print("x_valid shape:", x_valid.shape)
print("y_train shape:", y_train.shape)
print("y_valid shape:", y_valid.shape)


x_train shape: (800000, 108)
x_valid shape: (200000, 108)
y_train shape: (800000,)
y_valid shape: (200000,)


In [176]:
final_train.to_csv("data_for_model/train.csv", index=False)
final_test.to_csv("data_for_model/test.csv", index=False)

In [177]:
final_test.dtypes

id                        int64
date             datetime64[ns]
store_nbr                 int64
item_nbr                  int64
onpromotion                bool
                      ...      
store_grade_A              bool
store_grade_B              bool
store_grade_C              bool
store_grade_D              bool
store_grade_E              bool
Length: 85, dtype: object

In [178]:
x_train.to_csv("data_for_model/x_train.csv", index=False)
x_valid.to_csv("data_for_model/x_valid.csv", index=False)
x_test_scaler.to_csv("data_for_model/x_test_scaler.csv", index=False)


In [179]:
x_scaled.to_csv("data_for_model/x_train_df.csv", index=False)

In [180]:
y_train.to_frame(name="unit_sales")\
       .to_csv("data_for_model/y_train.csv", index=False)
y_valid.to_frame(name="unit_sales")\
       .to_csv("data_for_model/y_valid.csv", index=False)