In [25]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer

In [26]:
df_train=pd.read_csv("/kaggle/input/playground-series-s5e1/train.csv")
df_test=pd.read_csv("/kaggle/input/playground-series-s5e1/test.csv")


In [27]:
df_train

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [28]:
# Custom transformer to handle datetime extraction
class DateTimeExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["date"] = pd.to_datetime(X["date"])
        X["year"] = X["date"].dt.year
        X["month"] = X["date"].dt.month
        X["day"] = X["date"].dt.day
        X["day_of_week"] = X["date"].dt.dayofweek
        X["is_weekend"] = X["day_of_week"].isin([5, 6]).astype(int)
        X = X.drop(columns=["date"])
        return X

In [29]:
# Custom transformer to drop unnecessary columns
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns, errors="ignore")


In [30]:
# Define the preprocessing pipeline
def create_pipeline():
    # Columns for processing
    one_hot_cols = ["country", "store", "product", "is_weekend"]
    numerical_cols = ["year", "month", "day", "day_of_week"]
    target_col = "num_sold"

    # Preprocessing for numerical and categorical data
    numerical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = OneHotEncoder(sparse=False, drop="first")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, one_hot_cols),
        ]
    )

    # Full pipeline
    pipeline = Pipeline(
        steps=[
            ("date_extractor", DateTimeExtractor()),
            ("drop_columns", DropColumns(columns=["id"])),
            ("preprocessor", preprocessor),
            ("model", RandomForestRegressor(random_state=42)),
        ]
    )

    return pipeline

In [31]:
# Preprocess training data
pipeline = create_pipeline()

# Fill missing target values in the training data
df_train["num_sold"].fillna(df_train["num_sold"].mean(), inplace=True)

# Separate features and target
X_train = df_train.drop(columns=["num_sold"])
y_train = df_train["num_sold"]

# Train the pipeline
pipeline.fit(X_train, y_train)

# Preprocess and predict for the test data
predictions = pipeline.predict(df_test)

# Create a dataframe to display the predictions
df_test["num_sold"] = predictions
print(df_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["num_sold"].fillna(df_train["num_sold"].mean(), inplace=True)


           id        date    country                 store  \
0      230130  2017-01-01     Canada     Discount Stickers   
1      230131  2017-01-01     Canada     Discount Stickers   
2      230132  2017-01-01     Canada     Discount Stickers   
3      230133  2017-01-01     Canada     Discount Stickers   
4      230134  2017-01-01     Canada     Discount Stickers   
...       ...         ...        ...                   ...   
98545  328675  2019-12-31  Singapore  Premium Sticker Mart   
98546  328676  2019-12-31  Singapore  Premium Sticker Mart   
98547  328677  2019-12-31  Singapore  Premium Sticker Mart   
98548  328678  2019-12-31  Singapore  Premium Sticker Mart   
98549  328679  2019-12-31  Singapore  Premium Sticker Mart   

                  product     num_sold  
0       Holographic Goose   752.527382  
1                  Kaggle   759.660000  
2            Kaggle Tiers   672.240000  
3                Kerneler   363.100000  
4      Kerneler Dark Mode   405.500000  
...      