In [140]:
import pandas as pd
import numpy as np
import holidays
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [141]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')



In [142]:
test_df.head()

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode


In [143]:
train_df.drop_duplicates(inplace=True)

In [144]:
train_df.dropna(inplace=True)

In [145]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 221259 entries, 1 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        221259 non-null  int64  
 1   date      221259 non-null  object 
 2   country   221259 non-null  object 
 3   store     221259 non-null  object 
 4   product   221259 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 11.8+ MB


In [146]:
train_df['date'] = pd.to_datetime(train_df['date'])


In [147]:
train_df.head()

Unnamed: 0,id,date,country,store,product,num_sold
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
5,5,2010-01-01,Canada,Stickers for Less,Holographic Goose,300.0


In [148]:
def get_gdp_per_capita(country, year):
    alpha3 = {
        'Canada': 'CAN', 'Finland': 'FIN', 'Italy': 'ITA',
        'Kenya': 'KEN', 'Norway': 'NOR', 'Singapore': 'SGP'
    }
    url = f"https://api.worldbank.org/v2/country/{alpha3[country]}/indicator/NY.GDP.PCAP.CD?date={year}&format=json"
    response = requests.get(url).json()
    try:
        return response[1][0]['value']
    except (IndexError, TypeError):
        return None

countries = ['Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore']
years = range(2010, 2020)
gdp_data = {}

for country in countries:
    for year in years:
        gdp_data[(country, year)] = get_gdp_per_capita(country, year)

def add_gdp_feature(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['gdp'] = df.apply(lambda row: gdp_data.get((row['country'], row['year']), None), axis=1)
    return df


def add_holiday_feature(df):
    country_holidays = {
        'Canada': holidays.CountryHoliday('CA'),
        'Finland': holidays.CountryHoliday('FI'),
        'Italy': holidays.CountryHoliday('IT'),
        'Kenya': holidays.CountryHoliday('KE'),
        'Norway': holidays.CountryHoliday('NO'),
        'Singapore': holidays.CountryHoliday('SG')
    }

    df['date'] = pd.to_datetime(df['date'])

    df['is_holiday'] = df.apply(
        lambda row: row['date'] in country_holidays.get(row['country'], []), axis=1
    )
    return df



def feature_engineering(train, test):
    train_df = train.copy()
    test_df = test.copy()

    train_df = train_df.dropna()

    train_df = add_holiday_feature(train_df)
    test_df = add_holiday_feature(test_df)
    train_df = add_gdp_feature(train_df)
    test_df = add_gdp_feature(test_df)

    product_ratio = train_df.groupby('product')['num_sold'].mean() / train_df['num_sold'].mean()
    store_ratio = train_df.groupby('store')['num_sold'].mean() / train_df['num_sold'].mean()
    country_ratio = train_df.groupby('country')['num_sold'].mean() / train_df['num_sold'].mean()
    product_ratio_mean = product_ratio.mean()
    store_ratio_mean = store_ratio.mean()
    country_ratio_mean = country_ratio.mean()


    for df in [train_df, test_df]:
        df['product_ratio'] = df['product'].map(product_ratio)
        df['store_ratio'] = df['store'].map(store_ratio)
        df['country_ratio'] = df['country'].map(country_ratio)
        df['product_ratio'].fillna(product_ratio_mean, inplace=True)
        df['store_ratio'].fillna(store_ratio_mean, inplace=True)
        df['country_ratio'].fillna(country_ratio_mean, inplace=True)

        df['date'] = df['date'].astype('datetime64[ns]')

        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['day_of_week'] = df['date'].dt.dayofweek
        df["day_of_week"] = df["day_of_week"].apply(lambda x: 0 if x<=3 else(1 if x==4 else (2 if x==5 else (3))))
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df["day_of_year"] = df['date'].apply(
            lambda x: x.timetuple().tm_yday if not (x.is_leap_year and x.month > 2) else x.timetuple().tm_yday - 1
        )
        df.drop('date', axis=1, inplace=True)

        df['day_sin4'] = np.sin(df['day_of_year'] * (8 * np.pi /  365.0))
        df['day_cos4'] = np.cos(df['day_of_year'] * (8 * np.pi /  365.0))
        df['day_sin3'] = np.sin(df['day_of_year'] * (6 * np.pi /  365.0))


        df['day_cos3'] = np.cos(df['day_of_year'] * (6 * np.pi /  365.0))
        df['day_sin2'] = np.sin(df['day_of_year'] * (4 * np.pi /  365.0))
        df['day_cos2'] = np.cos(df['day_of_year'] * (4 * np.pi /  365.0))
        df['day_sin'] = np.sin(df['day_of_year'] * (2 * np.pi /  365.0))
        df['day_cos'] = np.cos(df['day_of_year'] * (2 * np.pi /  365.0))
        df['day_sin_0.5'] = np.sin(df['day_of_year'] * (1 * np.pi /  365.0))
        df['day_cos_0.5'] = np.cos(df['day_of_year'] * (1 * np.pi /  365.0))
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12.0)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12.0)
        df['year_sin'] = np.sin(2 * np.pi * df['year'] / 7.0)
        df['year_cos'] = np.cos(2 * np.pi * df['year'] / 7.0)


        df['month_country'] = df['month'].astype(str) + "_" + df['country']
        df['month_store'] = df['month'].astype(str) + "_" + df['store']
        df['month_product'] = df['month'].astype(str) + "_" + df['product']

        df['country_store'] = df['country'] + "_" + df['store']
        df['country_product'] = df['country'] + "_" + df['product']
        df['store_product'] = df['store'] + "_" + df['product']


        dummy_columns = [
        'country', 'store', 'product',
        'month_country', 'month_store','month_product',
        'country_store', 'country_product', 'store_product'
    ]
    train_df = pd.get_dummies(train_df, columns=dummy_columns, drop_first=True)
    test_df = pd.get_dummies(test_df, columns=dummy_columns, drop_first=True)

    train_df['num_sold'] = np.log1p(train_df['num_sold'])
    return train_df, test_df

In [149]:
df,test = feature_engineering(train_df,test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['product_ratio'].fillna(product_ratio_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['store_ratio'].fillna(store_ratio_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [150]:

df.head()

Unnamed: 0,id,num_sold,is_holiday,year,gdp,product_ratio,store_ratio,country_ratio,month,day,...,store_product_Premium Sticker Mart_Holographic Goose,store_product_Premium Sticker Mart_Kaggle,store_product_Premium Sticker Mart_Kaggle Tiers,store_product_Premium Sticker Mart_Kerneler,store_product_Premium Sticker Mart_Kerneler Dark Mode,store_product_Stickers for Less_Holographic Goose,store_product_Stickers for Less_Kaggle,store_product_Stickers for Less_Kaggle Tiers,store_product_Stickers for Less_Kerneler,store_product_Stickers for Less_Kerneler Dark Mode
1,1,6.881411,True,2010,47560.666601,1.637661,0.568452,1.116298,1,1,...,False,False,False,False,False,False,False,False,False,False
2,2,6.810142,True,2010,47560.666601,1.353232,0.568452,1.116298,1,1,...,False,False,False,False,False,False,False,False,False,False
3,3,6.049733,True,2010,47560.666601,0.740057,0.568452,1.116298,1,1,...,False,False,False,False,False,False,False,False,False,False
4,4,6.198479,True,2010,47560.666601,0.865141,0.568452,1.116298,1,1,...,False,False,False,False,False,False,False,False,False,False
5,5,5.70711,True,2010,47560.666601,0.262428,1.1163,1.116298,1,1,...,False,False,False,False,False,True,False,False,False,False


In [151]:
X = df.drop(columns=['id', 'num_sold'])
y = df['num_sold']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [152]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

In [153]:
model = LinearRegression()
model.fit(X_train,y_train)

In [154]:
y_pred = model.predict(X_test)

In [155]:
mse = mean_squared_error(y_test,y_pred)
mape = mean_absolute_percentage_error(y_test,y_pred)*100

In [156]:
print(mse,mape)

0.007764974890935609 1.4152883461162782


In [157]:
#ridge regression

In [158]:
from sklearn.linear_model import BayesianRidge
model = BayesianRidge()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mape = mean_absolute_percentage_error(y_test,y_pred)*100
mse

0.007771392773484819

In [159]:
from sklearn.linear_model import Ridge
alphas=[0.01,0.1,1.0,10.0,100.0]
min_mse=1000000000
for alpha in alphas:
  ridge = Ridge(alpha=alpha)
  ridge.fit(X_train,y_train)
  y_pred=ridge.predict(X_test)
  mse = mean_squared_error(y_test,y_pred)
  if(mse<min_mse):
    min_mse=mse
mse

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.008172009091255051

In [160]:
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1,l1_ratio=0.5,random_state=42)
elastic_net.fit(X_train,y_train)
y_pre = elastic_net.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mse

0.008172009091255051

In [161]:
from xgboost import XGBRegressor

In [162]:
xgb_reg = XGBRegressor(
    n_estimators = 100,
    max_depth = 4,
    learning_rate = 0.1,
    random_state=42
)
xgb_reg.fit(X_train,y_train)
y_pred = xgb_reg.predict(X_test)
mape = mean_absolute_percentage_error(y_test,y_pred)*100
mape

1.1623558801784066

In [163]:
!pip install "dask[dataframe]"

!pip install --upgrade lightgbm

import lightgbm as lgb



In [164]:

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}


model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    valid_names=['valid'],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10)]
)


y_pred = model.predict(X_test)


rmse = mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
mape = mean_absolute_percentage_error(y_test,y_pred)*100
mape

Training until validation scores don't improve for 10 rounds
Root Mean Squared Error (RMSE): 0.0060


1.209174571506095

In [165]:
test.drop('id',axis=1,inplace=True)
test = test[X.columns]
y_to_submit = np.expm1(model.predict(test))
to_submit = pd.DataFrame(
    data={
        'id' : test_df['id'],
        'num_sold' : y_to_submit
    }
)
to_submit.head()

Unnamed: 0,id,num_sold
0,230130,132.734684
1,230131,827.511037
2,230132,732.362557
3,230133,389.377924
4,230134,483.473991


In [166]:
to_submit.to_csv('fsubmission.csv', index=False)