<a href="https://colab.research.google.com/github/StefanKDS/DL_Training/blob/main/TabularPlaygroundSeries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load data

In [10]:
pip install kaggle



In [75]:
from google.colab import files

files.upload()
! mkdir ~/.kaggle 
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jan-2022
! mkdir train
! unzip train.csv.zip -d train

Saving kaggle.json to kaggle (3).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
test.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
mkdir: cannot create directory ‘train’: File exists
Archive:  train.csv.zip
replace train/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: train/train.csv         


## Load csv and feature engineering

In [1]:
import pandas as pd

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

train_df = pd.read_csv('train/train.csv')

# Split off the target: pop() removes 'num_sold' from train_df and returns it.
Y_train = train_df.pop('num_sold')

train_df.head()

Unnamed: 0,row_id,date,country,store,product
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat


In [2]:
from datetime import date, datetime

def season_of_date(date):
    """Return the season name for an ISO 'YYYY-MM-DD' date string.

    Boundaries (inclusive, identical every year):
      spring: 21 March     - 20 June
      summer: 21 June      - 22 September
      autumn: 23 September - 20 December
      winter: every other day of the year

    Args:
        date: date as a 'YYYY-MM-DD' string.

    Returns:
        One of 'spring', 'summer', 'autumn', 'winter'.

    Raises:
        ValueError: if `date` does not match the '%Y-%m-%d' format.
    """
    parsed = datetime.strptime(date, '%Y-%m-%d')
    # Compare (month, day) tuples directly: this avoids building three
    # date ranges on every call and does not depend on how day-first
    # 'DD/MM/YYYY' strings are parsed.
    month_day = (parsed.month, parsed.day)
    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'autumn'
    return 'winter'

In [3]:
def day_of_week(date):
  """Return the weekday index of a 'YYYY-MM-DD' date string.

  Uses ``datetime.weekday()``, so Monday is 0 and Sunday is 6.
  """
  return datetime.strptime(date, '%Y-%m-%d').weekday()

In [4]:
def month_of_year(date):
  """Return the month number (1-12) of a 'YYYY-MM-DD' date string."""
  return datetime.strptime(date, '%Y-%m-%d').month

In [5]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one column by its dummy (one-hot) columns.

    The dummy columns produced by ``pd.get_dummies`` for
    `feature_to_encode` are appended to the right of the frame and the
    source column is removed. The input frame itself is not modified.

    Args:
        original_dataframe: frame containing the column to encode.
        feature_to_encode: name of the column to replace.

    Returns:
        A new DataFrame with the encoded columns in place of the original.
    """
    one_hot = pd.get_dummies(original_dataframe[[feature_to_encode]])
    widened = pd.concat([original_dataframe, one_hot], axis=1)
    return widened.drop(columns=[feature_to_encode])

In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def preprocess_data(df):
  """Build the numeric feature matrix from a raw sales frame.

  Derives `season`, `day_of_week` and `month` from the `date` column,
  one-hot encodes `country`, `store`, `product` and `season`, scales the two
  calendar features to [0, 1] and removes `date` and `row_id`.

  The caller's frame is not modified; a new DataFrame is returned.

  Args:
    df: DataFrame with `row_id`, `date` ('YYYY-MM-DD' strings), `country`,
      `store` and `product` columns.

  Returns:
    DataFrame with `day_of_week`, `month` and the one-hot columns.
    NOTE: the one-hot columns come from the values present in `df`, so two
    frames only get matching columns if they contain the same categories.
  """
  # Work on a copy so the derived columns are not written into the caller's
  # frame (which may already have been displayed or is reused later).
  df = df.copy()
  df['season'] = df.date.map(season_of_date)
  df['day_of_week'] = df.date.map(day_of_week)
  df['month'] = df.date.map(month_of_year)
  for feature in ('country', 'store', 'product', 'season'):
    df = encode_and_bind(df, feature)
  # day_of_week is 0..6 and month is 1..12 by construction, so scale with
  # those fixed bounds. A scaler fitted on each frame separately would map
  # train and test differently whenever one of them does not span the full
  # range of weekdays or months.
  df['day_of_week'] = df['day_of_week'] / 6.0
  df['month'] = (df['month'] - 1) / 11.0
  return df.drop(columns=['date', 'row_id'])

In [7]:
# Build the training feature matrix from the raw training frame
# (the target column was already split off into Y_train).
X_train = preprocess_data(train_df)
X_train.head()

Unnamed: 0,day_of_week,month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,store_KaggleRama,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker,season_autumn,season_spring,season_summer,season_winter
0,0.5,0.0,1,0,0,1,0,0,1,0,0,0,0,1
1,0.5,0.0,1,0,0,1,0,1,0,0,0,0,0,1
2,0.5,0.0,1,0,0,1,0,0,0,1,0,0,0,1
3,0.5,0.0,1,0,0,0,1,0,1,0,0,0,0,1
4,0.5,0.0,1,0,0,0,1,1,0,0,0,0,0,1


## The model

In [8]:
import tensorflow as tf

# Seed TensorFlow's global random generator.
tf.random.set_seed(42)

# Small fully connected regressor: Dense(100) -> Dense(10) -> Dense(1).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1))

# Mean absolute error is both the training loss and the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.mae,
    metrics=["mae"],
)

# Train for 20 epochs with validation_split=0.2.
model.fit(X_train, Y_train, validation_split=0.2, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4912f52dd0>

## Predict

In [11]:
# Load the competition test set and run it through the same preprocessing
# function that produced the training features.
test_df = pd.read_csv('test.csv')
X_test = preprocess_data(test_df)
X_test.head()

Unnamed: 0,day_of_week,month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,store_KaggleRama,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker,season_autumn,season_spring,season_summer,season_winter
0,0.166667,0.0,1,0,0,1,0,0,1,0,0,0,0,1
1,0.166667,0.0,1,0,0,1,0,1,0,0,0,0,0,1
2,0.166667,0.0,1,0,0,1,0,0,0,1,0,0,0,1
3,0.166667,0.0,1,0,0,0,1,0,1,0,0,0,0,1
4,0.166667,0.0,1,0,0,0,1,1,0,0,0,0,0,1


In [196]:
# Run the fitted Keras model on the preprocessed test features.
prediction = model.predict(X_test)

In [197]:
# Round the raw predictions to whole units sold and store them as integers.
submission = (
    pd.DataFrame(prediction, columns=['num_sold'])
    .round()
    .astype(int)
)
# Put the test-set row ids in front so the frame reads (row_id, num_sold).
submission.insert(0, 'row_id', test_df['row_id'])

In [198]:
# Sanity check: expect a row_id column followed by an integer num_sold column.
submission.head()

Unnamed: 0,row_id,num_sold
0,26298,197
1,26299,356
2,26300,92
3,26301,348
4,26302,659


In [199]:
# Write the neural-network submission; index=False keeps the DataFrame index
# out of the file so it contains only row_id and num_sold.
submission.to_csv('submission1.csv', index=False)

## Polynomial Regression

In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand the training features with PolynomialFeatures(degree=4), then fit a
# LinearRegression on the expanded matrix.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, Y_train)


LinearRegression()

In [12]:

# Reuse the PolynomialFeatures transformer that was fitted on the training
# features instead of creating and fitting a second one on the test set, so
# the test matrix is expanded by the same transformer the regression was
# trained with.
X_poly = poly_reg.transform(X_test)
pol_prediction = pol_reg.predict(X_poly)

In [13]:
# Round the polynomial-regression predictions to whole units sold (integers).
submission1 = (
    pd.DataFrame(pol_prediction, columns=['num_sold'])
    .round()
    .astype(int)
)
# Put the test-set row ids in front so the frame reads (row_id, num_sold).
submission1.insert(0, 'row_id', test_df['row_id'])

submission1.head()

Unnamed: 0,row_id,num_sold
0,26298,230
1,26299,372
2,26300,112
3,26301,395
4,26302,644


In [14]:
# Write the polynomial-regression submission without the DataFrame index.
# Note the naming: the variable is `submission1` but the file is
# 'submission4.csv' ('submission1.csv' holds the neural-network result).
submission1.to_csv('submission4.csv', index=False)