# My try on TPS January
### Tested many different basic methods, ended up with XGBoost for submission

TODO:
* Feature importance
* Better Hyperparameter tuning
* Test different models

### Please let me know of any improvements and feel free to use this as inspiration

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [28]:
# Read the three competition files from the working directory, in the order
# train / test / sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
train_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


# Clean data

In [52]:
def clean_data(df) -> pd.DataFrame:
    """Return a model-ready copy of a raw competition frame.

    The input frame is not modified. On the copy:

    * ``row_id`` is removed (not relevant as a feature),
    * ``country``, ``store`` and ``product`` are replaced by the integer
      codes of a ``LabelEncoder`` fitted on that column of *this* frame,
    * ``date`` is split on ``-`` into integer ``year``, ``month`` and
      ``day`` columns, after which ``date`` itself is removed.

    Because the encoders are fitted per call, codes from two different
    frames are only comparable when both frames hold the same set of values
    in each encoded column.
    """
    cleaned = df.copy()
    cleaned = cleaned.drop(columns=['row_id'])

    # One fresh encoder per categorical column.
    for column in ('country', 'store', 'product'):
        cleaned[column] = LabelEncoder().fit_transform(cleaned[column])

    # Make numeric features from the date string:
    # inspo from https://stackoverflow.com/questions/16453644/regression-with-date-variable-using-scikit-learn
    date_parts = cleaned['date'].str.split('-', expand=True).astype(int)
    cleaned[['year', 'month', 'day']] = date_parts

    return cleaned.drop(columns=['date'])

# Split and train model
The current number 1 on the Kaggle leaderboard has a score of 4.7. Goal: get under 5.

In [53]:
train = clean_data(train_df)

# Separate the target column from the features.
train_y_df = train['num_sold']
train_X_df = train.drop(columns=['num_sold'])

# Keep the last 10% of rows as a hold-out set. shuffle=False preserves the
# original row order, so the hold-out is the tail of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    train_X_df,
    train_y_df,
    test_size=0.1,
    shuffle=False,
)

In [54]:
# Preview the first hold-out rows to check the encoded features and date parts.
X_test.head()

Unnamed: 0,country,store,product,year,month,day
23668,2,1,0,2018,8,7
23669,2,1,2,2018,8,7
23670,0,0,1,2018,8,8
23671,0,0,0,2018,8,8
23672,0,0,2,2018,8,8


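### Accuracy is calculated using SMAPE (symmetric mean absolute percentage error; lower is better)

$$\text{SMAPE} = \frac{100}{n} \sum_{i=1}^{n} \frac{2\,|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|}$$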

In [55]:
def accuracy(real, preds):
    """Return the SMAPE (in percent) between actual and predicted values.

    SMAPE is an error measure, so lower is better; 0 means a perfect match.

    Parameters
    ----------
    real : array-like
        Actual values.
    preds : array-like
        Predicted values, paired with ``real`` by position.

    Returns
    -------
    float
        ``100 / n * sum(2 * |real - preds| / (|real| + |preds|))``, where a
        pair in which both values are 0 contributes 0.

    Raises
    ------
    ValueError
        If ``real`` is empty.
    """
    # Plain arrays make the pairing positional: two pandas Series with
    # different indexes would otherwise be aligned by label.
    real = np.asarray(real, dtype=float)
    preds = np.asarray(preds, dtype=float)
    if real.size == 0:
        raise ValueError("accuracy() needs at least one observation")

    numerator = 2.0 * np.abs(real - preds)
    denominator = np.abs(real) + np.abs(preds)

    # A zero denominator means real == preds == 0: a perfect prediction.
    # Count it as 0 error instead of letting 0/0 turn the result into NaN.
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return float(np.mean(ratios) * 100)

### Test linear regression as baseline

In [56]:
%%time
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Baseline model: scaled features fed into a plain linear regression.
# The second step is named after the estimator it holds (it was labelled
# 'knn', left over from the nearest-neighbours pipeline).
clf = Pipeline([
        ('scaler', RobustScaler()),
        ('linreg', LinearRegression())
    ])
clf.fit(X_train, y_train)

# accuracy() is declared as accuracy(real, preds), so pass the targets first.
print(f"Linear regression score on training data: {accuracy(y_train, clf.predict(X_train))}")
print(f"Linear regression score on testing data: {accuracy(y_test, clf.predict(X_test))}")

Linear regression score on training data: 37.09181591743494
Linear regression score on testing data: 34.82408552997975
CPU times: user 35.6 ms, sys: 15 ms, total: 50.5 ms
Wall time: 79.1 ms


### Test of Nearest neighbors regression
Did a lot better than linear regression, with a score of 12.35, but there is still a long way to go...

In [57]:
%%time
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Sweep k = 1..100 and remember the k with the lowest hold-out SMAPE.
best_test = float('inf')
best_train = float('inf')
best_k = 0
for neighbor in range(1, 101):
    clf = Pipeline([
        ('scaler', RobustScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=neighbor))
    ])
    clf.fit(X_train, y_train)
    # accuracy() is declared as accuracy(real, preds), so pass the targets first.
    test_score = accuracy(y_test, clf.predict(X_test))
    if test_score < best_test:
        best_test = test_score
        best_k = neighbor
        # The training score is only reported for the best k, so it is only
        # computed when a new best is found.
        best_train = accuracy(y_train, clf.predict(X_train))

# Report the scores recorded at the best k, not those of the final iteration.
print(f"Accuracy at best k on training data: {best_train}")
print(f"Best accuracy at k = {best_k} with testing accuracy of {best_test}")

KeyboardInterrupt: 

### Test of Random Forest Regressor
Got a testing accuracy of 8.9 without any hyperparameter tuning, getting a lot closer (the run recorded below scored 11.14 on the testing data).

In [45]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Sweep the number of trees (10, 20, ..., 190) and remember the setting with
# the lowest hold-out SMAPE.
best_acc = float('inf')
best_estimator = 0
train_acc_at_best_k = float('inf')
for estimator in range(10, 200, 10):
    # A fixed random_state makes the forest, and so the reported scores,
    # repeatable from run to run.
    model = RandomForestRegressor(n_estimators=estimator, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)
    # accuracy() is declared as accuracy(real, preds), so pass the targets first.
    curr_accuracy = accuracy(y_test, model.predict(X_test))
    if curr_accuracy < best_acc:
        best_acc = curr_accuracy
        best_estimator = estimator
        train_acc_at_best_k = accuracy(y_train, model.predict(X_train))

print(f"Accuracy at best estimator on training data: {train_acc_at_best_k}")
print(f"Best accuracy at estimator = {best_estimator} with testing accuracy of {best_acc}")

Accuracy at best estimator on training data: 3.6564272909548983
Best accuracy at estimator = 60 with testing accuracy of 11.135844573378282
CPU times: user 1min 6s, sys: 3.23 s, total: 1min 9s
Wall time: 42.3 s


### Test of sklearns GradientBoostingRegressor
Best testing score yet at 6.9 (the run recorded below scored 22.74 on the testing data); more hyperparameter tuning is available to improve it.

In [46]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# Scaled features fed into a gradient-boosting regressor with 10000 stages.
# A fixed random_state keeps the fitted model, and so the scores, repeatable.
clf = Pipeline([
        ('scaler', RobustScaler()),
        ('gbr', GradientBoostingRegressor(n_estimators=10000, random_state=42))
    ])
clf.fit(X_train, y_train)

# accuracy() is declared as accuracy(real, preds), so pass the targets first.
print(f"Gradient boost regression score on training data: {accuracy(y_train, clf.predict(X_train))}")
print(f"Gradient boost regression score on testing data: {accuracy(y_test, clf.predict(X_test))}")

Gradient boost regression score on training data: 9.152989045327187
Gradient boost regression score on testing data: 22.73724031648808
CPU times: user 1min 11s, sys: 1.1 s, total: 1min 12s
Wall time: 1min 34s


### Test of XGBoost

In [None]:
%%time
from xgboost import XGBRegressor
# The experimental flag must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import make_scorer

# Search grid: 5 learning rates x 4 tree depths at a fixed 500 estimators.
# The 'regressor__' prefix routes each parameter to that pipeline step.
params = {
            'regressor__learning_rate': np.linspace(0.03, 0.5, 5),
            'regressor__max_depth': np.linspace(6,9,4, dtype=int),
            'regressor__n_estimators': [500]
}


pipe = Pipeline([
        ('scaler', RobustScaler()),
        ('regressor', XGBRegressor())
    ])

# SMAPE is an error, so greater_is_better=False makes the search minimise it.
# random_state fixes the sample subsets drawn in the halving rounds, making
# the selected parameters repeatable.
search = HalvingGridSearchCV(
    pipe,
    params,
    n_jobs=-1,
    scoring=make_scorer(accuracy, greater_is_better=False),
    random_state=42,
)
search.fit(X_train, y_train)

print(search.best_params_)

# accuracy() is declared as accuracy(real, preds), so pass the targets first.
print(f"XGboost regression score on training data: {accuracy(y_train, search.predict(X_train))}")
print(f"XGboost regression score on testing data: {accuracy(y_test, search.predict(X_test))}")
# Parameters noted in the original cell (presumably the best of an earlier
# completed run - TODO confirm):
# {'regressor__learning_rate': 0.1475, 'regressor__max_depth': 7, 'regressor__n_estimators': 500}

KeyboardInterrupt: 

### XGBoost

In [65]:
%%time
# Fit XGBoost with its default settings and score it on both splits.
model = XGBRegressor()
model.fit(
    X_train,
    y_train,
)

# Training score first, then the hold-out score.
print(
    f"XGboost regression score on training data: {accuracy(y_train, model.predict(X_train))}"
)
print(
    f"XGboost regression score on testing data: {accuracy(y_test, model.predict(X_test))}"
)

XGboost regression score on training data: 6.915370884240903
XGboost regression score on testing data: 13.045642627959078
CPU times: user 2.78 s, sys: 57.8 ms, total: 2.84 s
Wall time: 1.06 s


## Submission of best model:

In [70]:
# Reload the raw files so nothing changed by earlier cells leaks in.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")

# clean_data() fits separate label encoders for each frame it is given, so
# the integer codes only mean the same thing in train and test when both
# frames contain exactly the same values. Fail loudly if they do not.
for column in ('country', 'store', 'product'):
    assert set(train_df[column]) == set(test_df[column]), (
        f"train and test hold different '{column}' values; "
        "their label encodings would not match"
    )

train = clean_data(train_df)
test = clean_data(test_df)

# NOTE: this rebinds X_train / y_train to the full training set, replacing
# the 90% split used by the evaluation cells above.
X_train = train.drop('num_sold',axis=1)
y_train = train['num_sold']

# Give the model the test features in the same column order it was trained
# on; this raises a KeyError if a training column is missing from test.
test = test[X_train.columns]

model = XGBRegressor()
model.fit(X_train, y_train)
preds = model.predict(test)

In [71]:
# Assign with [] rather than attribute access: attribute assignment only
# updates a column that already exists and would otherwise just set a Python
# attribute, leaving the written file without the predictions.
submission_df['num_sold'] = preds
submission_df.to_csv("Submission.csv", index=False)