In [12]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [13]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")
train_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


# Clean data

In [14]:
def clean_data(df) -> pd.DataFrame:
    df = df.copy()
    df.drop('row_id',axis=1, inplace=True) # Not relevant

    df['date'] = pd.to_datetime(df['date'])

    # Encode countries
    le = LabelEncoder().fit(df['country'])
    df['country'] = le.transform(df['country'])
    
    # Encode stores
    le = LabelEncoder().fit(df['store'])
    df['store'] = le.transform(df['store'])
    
    # Encode product
    le = LabelEncoder().fit(df['product'])
    df['product'] = le.transform(df['product'])

    df['date'] = df['date'].values.astype(float)

    return df

# Split and train model
Current nr 1 on Kaggle has score of 4.7, Goal: top 100 under score of 6

In [15]:
train = clean_data(train_df)

train_X_df = train.drop('num_sold', axis=1)
train_y_df = train['num_sold']
X_train, X_test, y_train, y_test = train_test_split(train_X_df, train_y_df, test_size=0.2)

### Accuracy is calculated using SMAPE

In [16]:
def accuracy(real,preds):
    """Function to calculate accuracy using SMAPE"""
    return 1/len(real) * np.sum(2 * np.abs(real - preds) / (np.abs(real) + np.abs(preds)) * 100)

### Test linear regression as baseline

In [26]:
%%time
from sklearn.linear_model import LinearRegression
clf = Pipeline([
        ('scaler', RobustScaler()),
        ('knn', LinearRegression())
    ])
clf.fit(X_train, y_train)

print(f"Linear regression score on training data: {accuracy(clf.predict(X_train), y_train)}")
print(f"Linear regression score on testing data: {accuracy(clf.predict(X_test), y_test)}")

Linear regression score on training data: 35.562521578878794
Linear regression score on testing data: 35.817697790010115
CPU times: user 28.1 ms, sys: 3.71 ms, total: 31.8 ms
Wall time: 55.2 ms


### Test of Nearest neighbors regression
Did a lot better than Linear regression but long way to go...

In [24]:
%%time
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

best_test = float('inf')
best_train = float('inf')
best_k = 0
for neighbor in range(1, 101):
    clf = Pipeline([
        ('scaler', RobustScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=neighbor))
    ])
    clf.fit(X_train, y_train)
    test_score = accuracy(clf.predict(X_test), y_test)
    train_score = accuracy(clf.predict(X_train), y_train)
    if test_score < best_test:
        best_test = test_score
        best_k = neighbor
        best_train = train_score
        
print(f"Accuracy at best k on training data: {train_score}")
print(f"Best accuracy at k = {best_k} with testing accuracy of {test_score}")

Accuracy at best k on training data: 12.12072829176502
Best accuracy at k = 12.355242141649832 with testing accuracy of 12.355242141649832
CPU times: user 1min 1s, sys: 2.88 s, total: 1min 4s
Wall time: 1min 12s


### Test of Random Forest Regressor
Got a testing accuracy of 8.9 without any hyperparameter tuning, getting alot closer

In [19]:
%%time
from sklearn.ensemble import RandomForestRegressor

test_estimators = False
best_acc = float('inf')
best_estimator = 0
train_acc_at_best_k = float('inf')
for estimator in range(10, 200, 10):
    model = RandomForestRegressor(n_estimators=estimator, n_jobs=-1)
    model.fit(X_train, y_train)
    curr_accuracy = accuracy(model.predict(X_test), y_test)
    if curr_accuracy < best_acc:
        best_acc = curr_accuracy
        best_estimator = estimator
        train_acc_at_best_k = accuracy(model.predict(X_train), y_train)

print(f"Accuracy at best estimator on training data: {train_acc_at_best_k}")
print(f"Best accuracy at estimator = {best_estimator} with testing accuracy of {best_acc}")

Accuracy at best estimator on training data: 3.3337512396356415
Best accuracy at estimator = 160 with testing accuracy of 9.03137694766215
CPU times: user 1min 8s, sys: 3.2 s, total: 1min 11s
Wall time: 26.5 s


### Test of sklearns GradientBoostingRegressor
Best testing score yet of 7.05, more hyperparameter tuning available to perfect

In [25]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=10000)
model.fit(X_train, y_train)

print(f"Gradient boost regression score on training data: {accuracy(model.predict(X_train), y_train)}")
print(f"Gradient boost regression score on testing data: {accuracy(model.predict(X_test), y_test)}")

Gradient boost regression score on training data: 5.5370020200551755
Gradient boost regression score on testing data: 7.032682567187726
CPU times: user 1min 17s, sys: 1.28 s, total: 1min 18s
Wall time: 1min 27s
