In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [3]:
# Load the training CSV and preview its first rows.
df_train = pd.read_csv(
    '/content/drive/MyDrive/Machine Learning/'
    'Forecasting Mini-Course Sales/train.csv'
)
df_train.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49


In [4]:
def des_ana(dataframe):
    """Print a quick descriptive overview of ``dataframe``.

    Sections are written in this order, each under a banner line: shape,
    ``info()`` summary, column index, ``describe()`` statistics, per-column
    null counts and per-column unique-value counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print('------------------Shape---------------------')
    print(f'{dataframe.shape}\n\n')
    print('------------------Info---------------------')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be interpolated into an f-string (that printed a stray "None").
    dataframe.info()
    # Same two blank lines of spacing that the other sections get.
    print('\n')
    print('------------------Coumns---------------------')
    print(f'{dataframe.columns}\n\n')
    print('------------------Descritive---------------------')
    print(f'{dataframe.describe()}\n\n')
    print('------------------Null Values---------------------')
    print(f'{dataframe.isnull().sum()}\n\n')
    print('------------------Unique Values---------------------')
    print(f'{dataframe.nunique()}\n\n')

# Print the descriptive overview of the training frame.
des_ana(df_train)

------------------Shape---------------------
(136950, 6)


------------------Info---------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        136950 non-null  int64 
 1   date      136950 non-null  object
 2   country   136950 non-null  object
 3   store     136950 non-null  object
 4   product   136950 non-null  object
 5   num_sold  136950 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.3+ MB
None


------------------Coumns---------------------
Index(['id', 'date', 'country', 'store', 'product', 'num_sold'], dtype='object')


------------------Descritive---------------------
                 id       num_sold
count  136950.00000  136950.000000
mean    68474.50000     165.522636
std     39534.20402     183.691575
min         0.00000       2.000000
25%     34237.25000      46.000000
50%     68474.50000      98.0

In [5]:
# Parse the date strings (object dtype after read_csv) into datetime values.
df_train['date'] = pd.to_datetime(df_train['date'])

In [6]:
# Features: every column except the row id, the date and the target.
X = df_train.drop(columns=['id', 'date', 'num_sold']).to_numpy()
# Target: the num_sold column.
y = df_train['num_sold'].to_numpy()

In [7]:
# Show the first feature row as a sanity check.
print(X[0])

['Argentina' 'Kaggle Learn' 'Using LLMs to Improve Your Coding']


In [8]:
# One-hot encode the feature columns; the fitted encoder object is kept so
# the same mapping can be applied to other data later.
encoder = OneHotEncoder()
X = encoder.fit(X).transform(X)

# Hyperparameter tuning with XGBoost

In [9]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [10]:
# Search space per candidate model: an estimator instance plus the
# hyperparameter grid to explore for it.
params = {
    'XGBoots': {
        'model': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 300, 500],
            'max_depth': [5, 10, 15],
            'learning_rate': [0.1, 0.15, 0.2],
        },
    },
}

# One summary record per model, so the results survive past the loop
# (previously this list was created but never filled).
scores = []

for model_name, mp in params.items():
    # 5-fold cross-validated grid search. No `scoring` is passed, so the
    # estimator's own default scorer is used.
    # NOTE(review): the training rows appear to be date-ordered and the test
    # period follows the training period; a time-aware splitter may give a
    # more honest score than plain K-fold -- confirm.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    })
    print(clf.best_score_)
    print(clf.best_params_)

0.9409270871371114
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}


In [None]:
# cv_results_ is a dict of arrays; show it as a table ordered by rank instead
# of printing the raw dict.
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

# Final training and test predictions

In [12]:
# Load the test CSV and preview its first rows.
df_test = pd.read_csv(
    '/content/drive/MyDrive/Machine Learning/'
    'Forecasting Mini-Course Sales/test.csv'
)
df_test.head()

Unnamed: 0,id,date,country,store,product
0,136950,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding
1,136951,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs
2,136952,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People
3,136953,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions
4,136954,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better


In [13]:
# Print the descriptive overview of the test frame.
des_ana(df_test)

------------------Shape---------------------
(27375, 5)


------------------Info---------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27375 entries, 0 to 27374
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       27375 non-null  int64 
 1   date     27375 non-null  object
 2   country  27375 non-null  object
 3   store    27375 non-null  object
 4   product  27375 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB
None


------------------Coumns---------------------
Index(['id', 'date', 'country', 'store', 'product'], dtype='object')


------------------Descritive---------------------
                  id
count   27375.000000
mean   150637.000000
std      7902.626146
min    136950.000000
25%    143793.500000
50%    150637.000000
75%    157480.500000
max    164324.000000


------------------Null Values---------------------
id         0
date       0
country    0
store      0
product    0


In [20]:
# Select the test features by name rather than by position, so the selection
# does not depend on the CSV's column order. Only id and date are dropped
# here because the test frame has no num_sold column.
X_test = df_test.drop(['id', 'date'], axis=1).values

In [22]:
# Encode the test features with the encoder already fitted on the training
# data. Re-fitting here would derive the one-hot columns from the test set,
# which need not match the columns the model is trained on.
X_test = encoder.transform(X_test)

In [23]:
# Train the final regressor on the full training set, using the values the
# grid search reported as best, then predict for the encoded test features.
xgb = XGBRegressor(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
).fit(X, y)
y_pred = xgb.predict(X_test)

In [24]:
# Display the array of predictions produced for the test rows.
y_pred

array([ 29.88474 ,  29.506907,   4.237578, ...,  68.59347 , 370.50928 ,
       336.61697 ], dtype=float32)

In [25]:
# Wrap the predictions in a single-column frame named after the target.
result = pd.DataFrame({'num_sold': y_pred})

In [30]:
# Build the submission frame first, then write it. to_csv() returns None when
# given a path, so chaining it onto the assignment left `submission` as None.
submission = pd.concat([df_test['id'], result], axis=1)
submission.to_csv('submission.csv', index=False)