##   Import danych

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

## Zaczytanie danych i eksploracja

In [97]:
# wczytywanie danych
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [98]:
#informacje o danych
print(train.info())
print(train.describe())
print(test.info())
print(test.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB
None
                  id       num_sold
count  230130.000000  221259.000000
mean   115064.500000     752.527382
std     66432.953062     690.165445
min         0.000000       5.000000
25%     57532.250000     219.000000
50%    115064.500000     605.000000
75%    172596.750000    1114.000000
max    230129.000000    5939.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ---

In [99]:
#sprawdzenie braków danych
print(train.isnull().sum())
print(test.isnull().sum())

id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64
id         0
date       0
country    0
store      0
product    0
dtype: int64


In [100]:
#kopia danych
train_copy = train.copy()

## Definicja funkcji

In [101]:
def preprocess_date(df):
    df['year'] = pd.to_datetime(df['date']).dt.year
    df['month'] = pd.to_datetime(df['date']).dt.month
    df['day'] = pd.to_datetime(df['date']).dt.day

    # Usuń oryginalną kolumnę daty
    df = df.drop(columns=['date'])
    return df

In [102]:
def get_colls_to_preprocessing(df):
    numeric_features = df.select_dtypes(include=['int64','float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns
    categorical_features = categorical_features[categorical_features != 'date']
    return numeric_features, categorical_features

In [103]:
# Funkcja tworząca preprocesor
def create_preprocessor(numeric_features, categorical_features):
    #preprocessor 
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Uzupełnianie średnią
        ('scaler', StandardScaler())                 # Skalowanie danych
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Uzupełnianie najczęstszą wartością
        ('onehot', OneHotEncoder(handle_unknown='ignore'))     # OneHotEncoder dla kategorii
    ])
    return ColumnTransformer(  #zwraca obiekt ColumnTransformer
    transformers=[
        ('num', numeric_transformer, numeric_features),      # Numeryczne kolumny
        ('cat', categorical_transformer, categorical_features),  # Kategoryczne kolumny
    ]
)

In [104]:
# Funkcja tworząca model
def get_models(preprocessor):
    models = {
    'Linear Regression': Pipeline(steps=[ #wywołanie Pipeline z dwoma krokami: preprocessor i model
        ('preprocessor', preprocessor),
        ('model', LinearRegression()) #stworzenie instancji modelu
    ]),
    'Random Forest': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]),
    'XGBoost': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor())
    ])
}
    return models

In [105]:
# Funkcja oceniająca modele
def get_best_model_score(models, x, y):
    best_model = None
    best_score= float('inf')

    for name, model in models.items():
        scores = -cross_val_score(model, x, y, cv=5, scoring='neg_mean_absolute_percentage_error')
        score = np.mean(scores)
        print(f'{name}: {score}')
        if score < best_score:
            best_score = score
            best_model = model
            
    return best_model, best_score

## Trenowanie 1 modelu

In [106]:
# Usuń wiersze z brakującymi wartościami w 'num_sold' zarówno z x, jak i y
train = train.dropna(subset=['num_sold'])

# Zaktualizuj zmienne x i y po usunięciu brakujących danych
x = train.drop(columns=['id', 'num_sold'], axis=1)
y = train['num_sold']

In [107]:
# Przetwarzanie daty
x = preprocess_date(x)

# Pobieranie kolumn do przetwarzania
numeric_features, categorical_features = get_colls_to_preprocessing(x)

# Tworzenie preprocesora
preprocesor = create_preprocessor(numeric_features, categorical_features)

# Tworzenie modeli i wybór najlepszego
models = get_models(preprocesor)

best_model, best_score = get_best_model_score(models, x, y)

#Fitting najlepszego modelu
best_model.fit(x, y)

Linear Regression: 4.390755961409231
Random Forest: 0.1528364045771197
XGBoost: 0.15283456747451746


### użycie modelu do przewidywania num_sold w train_copy

In [108]:
train_copy = preprocess_date(train_copy)

# Podział danych na brakujące i pełne wartości
train_filled = train_copy[train_copy['num_sold'].notna()]
train_missing = train_copy[train_copy['num_sold'].isna()]

# Podział danych na x i y
x_train_filled = train_filled.drop(columns=['id', 'num_sold'], axis=1)
y_train_filled = train_filled['num_sold']

x_train_missing = train_missing.drop(columns=['id', 'num_sold'], axis=1)

# predykcja
y_train_missing = best_model.predict(x_train_missing)

# Uzupełnienie braków w danych
train_missing['num_sold'] = y_train_missing

# Połączenie danych w całość
train_complete = pd.concat([train_filled, train_missing]).sort_index()

# Wyświetlenie wyników
print(train_complete)

            id    country                 store             product  \
0            0     Canada     Discount Stickers   Holographic Goose   
1            1     Canada     Discount Stickers              Kaggle   
2            2     Canada     Discount Stickers        Kaggle Tiers   
3            3     Canada     Discount Stickers            Kerneler   
4            4     Canada     Discount Stickers  Kerneler Dark Mode   
...        ...        ...                   ...                 ...   
230125  230125  Singapore  Premium Sticker Mart   Holographic Goose   
230126  230126  Singapore  Premium Sticker Mart              Kaggle   
230127  230127  Singapore  Premium Sticker Mart        Kaggle Tiers   
230128  230128  Singapore  Premium Sticker Mart            Kerneler   
230129  230129  Singapore  Premium Sticker Mart  Kerneler Dark Mode   

           num_sold  year  month  day  
0        127.736832  2010      1    1  
1        973.000000  2010      1    1  
2        906.000000  2010  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_missing['num_sold'] = y_train_missing


## Trenowanie 2 modelu

In [109]:
# podział danych na x i y
x_complete = train_complete.drop(columns=['id', 'num_sold'], axis=1)
y_complete = train_complete['num_sold']

In [110]:
# kolumny do przetwarzania
numeric_features_complete, categorical_features_complete = get_colls_to_preprocessing(x_complete)

# Tworzenie preprocesora
preprocesor_complete = create_preprocessor(numeric_features_complete, categorical_features_complete)

# Tworzenie modeli i wybór najlepszego
models_complete = get_models(preprocesor_complete)

# przypisanie najlepszego modelu do zmiennej
best_model_complete, best_score_complete = get_best_model_score(models_complete, x_complete, y_complete)
print(f'The best model is {best_model_complete} with score {best_score_complete}')

# Fitting najlepszego modelu
best_model_complete.fit(x_complete, y_complete)

Linear Regression: 6.102887950315592
Random Forest: 0.14696988698255328
XGBoost: 0.146971987035912
The best model is Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index([], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                  

### Zapisanie predykcji w pliku csv

In [111]:
# Przetwarzanie daty w testowym zbiorze
test = preprocess_date(test)

# Predykcja
predictions = best_model_complete.predict(test.drop(columns=['id'], axis=1))

# Zapisanie wyników
results = pd.DataFrame({
    'id': test['id'],  # Pobranie ID z testowego DataFrame
    'num_sold': predictions  # Wyniki przewidywań
})

# Zapisanie wyników do pliku CSV (opcjonalne)
results.to_csv('predictions.csv', index=False)