In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

Transformacja danych

In [21]:
df = pd.read_csv('train.csv', sep=',')


# Krok 1: Usunięcie kolumny 'id'
df.drop(columns=['id'], inplace=True)

# Krok 2: Zmiana danych kategorycznych na liczbowe (Label Encoding)
categorical_columns = ['country', 'store', 'product']  # Kategoryczne kolumny

# Krok 3: Normalizacja numeryczna (Min-Max Scaling) dla num_sold
scaler = MinMaxScaler()

# Krok 4: Dodanie kolumny z dniem tygodnia (i konwersja na liczbę)
df['date'] = pd.to_datetime(df['date'])
df['weekday'] = df['date'].dt.weekday

# Teraz zamienimy kategorie na liczby
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Zapamiętujemy encodery, żeby móc przekształcać później

# Normalizowanie kolumny num_sold
df['num_sold'] = scaler.fit_transform(df[['num_sold']])

# Ostateczny wynik
print(df)


             date  country  store  product  num_sold  weekday
0      2010-01-01        0      0        0       NaN        4
1      2010-01-01        0      0        1  0.163128        4
2      2010-01-01        0      0        2  0.151837        4
3      2010-01-01        0      0        3  0.070442        4
4      2010-01-01        0      0        4  0.081901        4
...           ...      ...    ...      ...       ...      ...
230125 2016-12-31        5      1        0  0.077688        5
230126 2016-12-31        5      1        1  0.489046        5
230127 2016-12-31        5      1        2  0.386586        5
230128 2016-12-31        5      1        3  0.208460        5
230129 2016-12-31        5      1        4  0.272497        5

[230130 rows x 6 columns]


uzupełnianie braków w num_sold

In [24]:
imputer = KNNImputer(n_neighbors=5)  # Możesz dostosować liczbę sąsiadów
df_imputed = pd.DataFrame(imputer.fit_transform(df.select_dtypes(include=['float64', 'int64'])))
df[df_imputed.columns] = df_imputed  # Uzupełnianie brakujących danych
df

Unnamed: 0,date,country,store,product,num_sold,weekday,0,1,2,3
0,2010-01-01,0,0,0,,4,0.0,0.0,0.0,0.081699
1,2010-01-01,0,0,1,0.163128,4,0.0,0.0,1.0,0.163128
2,2010-01-01,0,0,2,0.151837,4,0.0,0.0,2.0,0.151837
3,2010-01-01,0,0,3,0.070442,4,0.0,0.0,3.0,0.070442
4,2010-01-01,0,0,4,0.081901,4,0.0,0.0,4.0,0.081901
...,...,...,...,...,...,...,...,...,...,...
230125,2016-12-31,5,1,0,0.077688,5,5.0,1.0,0.0,0.077688
230126,2016-12-31,5,1,1,0.489046,5,5.0,1.0,1.0,0.489046
230127,2016-12-31,5,1,2,0.386586,5,5.0,1.0,2.0,0.386586
230128,2016-12-31,5,1,3,0.208460,5,5.0,1.0,3.0,0.208460


In [31]:
df_filtered = df[["date", "country", "store", "product", "weekday", 3]]  

# Zmieniamy nazwę kolumny 3 na 'num_sold'
df_filtered = df_filtered.rename(columns={3: "num_sold"})
df_filtered
df_filtered.to_csv('train_transform.csv', index=False)

In [28]:
print(df.columns)

Index(['date', 'country', 'store', 'product', 'num_sold', 'weekday', 0, 1, 2,
       3],
      dtype='object')
