In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Function to preprocess date
def preprocess_date(df):
    """Replace the ``date`` column with ``year``, ``month`` and ``day`` columns.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``date`` and with ``year``, ``month`` and ``day``
        set from the parsed dates (appended in that order when not already
        present).

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    """
    # Parse once and reuse the result instead of re-parsing per component.
    parsed = pd.to_datetime(df['date'])
    # ``assign`` returns a new frame, so the caller's ``df`` keeps its
    # original columns (no in-place column insertion on the argument).
    return df.assign(
        year=parsed.dt.year,
        month=parsed.dt.month,
        day=parsed.dt.day,
    ).drop(columns=['date'])

# Function to extract numeric and categorical features
def get_column_types(df):
    """Split the columns of ``df`` into numeric and categorical feature names.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to inspect.

    Returns
    -------
    tuple of (pandas.Index, pandas.Index)
        ``(numeric_features, categorical_features)``. Numeric features are
        all columns with a numeric dtype; categorical features are the
        ``object`` columns, with a column named ``'date'`` excluded.
    """
    # 'number' matches every numeric width (int32, float32, ...). Listing only
    # int64/float64 silently skipped narrower columns, e.g. the int32 values
    # that ``Series.dt.year`` returns on pandas >= 2.0.
    numeric_features = df.select_dtypes(include='number').columns
    categorical_features = df.select_dtypes(include=['object']).columns
    # A raw 'date' column is never treated as a categorical feature.
    return numeric_features, categorical_features[categorical_features != 'date']

# Function to create a preprocessor
def create_preprocessor(numeric_features, categorical_features):
    """Build the ColumnTransformer applied to the feature frame.

    Parameters
    ----------
    numeric_features
        Column selection routed through mean imputation followed by
        standard scaling.
    categorical_features
        Column selection routed through most-frequent imputation followed
        by one-hot encoding (categories unseen at fit time are ignored).

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    impute_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    impute_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    branches = [
        ('num', impute_then_scale, numeric_features),
        ('cat', impute_then_encode, categorical_features),
    ]
    return ColumnTransformer(transformers=branches)

# Function to create models
def create_models(preprocessor, random_state=42):
    """Build one preprocessing + regressor pipeline per candidate model.

    Parameters
    ----------
    preprocessor
        Unfitted scikit-learn transformer used as the first pipeline step.
        It is cloned for every pipeline and is never fitted itself.
    random_state : int or None, default 42
        Seed passed to the stochastic regressors (random forest and XGBoost)
        so repeated runs give the same models. Pass ``None`` to leave them
        unseeded.

    Returns
    -------
    dict of str -> Pipeline
        Pipelines keyed by ``'Linear Regression'``, ``'Random Forest'`` and
        ``'XGBoost'``, each with steps ``'preprocessor'`` and ``'model'``.
    """
    # Local import: ``clone`` is only needed by this function.
    from sklearn.base import clone

    regressors = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'XGBoost': XGBRegressor(random_state=random_state),
    }
    # Each pipeline gets its own copy of the preprocessor; sharing a single
    # instance would let fitting one pipeline overwrite the fitted state
    # seen by the others.
    return {
        name: Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', regressor),
        ])
        for name, regressor in regressors.items()
    }

# Function to evaluate models
def evaluate_models(models, x, y, scoring='neg_mean_absolute_percentage_error'):
    """Cross-validate every candidate and return the one with the lowest error.

    Each model is scored with 5-fold ``cross_val_score``; the fold scores are
    negated and averaged, and the model with the smallest resulting mean is
    kept. Per-model means, the winning model and its mean are logged at INFO
    level.

    Parameters
    ----------
    models : dict
        Mapping of display name to estimator.
    x, y
        Features and target handed to ``cross_val_score``.
    scoring : str
        Scorer name forwarded to ``cross_val_score``.

    Returns
    -------
    The estimator with the lowest mean negated score, or ``None`` when no
    model's mean compares below infinity (for example, ``models`` is empty).
    """
    winner = None
    lowest = float('inf')
    for label, candidate in models.items():
        fold_values = -cross_val_score(candidate, x, y, cv=5, scoring=scoring)
        mean_value = np.mean(fold_values)
        logging.info(f'{label}: {mean_value}')
        # Only a strictly smaller mean replaces the current winner.
        if not mean_value < lowest:
            continue
        lowest, winner = mean_value, candidate
    logging.info(f'Best model: {winner}')
    logging.info(f'Best score: {lowest}')
    return winner

# Function to fill missing values
def fill_missing_values(train, best_model):
    """Impute missing ``num_sold`` values with predictions from ``best_model``.

    The model is fitted on the rows whose ``num_sold`` is present and then
    used to predict ``num_sold`` for the rows where it is missing.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``id``, ``date`` and ``num_sold`` columns plus the feature
        columns. It is copied before processing.
    best_model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place as a side effect.

    Returns
    -------
    pandas.DataFrame
        Date-preprocessed frame (``date`` replaced by ``year``/``month``/
        ``day``) with every ``num_sold`` populated, sorted by index.
    """
    train = preprocess_date(train.copy())
    has_target = train['num_sold'].notna()
    # Take explicit copies: writing into a boolean-mask slice raises
    # SettingWithCopyWarning and is not guaranteed to behave as intended.
    train_filled = train[has_target].copy()
    train_missing = train[~has_target].copy()

    non_feature_columns = ['num_sold', 'id']
    best_model.fit(
        train_filled.drop(columns=non_feature_columns),
        train_filled['num_sold'],
    )

    # Nothing to impute: skip predict(), which scikit-learn estimators reject
    # for zero-row input.
    if train_missing.empty:
        return train_filled.sort_index()

    train_missing['num_sold'] = best_model.predict(
        train_missing.drop(columns=non_feature_columns)
    )
    return pd.concat([train_filled, train_missing]).sort_index()

# Function to predict and save results
def predict_and_save(model, test, filename='predictions.csv'):
    """Predict ``num_sold`` for ``test`` and write the result to a CSV file.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    test : pandas.DataFrame
        Frame with an ``id`` column and a ``date`` column plus the features.
    filename : str
        Destination path of the CSV (written without the index).

    Returns
    -------
    pandas.DataFrame
        Two-column frame with ``id`` and the predicted ``num_sold``.
    """
    # The identifier is not a feature; the date is expanded into components.
    features = preprocess_date(test.drop(columns=['id']))
    predicted = model.predict(features)
    submission = pd.DataFrame({
        'id': test['id'],
        'num_sold': predicted,
    })
    submission.to_csv(filename, index=False)
    return submission

In [None]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Prepare training data for model selection: only rows with a known target.
# `train` itself is left intact so that the rows with a missing `num_sold`
# are still present when they are imputed below (overwriting `train` with the
# dropna() result would leave the imputation step with nothing to fill).
train_labeled = train.dropna(subset=['num_sold'])
x_labeled = preprocess_date(train_labeled.drop(columns=['id', 'num_sold']))
y_labeled = train_labeled['num_sold']
numeric_features, categorical_features = get_column_types(x_labeled)
preprocessor = create_preprocessor(numeric_features, categorical_features)
models = create_models(preprocessor)

# Evaluate models
best_model = evaluate_models(models, x_labeled, y_labeled)

# Fill missing values (uses the full frame, including rows without a target)
train_complete = fill_missing_values(train, best_model)

# Prepare complete data
x = train_complete.drop(columns=['id', 'num_sold'])
y = train_complete['num_sold']

# Fit the best model
best_model.fit(x, y)

# Predict and save results
results = predict_and_save(best_model, test)