In [28]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.5.2
# #!pip uninstall pandas -y
# #!pip install pandas==2.1.4


Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Successfully uninstalled scikit-learn-1.5.2
Collecting scikit-learn==1.5.2
  Using cached scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../data/raw/blood_glucose/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

../data/raw/blood_glucose/activities.txt
../data/raw/blood_glucose/sample_submission.csv
../data/raw/blood_glucose/test.csv
../data/raw/blood_glucose/train.csv
../data/raw/blood_glucose/.ipynb_checkpoints\activities-checkpoint.txt
../data/raw/blood_glucose/.ipynb_checkpoints\sample_submission-checkpoint.csv
../data/raw/blood_glucose/.ipynb_checkpoints\test-checkpoint.csv
../data/raw/blood_glucose/.ipynb_checkpoints\train-checkpoint.csv


In [6]:
# #!pip install optuna
# !pip uninstall lightgbm -y
# !pip install lightgbm --config-settings=cmake.define.USE_CUDA=ON

In [10]:
import lightgbm
import numpy as np
def check_gpu_support():
    try:
        data = np.random.rand(50, 2)
        label = np.random.randint(2, size=50)
        train_data = lightgbm.Dataset(data, label=label)
        params = {'num_iterations': 1, 'device': 'gpu'}
        gbm = lightgbm.train(params, train_set=train_data)
        return True
    except Exception as e:
        return False
check_gpu_support()



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 36
[LightGBM] [Info] Number of data points in the train set: 50, number of used features: 2
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2 dense feature groups (0.00 MB) transferred to GPU in 0.000206 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.460000


True

In [12]:

# check xgboost version
import xgboost as xg

In [32]:

import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error
import gc  # Import garbage collection for manual memory management

def remove_columns(df, columns_to_remove):
    """
    Remove specified columns from the DataFrame.

    :param df: Input DataFrame.
    :param columns_to_remove: List of column names to remove.
    :return: DataFrame with specified columns removed.
    """
    df = df.drop(columns=columns_to_remove, errors='ignore')
    return df

# Function to reduce memory usage by downcasting numerical data types
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            if pd.api.types.is_integer_dtype(df[col]):
                df[col] = pd.to_numeric(df[col], downcast='integer')
            else:
                df[col] = pd.to_numeric(df[col], downcast='float')
        else:
            # Convert object types to category type for memory efficiency
            df[col] = df[col].astype('category')
    
    return df

# Function to preprocess data and train model with Optuna for hyperparameter tuning
def preprocess_and_train_with_optuna(dataset_path, target_variable, columns_to_remove=None, n_trials=3, use_knn_imputer=True, chunk_size=5000):
    
    # Load the dataset in chunks for memory efficiency
    chunks = pd.read_csv(dataset_path, chunksize=chunk_size)

    # Concatenate chunks into a single DataFrame while reducing memory usage
    data = pd.concat([reduce_memory_usage(chunk) for chunk in chunks])
    
    if columns_to_remove is not None:
        data = remove_columns(data, columns_to_remove)
        
    # Explicit garbage collection to free up memory
    gc.collect()

    # Check if target variable exists in dataset
    if target_variable not in data.columns:
        raise ValueError(f"{target_variable} not found in the dataset")

    # Separate features and target
    X = data.drop(columns=[target_variable])
    y = data[target_variable]

    # Optimize target column by converting to float32
    y = y.astype(np.float32)

    # Identify numerical and categorical columns
    numerical_features = X.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    # Intelligent imputation for numerical features
    if use_knn_imputer:
        # Use KNN Imputer for numerical columns (more advanced, but can be heavy on memory)
        numerical_imputer = KNNImputer(n_neighbors=3)
    else:
        # Use Median Imputer for numerical columns
        numerical_imputer = SimpleImputer(strategy='median')

    # Preprocessing for numerical features (KNN Imputation or Median Imputation and Scaling)
    numerical_transformer = Pipeline(steps=[
        ('imputer', numerical_imputer),
        ('scaler', StandardScaler())
    ])

    # Intelligent imputation for categorical features (add 'Missing' as a new category)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine numerical and categorical transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Define the objective function for Optuna
    def objective(trial):
        # Define hyperparameter search space
        xgboost_param = {
            'device':'gpu',
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'gpu_hist',  # Enable GPU for XGBoost
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        }

        # Create LightGBM model
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', xg.XGBRegressor(**xgboost_param, random_state=42))
        ])
        print("Training")
        # Cross-validation with 5-fold to evaluate the performance using RMSE
        cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
        rmse = -np.mean(cv_scores)  # RMSE (scikit-learn uses negative RMSE by default)
        return rmse

    # Create an Optuna study and optimize it
    study = optuna.create_study(direction='minimize')
   
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # Train the final model using the best hyperparameters
    best_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(**best_params, random_state=42))
    ])

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    best_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = best_model.predict(X_test)

    # Calculate and print final performance using RMSE
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f'Test RMSE: {rmse}')

    # Explicit garbage collection
    gc.collect()

    return best_model, study




In [34]:
dataset_path = pd.read_csv(r'../data/raw/blood_glucose/train.csv')
target_variable = 'bg+1:00'

best_model, study = preprocess_and_train_with_optuna(dataset_path, target_variable)#, columns_to_remove=None, n_trials=3, use_knn_imputer=True, chunk_size=5000):
   

  dataset_path = pd.read_csv(r'../data/raw/blood_glucose/train.csv')


TypeError: argument of type 'method' is not iterable

In [22]:

# Predict function that accepts a DataFrame
def predict(model, input_data,columns_to_remove = None):
    """
    Predicts target values using the trained model on new input data.

    :param model: Trained regression model pipeline
    :param input_data: Pandas DataFrame containing input features
    :return: Predicted values
    """
    # Ensure input data is in DataFrame format
    if not isinstance(input_data, pd.DataFrame):
        raise ValueError("Input data must be a pandas DataFrame")
    
    if columns_to_remove is not None:
        data = remove_columns(input_data, columns_to_remove)
    
    # Predict using the trained model
    predictions = model.predict(data)

    return predictions     

In [8]:
import pickle
with open('/kaggle/input/trainedlightgbm/scikitlearn/default/1/finalmodel.pkl', 'rb') as file:
    model = pickle.load(file)

In [9]:
# Example DataFrame for prediction (replace with actual data)
test =  pd.read_csv('/kaggle/input/brist1d/test.csv')

# Get predictions
predictions = predict(model, test,columns_to_remove=['id'])




In [10]:
test['bg+1:00'] = predictions

In [11]:
final_sub_df = test[['id','bg+1:00']]

In [12]:
final_sub_df.to_csv('submission.csv',index = False)