In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.feature_selection import mutual_info_regression
import category_encoders as ce

# Define data types for columns in CSV files
dtype_spec = {
    'number': str,
    'grid': 'float64',
    'position_x': 'float64',
    'positionOrder': 'float64',
    'time_x': str,
    'timetaken_in_millisec': 'float64',
    'fastestLapTime': str,
    'max_speed': 'float64',
    'url_x': str,
    'url_y': str,
    'url': str,
    'date': str,
    'dob': str
}

# Define values to be treated as NA when reading CSV files
na_values = ['\\N', 'null', 'None', '']

# Function to process a chunk of data
def process_chunk(df, is_train=True):
    # Convert time columns to seconds
    df['time_x'] = df['time_x'].apply(lambda x: pd.to_timedelta(x).total_seconds() if pd.notnull(x) else np.nan)
    df['fastestLapTime'] = df['fastestLapTime'].apply(lambda x: pd.to_timedelta(x).total_seconds() if pd.notnull(x) else np.nan)
    
    # Convert date columns
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
    df['dob'] = pd.to_datetime(df['dob'], format='%Y-%m-%d', errors='coerce')
    
    # Feature engineering
    df['age'] = df['date'].dt.year - df['dob'].dt.year
    df['experience'] = df.groupby('driverRef')['date'].rank(method='dense')
    df['track_familiarity'] = df.groupby(['driverRef', 'grand_prix'])['date'].rank(method='dense')
    df['team_performance'] = df.groupby(['constructorRef', 'date'])['points'].transform('mean')
    if is_train and 'position' in df.columns:
        df['quali_performance'] = df['grid'] - df['position']
    
    return df

# Load and process train data in chunks
chunk_size = 100000  # Adjust this based on your available memory
train_chunks = []
for chunk in pd.read_csv('train.csv', dtype=dtype_spec, na_values=na_values, chunksize=chunk_size):
    processed_chunk = process_chunk(chunk, is_train=True)
    train_chunks.append(processed_chunk)
train_df = pd.concat(train_chunks, ignore_index=True)

# Load and process test data in chunks
test_chunks = []
for chunk in pd.read_csv('test.csv', dtype=dtype_spec, na_values=na_values, chunksize=chunk_size):
    processed_chunk = process_chunk(chunk, is_train=False)
    test_chunks.append(processed_chunk)
test_df = pd.concat(test_chunks, ignore_index=True)

# Separate numerical and categorical columns
numeric_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

# Remove 'position' from features if it's present
if 'position' in numeric_features:
    numeric_features.remove('position')

# Imputation
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Impute numeric features
train_df[numeric_features] = numeric_imputer.fit_transform(train_df[numeric_features])
test_df[numeric_features] = numeric_imputer.transform(test_df[numeric_features])

# Impute categorical features
train_df[categorical_features] = categorical_imputer.fit_transform(train_df[categorical_features])
test_df[categorical_features] = categorical_imputer.transform(test_df[categorical_features])

# Enhanced encoding
encoder = ce.TargetEncoder(cols=categorical_features)
train_df = encoder.fit_transform(train_df, train_df['position'])
test_df = encoder.transform(test_df)

# Feature selection
def select_features(X, y, threshold=0.01):
    mi_scores = mutual_info_regression(X, y)
    selected_features = X.columns[mi_scores > threshold].tolist()
    return selected_features

features = select_features(train_df.drop('position', axis=1), train_df['position'])

# Prepare data for training
X = train_df[features]
y = train_df['position']

# Advanced feature scaling
scaler = PowerTransformer(method='yeo-johnson')
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
test_df[features] = scaler.transform(test_df[features])

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna objective function
def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
    }

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    pruning_callback = LightGBMPruningCallback(trial, 'rmse')
    model = lgb.train(
        param,
        train_data,
        num_boost_round=10000,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            pruning_callback
        ]
    )

    y_pred = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, y_pred, squared=False)
    return rmse

# Optimize hyperparameters
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)

# Train final model
best_params = study.best_params
best_params['objective'] = 'regression'
best_params['metric'] = 'rmse'
best_params['verbosity'] = -1

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

final_model = lgb.train(
    best_params,
    train_data,
    num_boost_round=10000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)]
)

# Evaluate model
y_pred = final_model.predict(X_valid)
rmse = mean_squared_error(y_valid, y_pred, squared=False)
mae = mean_absolute_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

print(f'Validation RMSE: {rmse}')
print(f'Validation MAE: {mae}')
print(f'Validation R2 Score: {r2}')

# Make predictions on test set
X_test = test_df[features]
test_predictions = final_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'resultId': test_df['resultId'],
    'position': test_predictions
})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

  for chunk in pd.read_csv('train.csv', dtype=dtype_spec, na_values=na_values, chunksize=chunk_size):


ValueError: no units specified

In [3]:
!pip install optuna-integration

Collecting optuna-integration
  Downloading optuna_integration-3.6.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna_integration-3.6.0-py3-none-any.whl (93 kB)
   ---------------------------------------- 0.0/93.4 kB ? eta -:--:--
   -------- ------------------------------- 20.5/93.4 kB 320.0 kB/s eta 0:00:01
   ---------------------------------------- 93.4/93.4 kB 1.3 MB/s eta 0:00:00
Installing collected packages: optuna-integration
Successfully installed optuna-integration-3.6.0


In [5]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/81.9 kB ? eta -:--:--
   ----------------------------------- ---- 71.7/81.9 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 81.9/81.9 kB 1.1 MB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [7]:
!pip install pandas numpy scikit-learn lightgbm optuna category_encoders

