In [2]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.4.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.4.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.4 MB 2.3 MB/s eta 0:00:01
   ------- -------------------------------- 0.3/1.4 MB 3.5 MB/s eta 0:00:01
   ------------- -------------------------- 0.5/1.4 MB 4.4 MB/s eta 0:00:01
   ---------------------- ----------------- 0.8/1.4 MB 5.3 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.4 MB 5.3 MB/s eta 0:00:01
   ----------------------------- ---------- 1.1/1.4 MB 5.2 MB/s eta 0:00:01
   ------------------------------ --------- 1.1/1.4 MB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 4.6 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMRegressor

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Combine train and test data for preprocessing
all_data = pd.concat([train, test], axis=0, sort=False)

# Data preprocessing

# Handle missing values
# Example: filling missing numeric values with mean and categorical with mode
numeric_cols = all_data.select_dtypes(include=np.number).columns
categorical_cols = all_data.select_dtypes(include='object').columns

all_data[numeric_cols] = all_data[numeric_cols].fillna(all_data[numeric_cols].mean())
all_data[categorical_cols] = all_data[categorical_cols].fillna(all_data[categorical_cols].mode().iloc[0])

# Encode categorical variables
for col in categorical_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])

# Log-transform the target variable (SalePrice)
train['SalePrice'] = np.log1p(train['SalePrice'])

# Split back into train and test sets
train_processed = all_data.iloc[:train.shape[0], :]
test_processed = all_data.iloc[train.shape[0]:, :]

# Define features and target
X = train_processed.drop(['Id', 'SalePrice'], axis=1)
y = train['SalePrice']

# Model training and evaluation

# Initialize LightGBM model
model = LGBMRegressor()

# Cross-validation on the training set
cv_scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=5)
print(f'Cross-validation RMSE: {-cv_scores.mean()}')

# Fit the model on the entire training set
model.fit(X, y)

# Prediction

# Prepare test data for prediction
test_ids = test['Id']
X_test = test_processed.drop(['Id', 'SalePrice'], axis=1)

# Make predictions
predictions = np.expm1(model.predict(X_test))  # Reverse log transformation

# Prepare submission file
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3092
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 71
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3108
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 72
[LightGBM] [Info] Start training from score 12.023288
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug