In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
import os
import pandas as pd
import lightgbm as lgb

# Load the source table and discard the 'Unnamed: 0' column
# (presumably a leftover index from an earlier CSV export -- confirm).
raw_data = pd.read_csv('old_dataset.csv').drop(columns=['Unnamed: 0'])

# The regression target is 'price'; every remaining column is a feature.
y = raw_data['price']
X = raw_data.drop(columns=['price'])

# Feature groups that are routed to different preprocessing steps.
categorical_cols = ['cut', 'color', 'clarity']
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Ordinal encoding with an explicit category order per column; the lists are
# positional and line up with `categorical_cols` (cut, color, clarity).
# NOTE(review): the orders look like worst-to-best grades -- confirm.
ordinal_encoder = OrdinalEncoder(
    categories=[
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
    ]
)

# Encode the categorical columns and min-max scale the numerical ones.
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', ordinal_encoder, categorical_cols),
        ('num', MinMaxScaler(), numerical_cols),
    ]
)

# Single-step pipeline that is reused as the first stage of every model.
preprocessing_pipeline = Pipeline(steps=[('encoder_scaler', column_transformer)])

# Output directory for the pickled models; a no-op if it already exists.
os.makedirs('models', exist_ok=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Degree-2 polynomial regression: preprocess the raw columns, expand them
# into polynomial features, then fit an ordinary least-squares model.
poly_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('poly', PolynomialFeatures(degree=2)),
    ('lin_reg', LinearRegression()),
])
poly_pipeline.fit(X_train, y_train)

# Build the path with os.path.join so the file lands inside the `models`
# directory on every OS. A hard-coded backslash is only a separator on
# Windows; elsewhere it becomes part of the file name.
with open(os.path.join('models', 'PolynomialFeatures.pkl'), 'wb') as f:
    pickle.dump(poly_pipeline, f)

# Bagging ensemble of 10 estimators (scikit-learn's default base estimator),
# each trained on 80% of the training rows, using all available cores.
dt_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('dt', BaggingRegressor(
        n_estimators=10,
        max_samples=0.8,
        random_state=42,
        n_jobs=-1,
    )),
])
dt_pipeline.fit(X_train, y_train)

# Build the path with os.path.join so the file lands inside the `models`
# directory on every OS. A hard-coded backslash is only a separator on
# Windows; elsewhere it becomes part of the file name.
with open(os.path.join('models', 'BaggingRegressor.pkl'), 'wb') as f:
    pickle.dump(dt_pipeline, f)


# Fit the preprocessing on the training rows only and apply the same fitted
# transform to the test rows, so no test-set statistics leak into the scaler.
# This refits the shared `preprocessing_pipeline` object in place.
# NOTE(review): these two arrays are not used anywhere in the visible code --
# confirm whether a later cell reads them before removing.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

# Gradient boosting with 20 shallow trees (depth 3) and a 0.1 learning rate.
# The variable name `dt_pipeline` is kept unchanged so any later code that
# reads it keeps working.
dt_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('dt', GradientBoostingRegressor(
        n_estimators=20,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )),
])
dt_pipeline.fit(X_train, y_train)

# Build the path with os.path.join so the file lands inside the `models`
# directory on every OS. A hard-coded backslash is only a separator on
# Windows; elsewhere it becomes part of the file name.
with open(os.path.join('models', 'GradientBoostingRegressor.pkl'), 'wb') as f:
    pickle.dump(dt_pipeline, f)

# Stacked ensemble: a depth-limited decision tree and a linear regression
# are the base learners, and a linear regression combines their predictions.
# The variable name `poly_pipeline` is kept unchanged (although no polynomial
# features are involved) so any later code that reads it keeps working.
poly_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('stacking_reg', StackingRegressor(
        estimators=[
            ('dt', DecisionTreeRegressor(
                max_depth=14,
                min_samples_split=7,
                min_samples_leaf=7,
                random_state=42,
            )),
            ('lr', LinearRegression()),
        ],
        final_estimator=LinearRegression(),
    )),
])

poly_pipeline.fit(X_train, y_train)

# Build the path with os.path.join so the file lands inside the `models`
# directory on every OS. A hard-coded backslash is only a separator on
# Windows; elsewhere it becomes part of the file name.
with open(os.path.join('models', 'StackingRegressor.pkl'), 'wb') as f:
    pickle.dump(poly_pipeline, f)


# LightGBM regressor with 100 boosting rounds on the preprocessed features.
# The variable name `poly_pipeline` is kept unchanged (although no polynomial
# features are involved) so any later code that reads it keeps working.
poly_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('lgbm', lgb.LGBMRegressor(n_estimators=100)),
])

poly_pipeline.fit(X_train, y_train)

# Build the path with os.path.join so the file lands inside the `models`
# directory on every OS. A hard-coded backslash is only a separator on
# Windows; elsewhere it becomes part of the file name.
with open(os.path.join('models', 'lightgbm.pkl'), 'wb') as f:
    pickle.dump(poly_pipeline, f)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1146
[LightGBM] [Info] Number of data points in the train set: 8630, number of used features: 9
[LightGBM] [Info] Start training from score 3930.343337
