In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")

y = train["critical_temp"]
X = train.drop(columns=["critical_temp"])

# считаем асимметрию распределений
skew = X.skew(numeric_only=True)

high_skew = skew[skew.abs() > 1].index.tolist()
mid_skew  = skew[(skew.abs() <= 1) & (skew.abs() > 0.3)].index.tolist()
low_skew  = skew[skew.abs() <= 0.3].index.tolist()

print("Сильно скошенных:", len(high_skew))
print("Умеренно скошенных:", len(mid_skew))
print("Почти симметричных:", len(low_skew))


Сильно скошенных: 30
Умеренно скошенных: 38
Почти симметричных: 13


In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression  # или LinearRegression / Lasso / ElasticNet

# трансформер для сильно скошенных: лог + RobustScaler
log_transformer = Pipeline(steps=[
    ("log", FunctionTransformer(np.log1p, validate=False)),
    ("scaler", RobustScaler())
])

# трансформер для умеренно скошенных: только RobustScaler
robust_transformer = Pipeline(steps=[
    ("scaler", RobustScaler())
])

# трансформер для почти нормальных: StandardScaler
standard_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("high_skew", log_transformer, high_skew),
        ("mid_skew", robust_transformer, mid_skew),
        ("low_skew", standard_transformer, low_skew),
    ],
    remainder="drop"  # или "passthrough", если хочешь оставить нетронутые колонки
)


In [None]:


model = LinearRegression()

pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

pipe.fit(X, y)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('high_skew', ...), ('mid_skew', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [4]:
test = pd.read_csv("test.csv")
X_test = test  # если в test нет таргета

y_pred = pipe.predict(X_test)

In [5]:
submit_index = range(len(X_test))
submission = pd.DataFrame({
    "index": submit_index,
    "critical_temp": y_pred
})

submission = submission[["index", "critical_temp"]]
submission.to_csv("submission.csv", index=False)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4253 entries, 0 to 4252
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          4253 non-null   int64  
 1   critical_temp  4253 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 66.6 KB
