In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ====================================
# 1. LOAD DATA
# ====================================
# The three banner lines are emitted by one call; `sep` puts each on its own line.
print("=" * 60, "LOADING DATA", "=" * 60, sep="\n")

# Read the training file first, then the test file, from the working directory.
df, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report (rows, columns) of each frame as a quick sanity check.
print(f"\nTrain shape: {df.shape}")
print(f"Test shape: {test.shape}")

LOADING DATA

Train shape: (181507, 279)
Test shape: (77789, 278)


In [22]:
# Preview the first five rows of the training frame (shown as the cell's output).
df.head(n=5)

Unnamed: 0,id,full_sq,life_sq,floor,product_type,sub_area,green_zone_part,indust_part,children_preschool,preschool_education_centers_raion,...,trc_sqm_5000_log,sport_count_5000_log,life_full_ratio,cafe_density_5000,high_floor,large_apartment,rooms_inferred,living_efficiency,price_doc,price_doc_log
0,106299,-0.280658,50.127224,10.970619,Type_A,Area_380,0.099506,0.222095,6255.829172,4.076307,...,14.688605,4.046719,-178.606113,16.135148,0,0,3.0,69.684824,6.452158,2.008504
1,125559,2129.178675,3438.561939,6.282464,Type_B,Area_1375,0.588042,0.157862,7391.899017,7.608949,...,15.293149,3.805732,1.614971,166.763131,0,1,229.0,1.614213,101.661749,4.63144
2,204969,44.255548,-15.662341,0.894701,Type_A,Area_272,0.074837,0.266754,5658.091711,4.054293,...,14.162732,3.987516,-0.353907,12.076017,0,0,1.0,-0.346087,6.257546,1.982042
3,248026,2622.821354,1373.058212,15.97126,Type_B,Area_1750,0.850289,0.393528,13262.734984,6.832598,...,14.881627,4.765929,0.523504,184.953625,1,1,91.0,0.523305,90.592523,4.51735
4,51881,52.877556,12.392606,2.876796,Type_A,Area_1773,0.07045,0.238344,3842.416286,5.030503,...,14.302936,4.114335,0.234364,14.630758,0,0,1.0,0.230014,8.486698,2.249891


In [3]:
# ====================================
# 2. IDENTIFY FEATURE TYPES
# ====================================
print("\n" + "=" * 60)
print("FEATURE TYPE IDENTIFICATION")
print("=" * 60)

# Columns that must never be used as model inputs: the row identifier, the raw
# target, and the log-transformed target that the target-analysis cell adds to
# `df`. Excluding 'price_doc_log' here keeps this cell safe to re-run after that
# column exists; otherwise the derived target would be selected as a numerical
# feature and leak into the model.
non_feature_cols = {'id', 'price_doc', 'price_doc_log'}

# Separate features by dtype: numeric columns vs. object-dtype columns.
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in non_feature_cols
]

print(f"\nNumerical features: {len(numerical_cols)}")
print(f"Categorical features: {len(categorical_cols)}")


FEATURE TYPE IDENTIFICATION

Numerical features: 262
Categorical features: 15


In [19]:
# ====================================
# 3. MISSING VALUE ANALYSIS
# ====================================
print("\n" + "=" * 60, "MISSING VALUE ANALYSIS", "=" * 60, sep="\n")

# Null count per candidate feature, restricted to columns that actually have
# nulls and ordered from most to least affected.
missing_train = (
    df[numerical_cols + categorical_cols]
    .isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)

if not missing_train.empty:
    # Express the null counts as a percentage of the training rows.
    missing_pct = missing_train / len(df) * 100
    print(f"\nColumns with missing values: {len(missing_train)}")
    print("\nTop 10 features with most missing:")
    print(missing_pct.head(10))

    # Features that are more than half empty.
    high_missing = missing_pct.loc[missing_pct > 50].index.tolist()
    print(f"\nFeatures with >50% missing (will be dropped): {len(high_missing)}")

    # Prune those features from the feature lists (the frames themselves are
    # not modified; later cells select columns through these lists).
    numerical_cols = [name for name in numerical_cols if name not in high_missing]
    categorical_cols = [name for name in categorical_cols if name not in high_missing]


MISSING VALUE ANALYSIS

Columns with missing values: 6

Top 10 features with most missing:
office_sqm_5000_log     6.197557
trc_sqm_5000_log        3.975604
full_all_log            2.216443
sport_count_5000_log    1.295267
area_m_log              0.015426
raion_popul_log         0.007162
dtype: float64

Features with >50% missing (will be dropped): 0


In [None]:
# ====================================
# 3b. MISSING VALUE ANALYSIS for test data
# ====================================
print("\n" + "=" * 60)
print("MISSING VALUE ANALYSIS for test data")
print("=" * 60)

# Inspect only the columns that feed the model, mirroring the train analysis;
# scanning every column of `test` would also count non-feature columns such as
# `id`. Results are kept under test-specific names so the train analysis
# variables are not overwritten.
missing_test = test[numerical_cols + categorical_cols].isnull().sum()
missing_test = missing_test[missing_test > 0].sort_values(ascending=False)

if len(missing_test) > 0:
    # Null counts as a percentage of the test rows.
    missing_pct_test = (missing_test / len(test) * 100)
    print(f"\nColumns with missing values: {len(missing_test)}")
    print("\nTop 10 features with most missing:")
    print(missing_pct_test.head(10))

    # Identify high missing columns (>50%)
    high_missing_test = missing_pct_test[missing_pct_test > 50].index.tolist()
    print(f"\nFeatures with >50% missing (will be dropped): {len(high_missing_test)}")

    # Prune those features from the feature lists used for both train and test.
    numerical_cols = [col for col in numerical_cols if col not in high_missing_test]
    categorical_cols = [col for col in categorical_cols if col not in high_missing_test]


MISSING VALUE ANALYSIS

Columns with missing values: 6

Top 10 features with most missing:
office_sqm_5000_log     6.318374
trc_sqm_5000_log        3.992852
full_all_log            2.256103
sport_count_5000_log    1.277816
area_m_log              0.026996
raion_popul_log         0.015426
dtype: float64

Features with >50% missing (will be dropped): 0


In [5]:
# ====================================
# 4. TARGET ANALYSIS & TRANSFORMATION
# ====================================
print("\n" + "=" * 60, "TARGET VARIABLE ANALYSIS", "=" * 60, sep="\n")

# Summary statistics and skewness of the raw target.
print("\nOriginal target statistics:")
print(df['price_doc'].describe())
print(f"Skewness: {df['price_doc'].skew():.2f}")

# Store log(1 + price) as a new column on `df`; it is used as the regression
# target later and inverted with expm1 when reporting on the original scale.
df['price_doc_log'] = df['price_doc'].pipe(np.log1p)
print(f"\nLog-transformed skewness: {df['price_doc_log'].skew():.2f}")


TARGET VARIABLE ANALYSIS

Original target statistics:
count    181507.000000
mean         14.845599
std          21.533138
min           0.392328
25%           5.303449
50%           7.186257
75%          11.781645
max         109.864990
Name: price_doc, dtype: float64
Skewness: 2.79

Log-transformed skewness: 1.07


In [23]:
# ====================================
# 5. CLEAN FEATURE LISTS AFTER MISSING VALUE ANALYSIS
# ====================================

# Identifier and target columns (raw and log-transformed) are never features.
drop_cols = ['id', 'price_doc', 'price_doc_log']

# Keep the original ordering while filtering out the non-feature names.
numerical_cols = list(filter(lambda name: name not in drop_cols, numerical_cols))
categorical_cols = list(filter(lambda name: name not in drop_cols, categorical_cols))

print("\nFinal numerical feature count:", len(numerical_cols))
print("Final categorical feature count:", len(categorical_cols))


# ====================================
# 6. BUILD PREPROCESSING PIPELINE
# ====================================
print("\n" + "=" * 60, "BUILDING PREPROCESSING PIPELINE", "=" * 60, sep="\n")

# Numeric branch: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen during fit are ignored at transform time, and the encoder
# returns a dense array.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its branch; any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ],
    remainder='drop',
)

print("\nPreprocessor ready!")



Final numerical feature count: 262
Final categorical feature count: 15

BUILDING PREPROCESSING PIPELINE

Preprocessor ready!


In [24]:
# Sanity check: after imputation/encoding the transformed matrix should contain
# no NaN values.
# NOTE(review): this fits `preprocessor` on every row of `df`; confirm that any
# later use of `preprocessor` refits it on the training split only.
X = df.loc[:, numerical_cols + categorical_cols]

Xt = preprocessor.fit_transform(X)

print("Missing values after preprocessing:", np.sum(np.isnan(Xt)))


Missing values after preprocessing: 0


In [28]:
# ====================================
# 5. TRAIN–VALIDATION SPLIT
# ====================================
print("\n" + "=" * 60, "CREATING TRAIN-VALIDATION SPLIT", "=" * 60, sep="\n")

RANDOM_STATE = 29325   # seed handed to train_test_split so the split is repeatable

# Target is the log-transformed price.
y = df['price_doc_log'].copy()

# Train and test feature frames are built from the same column lists so that
# they share identical columns in identical order.
X, test_X = (
    frame[numerical_cols + categorical_cols].copy()
    for frame in (df, test)
)

# Hold out 30% of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)

print(f"\nTraining set:   {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set:       {test_X.shape}")



CREATING TRAIN-VALIDATION SPLIT

Training set:   (127054, 277)
Validation set: (54453, 277)
Test set:       (77789, 277)


In [29]:
# ====================================
# 7. BASELINE: LINEAR REGRESSION
# ====================================
print("\n" + "=" * 60, "BASELINE: LINEAR REGRESSION", "=" * 60, sep="\n")

# Preprocessing and the regressor are chained so both are fit on the training
# split only; `fit` returns the fitted pipeline itself.
baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]).fit(X_train, y_train)

# Validation predictions are on the log scale, matching the training target.
y_val_pred = baseline_pipeline.predict(X_val)

# Log-scale metrics.
val_r2 = r2_score(y_val, y_val_pred)
val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"\nValidation RMSE (log scale): {val_rmse_log:.4f}")
print(f"Validation R²: {val_r2:.4f}")

# Invert log1p on both truth and predictions to score on the original price scale.
y_val_orig, y_val_pred_orig = np.expm1(y_val), np.expm1(y_val_pred)

rmse_original = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
print(f"Validation RMSE (original scale): {rmse_original:.2f}")



BASELINE: LINEAR REGRESSION

Validation RMSE (log scale): 0.5262
Validation R²: 0.6187
Validation RMSE (original scale): 14.66


In [30]:
# ====================================
# 8. GENERATE KAGGLE SUBMISSION FILE
# ====================================
print("\n" + "=" * 60)
print("CREATING KAGGLE SUBMISSION FILE")
print("=" * 60)

# Predict on test data (log scale)
test_pred_log = baseline_pipeline.predict(test_X)

# Convert predictions back to the original scale. A linear model is unbounded,
# so a predicted log value below 0 maps to a negative price under expm1;
# prices cannot be negative, so floor the predictions at 0.
test_pred = np.maximum(np.expm1(test_pred_log), 0.0)

# Build submission dataframe: one row per test id with its predicted price.
submission = pd.DataFrame({
    "id": test["id"],
    "price_doc": test_pred
})

# Save CSV without the index column.
submission.to_csv("submission.csv", index=False)

print("\nKaggle submission file saved as 'submission.csv'!")
print(submission.head())



CREATING KAGGLE SUBMISSION FILE

Kaggle submission file saved as 'submission.csv'!
       id  price_doc
0  243467   7.229162
1  230180  12.125534
2  256036   3.071674
3    1848   3.424784
4   68720  11.667755
