In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ===========================================================
# 1️⃣ LOAD DATA
# ===========================================================
df_train = pd.read_csv("train.csv")
df_test  = pd.read_csv("test.csv")

print("Train shape:", df_train.shape)
print("Test shape:",  df_test.shape)

Train shape: (181507, 279)
Test shape: (77789, 278)


In [10]:
# Separate target
target = "price_doc"
y = df_train[target]
df_train.drop(columns=[target], inplace=True)

# Combine for consistent cleaning
full = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print("\nCombined shape:", full.shape)


Combined shape: (259296, 278)


In [11]:
# Columns where negative values make NO sense
non_negative_cols = [
    c for c in full.columns 
    if ("sq" in c or 
        "count" in c or 
        "km" in c or 
        "part" in c or 
        "density" in c or
        "all" in c or 
        "efficiency" in c)
]

for col in non_negative_cols:
    # find negative rows
    neg_mask = full[col] < 0
    
    if neg_mask.any():
        mean_val = full.loc[full[col] >= 0, col].mean()  # mean of valid values
        full.loc[neg_mask, col] = mean_val               # replace negatives

print("\nNegative values replaced with column mean.")


Negative values replaced with column mean.


In [12]:
# ===========================================================
# 3️⃣ OUTLIER HANDLING (Winsorizing at 1st–99th percentile)
# ===========================================================

num_cols = full.select_dtypes(include=['number']).columns

for col in num_cols:
    low, high = full[col].quantile([0.01, 0.99])
    full[col] = full[col].clip(lower=low, upper=high)

print("\nOutliers winsorized (1%–99%).")



Outliers winsorized (1%–99%).


In [13]:
# ===========================================================
# 4️⃣ MISSING VALUE HANDLING  
#     - Numeric → median  
#     - Categorical → mode
# ===========================================================

num_cols = full.select_dtypes(include=['number']).columns
cat_cols = full.select_dtypes(include=['object','category']).columns

# Numeric
for col in num_cols:
    full[col].fillna(full[col].median(), inplace=True)

# Categorical
for col in cat_cols:
    full[col].fillna(full[col].mode()[0], inplace=True)

print("\nMissing values imputed (median/mode).")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full[col].fillna(full[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full[col].fillna(full[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin


Missing values imputed (median/mode).


In [14]:
# ===========================================================
# 5️⃣ ENCODE CATEGORICAL FEATURES
#     - Binary → LabelEncoder
#     - Low-cardinality (<=10) → One-Hot
#     - High-cardinality → Frequency encoding
# ===========================================================

# Extract again (after imputation)
cat_cols = full.select_dtypes(include=['object','category']).columns

low_card = [c for c in cat_cols if full[c].nunique() <= 10]
high_card = [c for c in cat_cols if full[c].nunique() > 10]


# 🔹 5A. LABEL ENCODE BINARY COLUMNS
binary_cols = [c for c in low_card if full[c].nunique() == 2]
le = LabelEncoder()
for col in binary_cols:
    full[col] = le.fit_transform(full[col])


# 🔹 5B. ONE-HOT ENCODE LOW-CARD COLUMNS
one_hot_cols = [c for c in low_card if full[c].nunique() > 2]
full = pd.get_dummies(full, columns=one_hot_cols, drop_first=True)


# 🔹 5C. FREQUENCY ENCODE HIGH-CARD COLUMNS
for col in high_card:
    freq = full[col].value_counts() / len(full)
    full[col] = full[col].map(freq)

print("\nCategorical encoding complete.")



Categorical encoding complete.


In [15]:
# ===========================================================
# 6️⃣ SPLIT BACK INTO TRAIN/TEST
# ===========================================================

train = full.iloc[: len(df_train), :]
test  = full.iloc[len(df_train):, :]

# Add back target
train[target] = y.values
print("\nCleaned train shape:", train.shape)
print("Cleaned test shape:",  test.shape)


Cleaned train shape: (181507, 282)
Cleaned test shape: (77789, 281)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[target] = y.values


In [17]:
neg_counts = (train < 0).sum().sort_values(ascending=False)
print("Negative values (train):\n", neg_counts.head())

neg_counts_test = (test < 0).sum().sort_values(ascending=False)
print("\nNegative values (test):\n", neg_counts_test.head())


Negative values (train):
 culture_objects_top_25_raion             71974
school_education_centers_top_20_raion    69309
university_top_20_raion                  67358
healthcare_centers_raion                 29023
life_full_ratio                          25428
dtype: int64

Negative values (test):
 culture_objects_top_25_raion             30966
school_education_centers_top_20_raion    29740
university_top_20_raion                  28922
healthcare_centers_raion                 12518
life_full_ratio                          10966
dtype: int64


In [19]:
neg_cols = [c for c in full.columns if (full[c] < 0).any()]
neg_cols


['preschool_education_centers_raion',
 'school_education_centers_raion',
 'school_education_centers_top_20_raion',
 'healthcare_centers_raion',
 'university_top_20_raion',
 'sport_objects_raion',
 'additional_education_raion',
 'culture_objects_top_25_raion',
 'shopping_centers_raion',
 'office_raion',
 'male_f',
 'female_f',
 '16_29_male',
 '16_29_female',
 'metro_min_walk',
 'public_transport_station_min_walk',
 'life_full_ratio']

In [20]:
import numpy as np

# List of problematic columns
cols_negative = [
    'preschool_education_centers_raion',
    'school_education_centers_raion',
    'school_education_centers_top_20_raion',
    'healthcare_centers_raion',
    'university_top_20_raion',
    'sport_objects_raion',
    'additional_education_raion',
    'culture_objects_top_25_raion',
    'shopping_centers_raion',
    'office_raion',
    'male_f',
    'female_f',
    '16_29_male',
    '16_29_female',
    'metro_min_walk',
    'public_transport_station_min_walk',
    'life_full_ratio'
]

# Replace negatives
for col in cols_negative:
    if 'min_walk' in col:  # distances / time
        median_val = train[col][train[col] >= 0].median()
        train[col] = train[col].apply(lambda x: median_val if x < 0 else x)
        test[col] = test[col].apply(lambda x: median_val if x < 0 else x)
    elif 'ratio' in col:  # ratios
        train[col] = train[col].apply(lambda x: np.nan if x < 0 else x)
        test[col] = test[col].apply(lambda x: np.nan if x < 0 else x)
    else:  # counts of objects or population
        train[col] = train[col].apply(lambda x: 0 if x < 0 else x)
        test[col] = test[col].apply(lambda x: 0 if x < 0 else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].apply(lambda x: 0 if x < 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].apply(lambda x: 0 if x < 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].apply(lambda x: median_val if x < 0 else x)
A value is trying to 

In [21]:
# Fill NaNs with median (safe for numeric features)
for col in train.columns:
    if train[col].isna().sum() > 0:
        median_val = train[col].median()
        train[col].fillna(median_val, inplace=True)
        test[col].fillna(median_val, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(median_val, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instea