In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
df = pd.read_csv('house price.csv')

In [13]:
print("Original dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nDescriptive statistics:")
print(df.describe())

Original dataset shape: (506, 14)

First 5 rows:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS 

In [14]:
# Check for missing values
if df.isnull().sum().sum() > 0:
    print(f"\nHandling {df.isnull().sum().sum()} missing values...")
else:
    print("\nNo missing values found in the dataset.")


No missing values found in the dataset.


In [15]:
# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [16]:
# Remove target variable from numerical columns
target_col = 'MEDV'
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")
print(f"Target column: {target_col}")


Categorical columns: []
Numerical columns: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Target column: MEDV


In [17]:
# Check unique values in categorical columns
for col in categorical_cols:
    print(f"\nUnique values in {col}: {df[col].unique()}")

In [18]:
# Create preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [19]:

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [20]:
# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape: (506, 13)
Target shape: (506,)


In [29]:
# Preprocess the features
print("\nPreprocessing features...")
X_processed = preprocessor.fit_transform(X)


Preprocessing features...


In [30]:
# Get feature names after preprocessing - FIXED APPROACH
# First, fit the preprocessor to get the feature names
preprocessor.fit(X)

In [31]:
# Now we can safely access the fitted transformers
numerical_features = numerical_cols

In [32]:
# Get categorical feature names from the fitted onehot encoder
if categorical_cols:
    categorical_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
    all_features = list(numerical_features) + list(categorical_features)
else:
    all_features = numerical_features

In [33]:

# Convert back to DataFrame for better readability
X_processed_df = pd.DataFrame(X_processed, columns=all_features)

In [34]:
print(f"\nProcessed features shape: {X_processed_df.shape}")
print("\nFirst 5 rows of processed features:")
print(X_processed_df.head())


Processed features shape: (506, 13)

First 5 rows of processed features:
       CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
0 -0.419782  0.284830 -1.287909 -0.272599 -0.144217  0.413672 -0.120013   
1 -0.417339 -0.487722 -0.593381 -0.272599 -0.740262  0.194274  0.367166   
2 -0.417342 -0.487722 -0.593381 -0.272599 -0.740262  1.282714 -0.265812   
3 -0.416750 -0.487722 -1.306878 -0.272599 -0.835284  1.016303 -0.809889   
4 -0.412482 -0.487722 -1.306878 -0.272599 -0.835284  1.228577 -0.511180   

        DIS       RAD       TAX   PTRATIO         B     LSTAT  
0  0.140214 -0.982843 -0.666608 -1.459000  0.441052 -1.075562  
1  0.557160 -0.867883 -0.987329 -0.303094  0.441052 -0.492439  
2  0.557160 -0.867883 -0.987329 -0.303094  0.396427 -1.208727  
3  1.077737 -0.752922 -1.106115  0.113032  0.416163 -1.361517  
4  1.077737 -0.752922 -1.106115  0.113032  0.441052 -1.026501  


In [35]:

# Split the dataset into training and testing sets
print("\nSplitting dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_df, y, test_size=0.2, random_state=42
)


Splitting dataset into training and testing sets...


In [36]:

print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing set shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training set shape: X_train=(404, 13), y_train=(404,)
Testing set shape: X_test=(102, 13), y_test=(102,)


In [37]:
# Check target variable distribution
print(f"\nTarget variable statistics:")
print(f"Mean: {y.mean():.2f}")
print(f"Std: {y.std():.2f}")
print(f"Min: {y.min():.2f}")
print(f"Max: {y.max():.2f}")


Target variable statistics:
Mean: 22.53
Std: 9.20
Min: 5.00
Max: 50.00


In [38]:
# Save the preprocessed data
X_train.to_csv('X_train_house.csv', index=False)
X_test.to_csv('X_test_house.csv', index=False)
pd.DataFrame(y_train, columns=[target_col]).to_csv('y_train_house.csv', index=False)
pd.DataFrame(y_test, columns=[target_col]).to_csv('y_test_house.csv', index=False)

In [39]:
print("\nPreprocessed data saved to:")
print("- X_train_house.csv")
print("- X_test_house.csv") 
print("- y_train_house.csv")
print("- y_test_house.csv")


Preprocessed data saved to:
- X_train_house.csv
- X_test_house.csv
- y_train_house.csv
- y_test_house.csv


In [40]:
# Display final preprocessed data samples
print("\nFinal preprocessed training data sample:")
print("Features:")
print(X_train.head())
print("\nTarget:")
print(y_train.head())


Final preprocessed training data sample:
Features:
         CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
477  1.327804 -0.487722  1.015999 -0.272599  0.512296 -1.397069  1.021481   
15  -0.347506 -0.487722 -0.437258 -0.272599 -0.144217 -0.642000 -0.429390   
332 -0.416484  1.014463 -0.740749 -0.272599 -1.008914 -0.361342 -1.610001   
423  0.399963 -0.487722  1.015999 -0.272599  0.512296 -0.258767  0.587642   
19  -0.336054 -0.487722 -0.437258 -0.272599 -0.144217 -0.794439  0.032897   

          DIS       RAD       TAX   PTRATIO         B     LSTAT  
477 -0.805438  1.661245  1.530926  0.806576 -0.078878  1.718101  
15   0.334449 -0.637962 -0.601276  1.176466  0.427018 -0.586356  
332  1.352738 -0.982843 -0.619094 -0.719220  0.061137 -0.676067  
423 -0.842945  1.661245  1.530926  0.806576 -3.883072  1.491020  
19   0.000693 -0.637962 -0.601276  1.176466  0.375814 -0.192467  

Target:
477    12.0
15     19.9
332    19.4
423    13.4
19     18.2
Name: MEDV, dtype: f

In [41]:
# Additional analysis for the housing dataset
print("\n" + "="*50)
print("ADDITIONAL HOUSING DATA ANALYSIS")
print("="*50)


ADDITIONAL HOUSING DATA ANALYSIS


In [42]:
# Check for outliers in the target variable
print(f"\nTarget variable (MEDV) analysis:")
q1 = y.quantile(0.25)
q3 = y.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr


Target variable (MEDV) analysis:


In [43]:
outliers = y[(y < lower_bound) | (y > upper_bound)]
print(f"Potential outliers in MEDV: {len(outliers)} ({len(outliers)/len(y)*100:.1f}%)")

Potential outliers in MEDV: 40 (7.9%)


In [44]:
# Correlation analysis
correlation_with_target = df[numerical_cols + [target_col]].corr()[target_col].sort_values(ascending=False)
print(f"\nTop features correlated with {target_col}:")
print(correlation_with_target.head(10))


Top features correlated with MEDV:
MEDV    1.000000
RM      0.695360
ZN      0.360445
B       0.333461
DIS     0.249929
CHAS    0.175260
AGE    -0.376955
RAD    -0.381626
CRIM   -0.388305
NOX    -0.427321
Name: MEDV, dtype: float64


In [45]:
print(f"\nBottom features correlated with {target_col}:")
print(correlation_with_target.tail(10))


Bottom features correlated with MEDV:
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: MEDV, dtype: float64
