In [8]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("puf2022_modified.csv")

df.describe


<bound method NDFrame.describe of         PRICE   WEIGHT    SQFT  FINALDEST  LEASE  BEDROOMS  STATUS  REGION  \
0     90000.0  69.1571  1200.0        1.0    2.0       3.0     2.0     3.0   
1     82000.0  69.1571  1200.0        1.0    2.0       3.0     2.0     3.0   
2     68000.0  69.1571  1100.0        1.0    2.0       3.0     2.0     3.0   
3     90000.0  18.5789  1200.0        1.0    2.0       3.0     2.0     4.0   
4     90000.0  18.5789  1200.0        1.0    2.0       3.0     2.0     4.0   
...       ...      ...     ...        ...    ...       ...     ...     ...   
4525      9.0   8.0368     9.0        9.0    9.0       9.0     3.0     5.0   
4526      9.0  21.2679     9.0        9.0    9.0       9.0     3.0     5.0   
4527      9.0  21.2679     9.0        9.0    9.0       9.0     3.0     5.0   
4528      9.0   5.9000     9.0        9.0    9.0       9.0     3.0     5.0   
4529      9.0   5.9000     9.0        9.0    9.0       9.0     3.0     5.0   

      TITLED  SECURED  LOCATI

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Split the data into predictors (X) and target (y)
X = df.drop(columns=['PRICE'])  # Features
y = df['PRICE']  # Target variable

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler only on the predictors
scaler.fit(X)

# Scale the predictors
X_scaled = scaler.transform(X)

# Combine the scaled predictors and the unscaled target variable
scaled_data = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y], axis=1)

# Print the scaled data
print(f'The scaled data is:\n{scaled_data}')


The scaled data is:
        WEIGHT      SQFT  FINALDEST  LEASE  BEDROOMS  STATUS  REGION  TITLED  \
0     0.795697  0.385312        0.0  0.125      0.25     0.5    0.50   0.125   
1     0.795697  0.385312        0.0  0.125      0.25     0.5    0.50   0.125   
2     0.795697  0.352960        0.0  0.125      0.25     0.5    0.50   0.125   
3     0.205224  0.385312        0.0  0.125      0.25     0.5    0.75   0.000   
4     0.205224  0.385312        0.0  0.125      0.25     0.5    0.75   0.125   
...        ...       ...        ...    ...       ...     ...     ...     ...   
4525  0.082151  0.000000        1.0  1.000      1.00     1.0    1.00   1.000   
4526  0.236617  0.000000        1.0  1.000      1.00     1.0    1.00   1.000   
4527  0.236617  0.000000        1.0  1.000      1.00     1.0    1.00   1.000   
4528  0.057205  0.000000        1.0  1.000      1.00     1.0    1.00   1.000   
4529  0.057205  0.000000        1.0  1.000      1.00     1.0    1.00   1.000   

       SECURED  LOC

In [10]:
from sklearn.preprocessing import RobustScaler

# Split the data into predictors (X) and target (y)
X = df.drop(columns=['PRICE'])
y = df['PRICE']

# Create a new RobustScaler object
scaler = RobustScaler()

# Scale the predictors
X_scaled = scaler.fit_transform(X)

# Convert the scaled predictors to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Calculate the first and third quartiles
Q1 = X_scaled_df.quantile(0.25)
Q3 = X_scaled_df.quantile(0.75)

# Calculate the interquartile range
IQR = Q3 - Q1

# Define the lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove the outliers
X_no_outliers = X_scaled_df[~((X_scaled_df < lower_bound) | (X_scaled_df > upper_bound)).any(axis=1)]

# Combine the predictors and target variable
data_no_outliers = pd.concat([X_no_outliers, y], axis=1)

# Print the data without outliers
print(f'The data without outliers is:\n{data_no_outliers}')


The data without outliers is:
        WEIGHT      SQFT  FINALDEST  LEASE  BEDROOMS  STATUS  REGION  \
3     0.263433 -0.333333        0.0    0.0       0.0     0.0     1.0   
4     0.263433 -0.333333        0.0    0.0       0.0     0.0     1.0   
5     0.263433 -0.800000        0.0    0.0       0.0     0.0     1.0   
6     0.263433 -0.500000        0.0    0.0       0.0     0.0     1.0   
9     0.263433 -0.500000        0.0    0.0       0.0     0.0     1.0   
...        ...       ...        ...    ...       ...     ...     ...   
4525       NaN       NaN        NaN    NaN       NaN     NaN     NaN   
4526       NaN       NaN        NaN    NaN       NaN     NaN     NaN   
4527       NaN       NaN        NaN    NaN       NaN     NaN     NaN   
4528       NaN       NaN        NaN    NaN       NaN     NaN     NaN   
4529       NaN       NaN        NaN    NaN       NaN     NaN     NaN   

        TITLED  SECURED  LOCATION  PIERS    PRICE  
3    -0.142857    0.000    -0.125   0.00  90000.0  
4

In [11]:
# Save the modified DataFrame back to the CSV file
df.to_csv("puf2022_MinMaxScaler.csv", index=False)  # Don't save the index