In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

In [16]:
# Notebook configuration: values a reader may want to change.
# Path of the input CSV, resolved relative to the working directory.
file_path = "kc_house_data.csv"
# Name of the column used as the prediction target.
target_column = "price"

Step 1: Extracting Data

In [17]:
# Read the source CSV into `data`; a missing path is reported with an explicit
# FileNotFoundError instead of whatever pandas would raise.
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
    print("Data extracted successfully!")
else:
    raise FileNotFoundError(f"The file {file_path} does not exist.")

Data extracted successfully!


In [8]:
# Overview of the freshly loaded dataset: dimensions, a sample of rows,
# per-column schema, and summary statistics.
print("Dataset Details:")
print("Shape of the dataset:", data.shape)
print("\nFirst 5 rows:\n", data.head())
print("\nColumn Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
print("\nSummary Statistics:")
print(data.describe(include='all'))

Dataset Details:
Shape of the dataset: (21613, 21)

First 5 rows:
            id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680   

Step 2: Preprocessing Data

In [9]:
# Remove the identifier and date columns when present; neither is kept as a
# feature. errors='ignore' skips any label that is not in the frame.
data.drop(columns=['id', 'date'], errors='ignore', inplace=True)

# Report the columns that contain nulls (if any), then fill numeric nulls with
# each column's median.
print("Missing Values:")
missing = data.isnull().sum()
if missing.any():
    print(missing[missing > 0])
else:
    print("No missing values!")
data.fillna(data.median(numeric_only=True), inplace=True)

Missing Values:
No missing values!


In [10]:
# Integer-encode every object-dtype column, keeping each fitted encoder in
# `label_encoders` (keyed by column name) so the mapping can be reused later.
label_encoders = {}
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    le = LabelEncoder()
    le.fit(data[column])
    data[column] = le.transform(data[column])
    label_encoders[column] = le

# Show the state of the frame once preprocessing is finished.
print("\nPreprocessed Dataset Details:")
print("Shape of the dataset after preprocessing:", data.shape)
print("\nFirst 5 rows after preprocessing:\n", data.head())


Preprocessed Dataset Details:
Shape of the dataset after preprocessing: (21613, 19)

First 5 rows after preprocessing:
       price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0  221900.0         3       1.00         1180      5650     1.0           0   
1  538000.0         3       2.25         2570      7242     2.0           0   
2  180000.0         2       1.00          770     10000     1.0           0   
3  604000.0         4       3.00         1960      5000     1.0           0   
4  510000.0         3       2.00         1680      8080     1.0           0   

   view  condition  grade  sqft_above  sqft_basement  yr_built  yr_renovated  \
0     0          3      7        1180              0      1955             0   
1     0          3      7        2170            400      1951          1991   
2     0          3      6         770              0      1933             0   
3     0          5      7        1050            910      1965             0   
4   

Step 3: Transforming Data

In [11]:
# Target series and feature matrix (every column except the target).
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted parameters are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTraining Set Shape: {X_train_scaled.shape}, Testing Set Shape: {X_test_scaled.shape}")


Training Set Shape: (17290, 18), Testing Set Shape: (4323, 18)


Step 4: Loading Data (saving the processed splits to disk)

In [13]:
# Persist the prepared train/test splits as CSV files under `output_dir`,
# creating the directory if it does not exist yet.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# The scaled feature matrices are plain NumPy arrays, so a bare
# pd.DataFrame(...) would write headers 0, 1, 2, ... Re-attach the feature
# names from the unscaled splits so the saved files keep their schema.
pd.DataFrame(X_train_scaled, columns=X_train.columns).to_csv(os.path.join(output_dir, "X_train.csv"), index=False)
pd.DataFrame(X_test_scaled, columns=X_test.columns).to_csv(os.path.join(output_dir, "X_test.csv"), index=False)
# The target series already carry their column name.
pd.DataFrame(y_train).to_csv(os.path.join(output_dir, "y_train.csv"), index=False)
pd.DataFrame(y_test).to_csv(os.path.join(output_dir, "y_test.csv"), index=False)

In [14]:
# Confirm where the splits were written and list the file names, one per line.
print(f"\nData successfully saved in the '{output_dir}' directory:")
print(
    "- X_train.csv",
    "- X_test.csv",
    "- y_train.csv",
    "- y_test.csv",
    sep="\n",
)


Data successfully saved in the 'output' directory:
- X_train.csv
- X_test.csv
- y_train.csv
- y_test.csv
