In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [28]:
# Locations of the two input files. They are read with pandas' default CSV
# settings even though the files carry a .txt extension.
# NOTE(review): the test file name is spelled "Hosuing" -- confirm it matches the file on disk.
train_path = '/content/Housing-project-train-data.txt'
test_path = '/content/Hosuing-project-test-data.txt'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


In [29]:
# Preview the first five training rows. Leaving the frame as the cell's last
# expression lets the notebook render it as a table, instead of the wrapped,
# column-truncated plain text that print() produces.
train_data.head()

    Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  127         120       RL          NaN     4928   Pave   NaN      IR1   
1  889          20       RL         95.0    15865   Pave   NaN      IR1   
2  793          60       RL         92.0     9920   Pave   NaN      IR1   
3  110          20       RL        105.0    11751   Pave   NaN      IR1   
4  422          20       RL          NaN    16635   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
2         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
3         Lvl    AllPub  ...        0    NaN  MnPrv         NaN       0   
4         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2007        WD         Normal    

In [30]:
# Summarize each column's dtype and non-null count. DataFrame.info() writes the
# report to stdout itself and returns None, so wrapping it in print() only
# appends a stray "None" line after the report.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1168 non-null   int64  
 1   MSSubClass     1168 non-null   int64  
 2   MSZoning       1168 non-null   object 
 3   LotFrontage    954 non-null    float64
 4   LotArea        1168 non-null   int64  
 5   Street         1168 non-null   object 
 6   Alley          77 non-null     object 
 7   LotShape       1168 non-null   object 
 8   LandContour    1168 non-null   object 
 9   Utilities      1168 non-null   object 
 10  LotConfig      1168 non-null   object 
 11  LandSlope      1168 non-null   object 
 12  Neighborhood   1168 non-null   object 
 13  Condition1     1168 non-null   object 
 14  Condition2     1168 non-null   object 
 15  BldgType       1168 non-null   object 
 16  HouseStyle     1168 non-null   object 
 17  OverallQual    1168 non-null   int64  
 18  OverallC

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Leaving the result as the cell's last expression lets the notebook
# render it as a table rather than wrapped plain text from print().
train_data.describe()

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1168.000000  1168.000000    954.00000    1168.000000  1168.000000   
mean    724.136130    56.767979     70.98847   10484.749144     6.104452   
std     416.159877    41.940650     24.82875    8957.442311     1.390153   
min       1.000000    20.000000     21.00000    1300.000000     1.000000   
25%     360.500000    20.000000     60.00000    7621.500000     5.000000   
50%     714.500000    50.000000     70.00000    9522.500000     6.000000   
75%    1079.500000    70.000000     80.00000   11515.500000     7.000000   
max    1460.000000   190.000000    313.00000  164660.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1168.000000  1168.000000   1168.000000  1161.000000  1168.000000  ...   
mean      5.595890  1970.930651   1984.758562   102.310078   444.726027  ...   
std       1.124343    30.145255     20.785185   182.595606   462.664785  ..

In [34]:
# Stack the training features (target column removed) on top of the test rows so
# both splits go through the same preprocessing. Each frame keeps its own row
# labels, so the combined frame is later split back by position, not by label.
combined_data = pd.concat(
    [train_data.drop('SalePrice', axis=1), test_data],
    axis=0,
)


In [35]:
# Partition the column names by dtype: numeric columns vs. object-dtype columns.
numerical_cols = list(combined_data.select_dtypes(include='number').columns)
categorical_cols = list(combined_data.select_dtypes(include=['object']).columns)


In [36]:
# Per-column fill values: the mean for numeric columns, and the first mode
# (row 0 of DataFrame.mode()) for object-dtype columns.
# NOTE(review): these statistics are computed over combined_data, i.e. training
# and test rows together -- confirm that is intended.
numeric_fill = combined_data[numerical_cols].mean()
categorical_fill = combined_data[categorical_cols].mode().iloc[0]

# Replace missing values column by column with the statistics above.
combined_data[numerical_cols] = combined_data[numerical_cols].fillna(numeric_fill)
combined_data[categorical_cols] = combined_data[categorical_cols].fillna(categorical_fill)


In [37]:
# One encoder per object-dtype column, kept in a dict keyed by column name so
# the fitted encoders remain available after this cell.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column of combined_data and overwrite the column
# with the resulting integer codes.
for col, encoder in label_encoders.items():
    combined_data[col] = encoder.fit_transform(combined_data[col])


In [38]:
# Undo the earlier stacking by position: the first len(train_data) rows of
# combined_data are the training features, the remainder are the test features.
n_train = len(train_data)
X_train, X_test = combined_data.iloc[:n_train], combined_data.iloc[n_train:]

# The target is read from the original training frame.
y_train = train_data['SalePrice']


In [39]:
# Columns to carry forward, in the order they should appear in the subsets.
# NOTE(review): this appears to list every feature column (including 'Id'),
# so the subsets below may simply be copies of X_train / X_test -- confirm.
cols_of_interest = [
    'Id', 'MSSubClass', 'MSZoning', 'LotFrontage',
    'LotArea', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
    'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
    'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
    'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

# Select those columns from each processed split; a missing name raises KeyError.
X_train_subset = X_train.loc[:, cols_of_interest]
X_test_subset = X_test.loc[:, cols_of_interest]

In [40]:
# Show the first few rows of each processed subset as a sanity check.
# display() is supplied by the Jupyter/IPython kernel and renders each frame
# as a table, where print() would emit wrapped, column-truncated plain text.
display(X_train_subset.head(), X_test_subset.head())

    Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0  127         120         3    70.049958     4928       1      0         0   
1  889          20         3    95.000000    15865       1      0         0   
2  793          60         3    92.000000     9920       1      0         0   
3  110          20         3   105.000000    11751       1      0         0   
4  422          20         3    70.049958    16635       1      0         0   

   LandContour  Utilities  ...  ScreenPorch  PoolArea  PoolQC  Fence  \
0            3          0  ...            0         0       2      2   
1            3          0  ...          224         0       2      2   
2            3          0  ...            0         0       2      2   
3            3          0  ...            0         0       2      2   
4            3          0  ...            0         0       2      2   

   MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0            2        0  