In [1]:
import pandas as pd

# Read the three competition CSVs in order: training set, test set, sample submission
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

# Preview the top rows of the training frame
print(train_df.head())


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [4]:
# Per-column count of missing cells, largest first
missing_values = (
    train_df.isna()
    .sum()
    .sort_values(ascending=False)
)
# Show only the columns that actually have gaps
missing_values.loc[missing_values.gt(0)]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [5]:
# Drop columns with a large number of missing values
# (each of these has well over a thousand nulls in the count shown above).
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
# errors='ignore' keeps this cell idempotent: train_df is overwritten here, so
# re-running the cell would otherwise raise KeyError on the already-removed columns.
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')

In [6]:
# Fill the remaining gaps in the training frame.

# Numeric columns (float64 / int64): replace NaN with the column median
num_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
numeric_block = train_df[num_cols]
train_df[num_cols] = numeric_block.fillna(numeric_block.median())

# Object (string) columns: replace NaN with the most frequent value.
# mode() yields a DataFrame; its first row holds one mode per column.
cat_cols = train_df.select_dtypes(include=['object']).columns
categorical_block = train_df[cat_cols]
train_df[cat_cols] = categorical_block.fillna(categorical_block.mode().iloc[0])

In [7]:
# Sanity check: total number of missing cells left in the training frame
train_df.isna().sum().sum()

0

In [8]:
# Split the training frame: 'SalePrice' is the target, every other column is a feature
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['SalePrice'])

# Preview the feature matrix
print(X_train.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour  \
0   1          60       RL         65.0     8450   Pave      Reg         Lvl   
1   2          20       RL         80.0     9600   Pave      Reg         Lvl   
2   3          60       RL         68.0    11250   Pave      IR1         Lvl   
3   4          70       RL         60.0     9550   Pave      IR1         Lvl   
4   5          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig  ... OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch  \
0    AllPub    Inside  ...          61             0         0           0   
1    AllPub       FR2  ...           0             0         0           0   
2    AllPub    Inside  ...          42             0         0           0   
3    AllPub    Corner  ...          35           272         0           0   
4    AllPub       FR2  ...          84             0         0           0   

  PoolArea MiscVal  MoSold  YrSold  SaleType  Sale

In [9]:
# Keep the model simple: restrict the feature matrix to a handful of columns
selected_features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
]
X_train = X_train.loc[:, selected_features]

# Preview the reduced feature matrix
print(X_train.head())


   OverallQual  GrLivArea  GarageCars  TotalBsmtSF  FullBath  YearBuilt
0            7       1710           2          856         2       2003
1            6       1262           2         1262         2       1976
2            7       1786           2          920         2       2001
3            7       1717           3          756         1       1915
4            8       2198           3         1145         2       2000


In [10]:
from sklearn.linear_model import LinearRegression

# Build an ordinary least-squares regressor and fit it in one step
# (fit() returns the estimator itself, so the chained form is equivalent)
model = LinearRegression().fit(X_train, y_train)

# Learned weights, one per column of X_train, in column order
print(model.coef_)


[20519.96877576    54.07582887 14646.3158134     29.18210823
 -5503.97370911   336.54674956]


In [11]:
from sklearn.metrics import mean_squared_error

# In-sample predictions: score the same rows the model was fitted on
y_train_pred = model.predict(X_train)

# Training-set mean squared error (not a held-out estimate)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 1459092344.1501384


In [17]:
# Preprocess the test dataset: median for numeric gaps, most frequent value for
# categorical gaps (same recipe as the training imputation).
# NOTE(review): the fill statistics are computed from the test set itself; confirm
# whether training-set statistics should be reused instead.
num_cols = test_df.select_dtypes(include=['float64', 'int64']).columns
test_df[num_cols] = test_df[num_cols].fillna(test_df[num_cols].median())

cat_cols = test_df.select_dtypes(include=['object']).columns
# mode() returns a DataFrame, and fillna(DataFrame) aligns on the row index, so
# passing it directly only fills the first row(s). Take the first row to get a
# Series keyed by column name, which fills every row of each column.
cat_modes = test_df[cat_cols].mode()
# Guard for re-runs: test_df is narrowed to the model's columns further down,
# after which there may be no categorical columns (and no mode row) left.
if not cat_modes.empty:
    test_df[cat_cols] = test_df[cat_cols].fillna(cat_modes.iloc[0])

# Ensure the test data has the same columns as the training data.
# Any feature absent from the test file is added as a constant 0 column.
missing_cols = set(X_train.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0
# Reorder/restrict to exactly the columns (and order) the model was fitted on
test_df = test_df[X_train.columns]

# Make predictions on the test data
test_predictions = model.predict(test_df)

# Prepare the submission file from the sample submission template;
# predictions are assigned positionally (assumes the template rows are in the
# same order as the test file — TODO confirm)
submission_df = sample_submission_df.copy()
submission_df['SalePrice'] = test_predictions

# Save the submission file
submission_df.to_csv('data/submission.csv', index=False)
print('Submission file saved.')


Submission file saved.
