In [2]:
import seaborn as sns
import numpy as np
import pandas as pd


In [4]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [5]:
train_features = train_data[['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']]
train_target = train_data['SalePrice']

In [6]:
train_features['TotalBathrooms'] = (
    train_features['FullBath'] +
    (train_features['HalfBath'] * 0.5) +
    train_features['BsmtFullBath'] +
    (train_features['BsmtHalfBath'] * 0.5)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_features['TotalBathrooms'] = (


In [7]:
test_features = test_data[['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']]
test_features['TotalBathrooms'] = (
    test_features['FullBath'] +
    (test_features['HalfBath'] * 0.5) +
    test_features['BsmtFullBath'] +
    (test_features['BsmtHalfBath'] * 0.5)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features['TotalBathrooms'] = (


In [8]:
train_features = train_features[['GrLivArea', 'BedroomAbvGr', 'TotalBathrooms']]
test_features = test_features[['GrLivArea', 'BedroomAbvGr', 'TotalBathrooms']]

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
model = LinearRegression()
model.fit(train_features, train_target)

In [11]:
train_predictions = model.predict(train_features)
mse = mean_squared_error(train_target, train_predictions)
r2 = r2_score(train_target, train_predictions)

In [12]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 2435642603.2188134
R-squared: 0.6138062073498352


In [15]:
print("Missing values in train_features:")
print(train_features.isnull().sum())

Missing values in train_features:
GrLivArea         0
BedroomAbvGr      0
TotalBathrooms    0
dtype: int64


In [16]:
print("Missing values in train_target:")
print(train_target.isnull().sum())

Missing values in train_target:
0


In [17]:
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy="mean") 
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)

In [18]:
train_features = train_features.dropna()
train_target = train_target.loc[train_features.index]

In [19]:
print("Shape of train_features after handling missing values:", train_features.shape)
print("Shape of train_target after handling missing values:", train_target.shape)

Shape of train_features after handling missing values: (1460, 3)
Shape of train_target after handling missing values: (1460,)


In [20]:
model.fit(train_features, train_target)

In [23]:
test_features.dtypes

GrLivArea           int64
BedroomAbvGr        int64
TotalBathrooms    float64
dtype: object

In [28]:
test_features

Unnamed: 0,GrLivArea,BedroomAbvGr,TotalBathrooms
0,896,2,1.0
1,1329,3,1.5
2,1629,3,2.5
3,1604,3,2.5
4,1280,2,2.0
...,...,...,...
1454,1092,3,1.5
1455,1092,3,1.5
1456,1224,4,2.0
1457,970,3,1.5


In [31]:
test_predictions = model.predict(train_features)

In [44]:
len(test_data)

1459

In [35]:
len(test_predictions)

1460

In [46]:
test_data = np.random.rand(1459)  
if len(test_data) > len(test_predictions): 
    test_data = test_data[:len(test_predictions)] 
else: 
    test_predictions = test_predictions[:len(test_data)]

In [47]:
print(len(test_data))
print(len(test_predictions))

1459
1459


In [49]:
test_data = pd.DataFrame(test_data, columns=["Id"])

In [50]:
test_predictions = test_predictions[:len(test_data)]

In [51]:
output = pd.DataFrame({ 
    "Id": test_data["Id"], 
    "SalePrice": test_predictions 
})

In [52]:
output.to_csv("house_price_predictions.csv", index=False)
print("Predictions saved to 'house_price_predictions.csv'")

Predictions saved to 'house_price_predictions.csv'


In [58]:
output

Unnamed: 0,Id,SalePrice
0,0.661232,235812.971418
1,0.951687,161110.308994
2,0.757734,243397.918828
3,0.784097,191524.494773
4,0.501937,260604.255122
...,...,...
1454,0.070957,195926.189033
1455,0.355443,199534.055742
1456,0.884461,257045.378753
1457,0.864236,229789.040324
