In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
data = pd.read_csv("train.csv")

In [3]:
for column in data:
    if data[column].dtype in ["int64", "float64"]:
        if data[column].isnull().any():
            data[column] = data[column].fillna(data[column].mean())
    else:
        if data[column].isnull().any():
            data[column] = data[column].fillna(data[column].mode()[0])

In [4]:
data = pd.get_dummies(data, drop_first=True)

In [5]:
X = data.drop(columns='SalePrice')
y = data['SalePrice']

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = LinearRegression()

In [8]:
model.fit(X_train, y_train)

In [9]:
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)

In [10]:
print(f'Validation Mean Squared Error: {val_mse}')

Validation Mean Squared Error: 2641638813.2241855


Лінійна регресія показала малу точність прогнозування та надто високий MSE!Спробуємо Random Forest!

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [12]:
numeric_features = data.select_dtypes(include=["int64","float64"]).columns
categorical_features = data.select_dtypes(include=["object"]).columns

In [13]:
for col in numeric_features:
    data[col].fillna(data[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [14]:
for col in categorical_features:
    data[col].fillna(data[col].mode()[0], inplace=True)

In [15]:
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [16]:
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred)
print(f'Validation Mean Squared Error: {val_mse}')


Validation Mean Squared Error: 821760492.6202474


In [18]:
test_data = pd.read_csv("test.csv")

In [19]:
test_data_prepared = pd.get_dummies(test_data, columns=categorical_features, drop_first=True)
test_data_prepared = test_data.reindex(columns=X_train.columns, fill_value=0)

y_test_pred = model.predict(test_data_prepared)


In [20]:
test_predictions = pd.DataFrame({'PredictedSalePrice': y_test_pred})
for i in test_predictions.iterrows():
    print(i)

(0, PredictedSalePrice    117202.33
Name: 0, dtype: float64)
(1, PredictedSalePrice    153011.0
Name: 1, dtype: float64)
(2, PredictedSalePrice    173201.86
Name: 2, dtype: float64)
(3, PredictedSalePrice    186180.9
Name: 3, dtype: float64)
(4, PredictedSalePrice    223479.5
Name: 4, dtype: float64)
(5, PredictedSalePrice    181135.74
Name: 5, dtype: float64)
(6, PredictedSalePrice    169073.1
Name: 6, dtype: float64)
(7, PredictedSalePrice    167208.04
Name: 7, dtype: float64)
(8, PredictedSalePrice    171644.51
Name: 8, dtype: float64)
(9, PredictedSalePrice    106306.43
Name: 9, dtype: float64)
(10, PredictedSalePrice    182806.15
Name: 10, dtype: float64)
(11, PredictedSalePrice    90898.5
Name: 11, dtype: float64)
(12, PredictedSalePrice    98330.33
Name: 12, dtype: float64)
(13, PredictedSalePrice    153032.17
Name: 13, dtype: float64)
(14, PredictedSalePrice    134035.83
Name: 14, dtype: float64)
(15, PredictedSalePrice    368426.61
Name: 15, dtype: float64)
(16, PredictedSaleP

In [21]:
test_data['PredictedSalePrice'] = y_test_pred

In [22]:
test_data.head
test_data.to_csv("test_data_predixt.csv",index=False)

  values = values.astype(str)
