In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns

In [None]:
# Read the competition's train and test CSVs.
df_1 = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_2 = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Give the test frame a placeholder target so both frames share the same
# columns and can be preprocessed together; it is dropped again before prediction.
if 'SalePrice' not in df_2:
    df_2['SalePrice'] = 0

# Stack train on top of test and key every row by its Id.
df = pd.concat([df_1, df_2], axis=0).set_index('Id')

In [None]:
# Preview the first five rows of the combined train + test frame.
df.head(n=5)

In [None]:
# Keep only the columns that contain at least one missing value.
df_null = df.loc[:, df.isnull().any()]

### Visualizing the amount of null values

In [None]:
# Each cell of the heatmap is one (row, column) entry of the missing-value mask.
sns.heatmap(data=df_null.isna())

### Dealing with strings & dropping useless columns
* Copying the object (string) columns of `df` into `df_objects`
* Dropping object columns that have more than 1100 null values (e.g. `Alley`)

In [None]:
# Copy of the object-dtype (string) columns of df.
df_objects = df.select_dtypes(include=['object'])

# Drop from df every object column that has more than 1100 missing entries.
object_null_counts = df_objects.isna().sum()
df = df.drop(columns=object_null_counts[object_null_counts > 1100].index)

### One hot encoding

In [None]:
# Apply the same >1100-missing filter to the string-only frame.
sparse_counts = df_objects.isna().sum()
df_objects = df_objects.drop(columns=sparse_counts[sparse_counts > 1100].index)

# Turn remaining gaps into an explicit 'null' category so that get_dummies
# produces a separate indicator column for them.
df_objects = df_objects.fillna('null')

# One indicator column per (column, category) pair.
df_objects_encoded = pd.get_dummies(data=df_objects)

In [None]:
# Remove the indicator columns created for the 'null' placeholder category.
# get_dummies names its output "<column>_<category>" by default, so those
# columns end in "_null". Matching the suffix (rather than any occurrence of
# the substring "null") avoids dropping an unrelated column whose name merely
# contains "null", and a single drop call avoids rebuilding the frame once
# per removed column.
null_indicator_cols = [
    col for col in df_objects_encoded.columns if col.endswith('_null')
]
df_objects_encoded = df_objects_encoded.drop(columns=null_indicator_cols)

In [None]:
# Put the one-hot indicator columns next to the remaining columns of df
# (rows are aligned on the shared index).
new_df = pd.concat((df, df_objects_encoded), axis='columns')
new_df.shape[1]

In [None]:
# Column counts of the two frames that were concatenated.
df.shape[1], df_objects_encoded.shape[1]

In [None]:
# The raw string columns are now represented by their indicator columns,
# so drop the originals.
object_cols = df.select_dtypes(include=['object']).columns
new_df = new_df.drop(columns=object_cols)

# Columns that still contain missing values, with their counts.
missing_per_column = new_df.isna().sum()
missing_per_column[missing_per_column > 0]

### Filling null values with the mean or the mode

In [None]:
# Columns imputed with their most frequent value.
Mode_columns = ['GarageCars', 'GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath']
# Columns imputed with their mean, rounded to the nearest whole number.
Mean_columns = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                'TotalBsmtSF', 'GarageArea']

# Build one {column: fill value} mapping; the two lists do not overlap, so
# computing every statistic before filling gives the same values as filling
# column by column.
fill_values = {name: new_df[name].mode()[0] for name in Mode_columns}
fill_values.update({name: np.round(new_df[name].mean()) for name in Mean_columns})
new_df = new_df.fillna(value=fill_values)

# Columns that still contain missing values, with their counts.
remaining_nulls = new_df.isna().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# The first len(df_1) rows of new_df came from train.csv, the rest from test.csv.
n_train_rows = len(df_1)
training_data = new_df.iloc[:n_train_rows]
# The test rows only carry the placeholder SalePrice, so remove it.
testing_data = new_df.iloc[n_train_rows:].drop(columns='SalePrice')

### Importing models for comparing the error scores

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Features and target of the labelled rows.
X = training_data.drop(columns='SalePrice')
y = training_data['SalePrice']

# Hold out 20% of the labelled rows for model comparison. A fixed random_state
# makes the split, and therefore the reported error scores, reproducible
# across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert to NumPy explicitly before reshaping into column vectors, instead of
# relying on how np.reshape handles a pandas Series.
Y_train = Y_train.to_numpy().reshape(-1, 1)
Y_test = Y_test.to_numpy().reshape(-1, 1)
X_train.shape, Y_train.shape

In [None]:
# Linear regression baseline.
# Fit on the training split only: X_test is a subset of X, so fitting on the
# full (X, y) would score the model on rows it has already seen and report a
# training error rather than a validation error.
model_1 = LinearRegression()
model_1.fit(X_train, np.ravel(Y_train))  # ravel: estimators expect a 1-D target
y_pred = model_1.predict(X_test)
mean_squared_error(Y_test, y_pred)

In [None]:
# Gradient-boosted trees.
# Evaluate on the hold-out split with a model that has only seen the training
# split; fitting on the full (X, y) first would leak the X_test rows into
# training and make the score meaningless.
model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model_2.fit(X_train, np.ravel(Y_train))
y_pred = model_2.predict(X_test)
xgb_holdout_mse = mean_squared_error(Y_test, y_pred)

# model_2 is reused later to predict the competition test set, so refit it on
# every labelled row once the hold-out score has been recorded.
model_2.fit(X, y)

xgb_holdout_mse

In [None]:
# Random forest.
# Fit on the training split only: X_test is a subset of X, so fitting on the
# full (X, y) would score the model on rows it has already seen and report a
# training error rather than a validation error.
model_3 = RandomForestRegressor(n_estimators=1000)
model_3.fit(X_train, np.ravel(Y_train))  # ravel: estimators expect a 1-D target
y_pred = model_3.predict(X_test)
mean_squared_error(Y_test, y_pred)

## Since XGBoost gives the lowest error score, we will use it for prediction

In [None]:
# Predict sale prices for the competition test rows.
pred = model_2.predict(testing_data)

# Two-column submission table: the Id taken from the frame's index, and the
# predicted price.
final = pd.DataFrame({'Id': testing_data.index, 'SalePrice': pred})

# Write without the positional index so the file has only Id and SalePrice.
final.to_csv('output.csv', index=False)