Importing all the libraries and models used:

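*   pandas for reading and analysing the data
*   scikit-learn (sklearn) for two of the three models we used (linear regression and random forest), as well as imputation, label encoding and the train/validation split
*   XGBoost for our third model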


In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model  import LinearRegression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


**Data processing**

In [0]:
# --- Read the data ---------------------------------------------------------
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

# Rows without a target cannot be used for training. The index is reset
# afterwards so that row labels equal row positions: the imputed frames built
# further down come from NumPy arrays, and they must line up with `a` when the
# numerical and categorical parts are concatenated column-wise.
train = train.dropna(axis=0, subset=['SalePrice']).reset_index(drop=True)
Train_y = train.SalePrice                   # the target column, without NaN
train = train.drop(['SalePrice'], axis=1)   # the predictive columns

# --- Categorical part ------------------------------------------------------
drop_X = train.select_dtypes(include=['object'])
drop_test_X = test.select_dtypes(include=['object'])

# Keep only the categorical columns whose set of values is identical in train
# and test, so that a LabelEncoder fitted on train can also transform test.
# Every other categorical column is dropped.
Object_Columns = list(drop_X.columns)
good_columns = [col for col in Object_Columns
                if set(train[col]) == set(test[col])]
bad_columns = [col for col in Object_Columns if col not in good_columns]
a = drop_X.drop(bad_columns, axis=1)
b = drop_test_X.drop(bad_columns, axis=1)

# Fill the missing values with the constant string 'None'. The imputer is
# fitted on train only and then applied to test, the same way the numerical
# imputer is used below.
SI = SimpleImputer(strategy='constant', fill_value='None')
SI_X = pd.DataFrame(SI.fit_transform(a), index=a.index, columns=a.columns)
SI_Test_X = pd.DataFrame(SI.transform(b), index=b.index, columns=b.columns)

# Encode the categorical data as integers; the encoder is re-fitted on the
# train values of each column before transforming the matching test column.
LE = LabelEncoder()
for col in good_columns:
    a[col] = LE.fit_transform(SI_X[col])
    b[col] = LE.transform(SI_Test_X[col])

# --- Numerical part --------------------------------------------------------
number_X = train.select_dtypes(exclude=['object'])
number_test_X = test.select_dtypes(exclude=['object'])

# Fill the missing values using SimpleImputer.
# NOTE(review): despite the "Median" in the variable names, the strategy is
# 'most_frequent' (the mode) -- confirm this is what was intended.
SI_Median = SimpleImputer(strategy='most_frequent')
SI_Median_X = pd.DataFrame(SI_Median.fit_transform(number_X),
                           index=number_X.index,
                           columns=number_X.columns)
SI_Median_test_X = pd.DataFrame(SI_Median.transform(number_test_X),
                                index=number_test_X.index,
                                columns=number_test_X.columns)

# --- Combine numerical part and categorical part ---------------------------
union_x = pd.concat([SI_Median_X, a], axis=1)
union_test_x = pd.concat([SI_Median_test_X, b], axis=1)

# Column-wise concat aligns on the index; a row-count mismatch here would mean
# the two halves were not aligned and features no longer match their targets.
assert len(union_x) == len(Train_y), "feature rows and target rows differ"
assert len(union_test_x) == len(test), "test feature rows and test rows differ"


**Implementation of all three models**

In [0]:
# ---- Fit the three models -------------------------------------------------

# Model 1: linear regression, trained on every training row.
bestm1 = LinearRegression()
bestm1.fit(union_x, Train_y)

# Model 2: random forest regressor (fixed seed), trained on every training row.
bestm2 = RandomForestRegressor(random_state=5)
print(bestm2)
bestm2.fit(union_x, Train_y)

# Model 3: XGBoost. 20% of the training rows are held out as a validation set
# that is passed to fit() as the evaluation set for early stopping.
train_x, x_val, train_y, y_val = train_test_split(
    union_x,
    Train_y,
    train_size=0.8,
    test_size=0.2,
    random_state=7,
)
my_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=2000,
    learning_rate=0.1,
    n_jobs=10,
)
# NOTE(review): early_stopping_rounds is passed to fit() here; newer XGBoost
# releases appear to expect it in the constructor instead -- verify against
# the installed version before upgrading.
my_model.fit(
    train_x,
    train_y,
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=5,
    verbose=False,
)

# ---- Predict on the test features -----------------------------------------
bestprediction1 = bestm1.predict(union_test_x)
bestprediction2 = bestm2.predict(union_test_x)
bestprediction3 = my_model.predict(union_test_x)

# Final prediction: unweighted mean of the three models' predictions.
bestprediction = (bestprediction2 + bestprediction3 + bestprediction1) / 3

**Outputting the predictions**

In [7]:
# Pair each test Id with its averaged prediction and write the two columns
# (Id, SalePrice) to submission.csv without the DataFrame index.
predictionframe = test[['Id']].assign(SalePrice=bestprediction)
predictionframe.to_csv('submission.csv', index=False)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=5, verbose=0, warm_start=False)


  if getattr(data, 'base', None) is not None and \
