Author Yizhou Li, e-mail: lyzpp2000@163.com

Problem Statement:

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling<br>
or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences<br>
price negotiations than the number of bedrooms or a white-picket fence.<br>
<br>
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition<br>
challenges you to predict the final price of each home.

# 1. Data Preprocessing

In [17]:
# Loading necessary libraries and datasets.
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

# Functions definitions.
# def load_data():
#     train_df = pd.read_csv("input/train.csv")
#     test_df = pd.read_csv("input/test.csv")
#     my_df = train_df.merge(test_df, how='outer')
#     return my_df

def drop_duplicates(my_df):
    my_df.drop_duplicates(inplace=True)
    my_df = my_df.loc[:,~my_df.columns.duplicated()]
    return my_df

def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

def drop_columns_missing_above_80_percent(train_df, test_df) -> None :
    null_series = train_df.isnull().sum().sort_values(ascending=False)
    print("columns have null values:\n", null_series, "\n", sep="")
    temp_lst = []
    for i,v in null_series.items():
        if v / train_df.shape[0] >= 0.8:
            temp_lst.append(i)
    train_df.drop(temp_lst, axis=1, inplace=True)
    null_series = train_df.isnull().sum().sort_values(ascending=False)
    print("after we drop the columns that miss 80 percent values:\n", null_series, sep="")

    null_series = test_df.isnull().sum().sort_values(ascending=False)
    print("columns have null values:\n", null_series, "\n", sep="")
    temp_lst = []
    for i,v in null_series.items():
        if v / test_df.shape[0] >= 0.8:
            temp_lst.append(i)
    test_df.drop(temp_lst, axis=1, inplace=True)
    null_series = test_df.isnull().sum().sort_values(ascending=False)
    print("after we drop the columns that miss 80 percent values:\n", null_series, sep="")

# Load Data.
train_df = pd.read_csv("input/train.csv")
test_df = pd.read_csv("input/test.csv")
# my_df = train_df.merge(test_df, how='outer')

# Drop duplicates.
train_df = drop_duplicates(train_df)
test_df = drop_duplicates(test_df)

# Handle missing values.

# determine categorical and numerical features
numerical_ix = my_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = my_df.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('imputer_num', SimpleImputer(strategy='mean'), numerical_ix), ('imputer_cat', SimpleImputer(strategy='most_frequent'), categorical_ix), ('encoder', OneHotEncoder(), categorical_ix)]

col_transform = ColumnTransformer(transformers=t, remainder="passthrough")
my_df = col_transform.fit_transform(my_df)
my_df = pd.DataFrame(data=my_df)
my_df
# # define the model
# model = SVR(kernel='rbf',gamma='scale',C=100)
# # define the data preparation and modeling pipeline
# pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# # define the model cross-validation configuration
# cv = KFold(n_splits=10, shuffle=True, random_state=1)
# # evaluate the pipeline using cross validation and calculate MAE
# scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# # convert MAE scores to positive values
# scores = absolute(scores)
# # summarize the model performance
# print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

# Split dataset into training and testing sets.

# Feature scaling (Optional). dataset into training and testing sets.


0       208500.0
1       181500.0
2       223500.0
3       140000.0
4       250000.0
          ...   
2914         NaN
2915         NaN
2916         NaN
2917         NaN
2918         NaN
Name: SalePrice, Length: 2919, dtype: float64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,346,347,348,349,350,351,352,353,354,355
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915.0,160.0,21.0,1936.0,4.0,7.0,1970.0,1970.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2915,2916.0,160.0,21.0,1894.0,4.0,5.0,1970.0,1970.0,0.0,252.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2916,2917.0,20.0,160.0,20000.0,5.0,7.0,1960.0,1996.0,0.0,1224.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2917,2918.0,85.0,62.0,10441.0,5.0,5.0,1992.0,1992.0,0.0,337.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
fig, axs = plt.subplots(30, 3)
my_lst = [col for col in train_df.columns]
fig.set_size_inches(25, 200)
row = 30
col = 3
y = train_df["SalePrice"]
for i in range(row):
    for j in range(col):
        if len(my_lst) == 0:
            break
        x = train_df[my_lst.pop()]
        axs[i, j].scatter(x, y)
        axs[i, j].set_xlabel(x.name)
        axs[i, j].set_ylabel("SalePrice")
        axs[i, j].set_title(x.name + " vs SalePrice")

In [None]:
x_data = train_df[['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']]
y_data = train_df["SalePrice"]
# my_train_lst = [col for col in train_df.columns]
# my_test_lst = [col for col in test_df.columns]
# my_train_lst.pop()
# my_test_lst.pop()

# x_data = train_df[my_train_lst]
# test_data = test_df[my_test_lst]
test_data = test_df[ ['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

# 5. Models building, evaluation, and predicting


In [None]:
model = LinearRegression()
model2 = Lasso()
model3 = Ridge()
model4 = DecisionTreeRegressor()
model5 = RandomForestRegressor()

model.fit(X=x_train, y=y_train)
model2.fit(X=x_train, y=y_train)
model3.fit(X=x_train, y=y_train)
model4.fit(X=x_train, y=y_train)
model5.fit(X=x_train, y=y_train)

print("the accuracy score using with LinearRegression() model ", model.score(x_test, y_test))
print("the accuracy score using with Lasso() model ", model2.score(x_test, y_test))
print("the accuracy score using with Ridge() model ", model3.score(x_test, y_test))
print("the accuracy score using with DecisionTreeRegressor() model ", model4.score(x_test, y_test))
print("the accuracy score using with RandomForestRegressor() model ", model5.score(x_test, y_test))

model.fit(X=x_train, y=y_train)
result = model.predict(test_data)
temp = test_df
temp.reset_index(inplace=True)
metric = pd.Series(result, name = 'SalePrice')
final_metric = pd.concat([temp["Id"], metric], axis = 1)
final_metric.to_csv("submission.csv",index =False)

the accuracy score using with LinearRegression() model  0.699274404306837
the accuracy score using with Lasso() model  0.699255842859624
the accuracy score using with Ridge() model  0.6988298969020352
the accuracy score using with DecisionTreeRegressor() model  0.5258261751040558
the accuracy score using with RandomForestRegressor() model  0.647976119535729


ValueError: cannot insert level_0, already exists