Author Yizhou Li, e-mail: lyzpp2000@163.com

Problem Statement:

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling<br>
or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences<br>
price negotiations than the number of bedrooms or a white-picket fence.<br>
<br>
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition<br>
challenges you to predict the final price of each home.

# 1. Data Preprocessing

In [53]:
# Step1: Loading necessary libraries and datasets.
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Step2: Functions definitions.
def load_data():
    train_df = pd.read_csv("input/train.csv")
    test_df = pd.read_csv("input/test.csv")
    my_df = train_df.merge(test_df, how='outer')
    return my_df

def drop_duplicates(my_df):
    my_df.drop_duplicates(inplace=True)
    my_df = my_df.loc[:,~my_df.columns.duplicated()]
    return my_df

def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

def drop_columns_missing_above_80_percent(train_df, test_df) -> None :
    null_series = train_df.isnull().sum().sort_values(ascending=False)
    print("columns have null values:\n", null_series, "\n", sep="")
    temp_lst = []
    for i,v in null_series.items():
        if v / train_df.shape[0] >= 0.8:
            temp_lst.append(i)
    train_df.drop(temp_lst, axis=1, inplace=True)
    null_series = train_df.isnull().sum().sort_values(ascending=False)
    print("after we drop the columns that miss 80 percent values:\n", null_series, sep="")

    null_series = test_df.isnull().sum().sort_values(ascending=False)
    print("columns have null values:\n", null_series, "\n", sep="")
    temp_lst = []
    for i,v in null_series.items():
        if v / test_df.shape[0] >= 0.8:
            temp_lst.append(i)
    test_df.drop(temp_lst, axis=1, inplace=True)
    null_series = test_df.isnull().sum().sort_values(ascending=False)
    print("after we drop the columns that miss 80 percent values:\n", null_series, sep="")

# Step3: Load Data; Drop duplicates; Handle missing values.
my_df = load_data()
my_df = drop_duplicates(my_df)

# todo 

# test
# d = {'col1': [1, np.nan, 2], 'col2': [np.nan, 4, 2]}
# df = pd.DataFrame(data=d)

# imputer = SimpleImputer()
# d = imputer.fit_transform(df.select_dtypes(include="number"))
# df = pd.DataFrame(data=d, columns=df.columns)
# df
# print(my_df['LotFrontage'].isnull().sum())
# print(my_df["LotFrontage"].mean())
# my_df["LotFrontage"].head(10)
# print(my_df.info())

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
d = imputer.fit_transform(my_df.select_dtypes(include="number"))
my_numerical_df = pd.DataFrame(data=d, columns=my_df.select_dtypes(include="number").columns)

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
d = imputer.fit_transform(my_df.select_dtypes(exclude="number"))
my_categorical_df = pd.DataFrame(data=d, columns=my_df.select_dtypes(exclude="number").columns)

# my_df = my_numerical_df.merge(my_categorical_df, left_on=my_numerical_df.columns, right_on=my_categorical_df.columns)
# my_df.info()

# Step4: Encode categorical values into numerical type.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
my_categorical_df = np.array(ct.fit_transform(my_categorical_df))
df = pd.DataFrame(data=my_categorical_df)
df

# Step5: Split dataset into training and testing sets.

# Step6: Feature scaling (Optional). dataset into training and testing sets.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,Reg,Lvl,AllPub,...,Attchd,RFn,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
1,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,Reg,Lvl,AllPub,...,Attchd,RFn,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
2,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,IR1,Lvl,AllPub,...,Attchd,RFn,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
3,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,IR1,Lvl,AllPub,...,Detchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Abnorml
4,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,IR1,Lvl,AllPub,...,Attchd,RFn,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,0.0,0.0,0.0,0.0,1.0,Pave,Grvl,Reg,Lvl,AllPub,...,Attchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
2915,0.0,0.0,0.0,0.0,1.0,Pave,Grvl,Reg,Lvl,AllPub,...,CarPort,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Abnorml
2916,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,Reg,Lvl,AllPub,...,Detchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Abnorml
2917,0.0,0.0,0.0,1.0,0.0,Pave,Grvl,Reg,Lvl,AllPub,...,Attchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal


In [None]:
fig, axs = plt.subplots(30, 3)
my_lst = [col for col in train_df.columns]
fig.set_size_inches(25, 200)
row = 30
col = 3
y = train_df["SalePrice"]
for i in range(row):
    for j in range(col):
        if len(my_lst) == 0:
            break
        x = train_df[my_lst.pop()]
        axs[i, j].scatter(x, y)
        axs[i, j].set_xlabel(x.name)
        axs[i, j].set_ylabel("SalePrice")
        axs[i, j].set_title(x.name + " vs SalePrice")

In [None]:
x_data = train_df[['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']]
y_data = train_df["SalePrice"]
# my_train_lst = [col for col in train_df.columns]
# my_test_lst = [col for col in test_df.columns]
# my_train_lst.pop()
# my_test_lst.pop()

# x_data = train_df[my_train_lst]
# test_data = test_df[my_test_lst]
test_data = test_df[ ['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

# 5. Models building, evaluation, and predicting


In [None]:
model = LinearRegression()
model2 = Lasso()
model3 = Ridge()
model4 = DecisionTreeRegressor()
model5 = RandomForestRegressor()

model.fit(X=x_train, y=y_train)
model2.fit(X=x_train, y=y_train)
model3.fit(X=x_train, y=y_train)
model4.fit(X=x_train, y=y_train)
model5.fit(X=x_train, y=y_train)

print("the accuracy score using with LinearRegression() model ", model.score(x_test, y_test))
print("the accuracy score using with Lasso() model ", model2.score(x_test, y_test))
print("the accuracy score using with Ridge() model ", model3.score(x_test, y_test))
print("the accuracy score using with DecisionTreeRegressor() model ", model4.score(x_test, y_test))
print("the accuracy score using with RandomForestRegressor() model ", model5.score(x_test, y_test))

model.fit(X=x_train, y=y_train)
result = model.predict(test_data)
temp = test_df
temp.reset_index(inplace=True)
metric = pd.Series(result, name = 'SalePrice')
final_metric = pd.concat([temp["Id"], metric], axis = 1)
final_metric.to_csv("submission.csv",index =False)

the accuracy score using with LinearRegression() model  0.699274404306837
the accuracy score using with Lasso() model  0.699255842859624
the accuracy score using with Ridge() model  0.6988298969020352
the accuracy score using with DecisionTreeRegressor() model  0.5258261751040558
the accuracy score using with RandomForestRegressor() model  0.647976119535729


ValueError: cannot insert level_0, already exists