In [None]:
# House classification

### 1. Data reading & first glance

In [31]:
import pandas as pd
# Google Drive share link of the housing dataset.
url = "https://drive.google.com/file/d/1SxHrO6j5552c7uVUWKqqFKaSSkx06Gh8/view?usp=share_link"

# The file id is the second-to-last "/"-separated piece of the share link;
# it is appended to the export=download URL that is handed to pandas.
file_id = url.split("/")[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"

house = pd.read_csv(path)
house.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive'],
      dtype='object')

In [9]:
# Split the target column off the feature table. `pop` removes the column from
# `house` in place, so a bare call raises KeyError when this cell is executed a
# second time; the membership check makes re-running the cell a harmless no-op
# that leaves the previously extracted `y` untouched.
if "Expensive" in house.columns:
    y = house.pop("Expensive")

As you can see, the target is just a bunch of zeros and ones. 1 means the house is
expensive and 0 means it is not:

In [10]:
# Print every target label as one plain Python list.
target_labels = y.tolist()
print(target_labels)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 

The "feature vector" (the columns used to predict the target) is denoted as X.

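Some datasets contain columns that are unique identifiers of each observation. Unique identifiers are not useful when it comes to finding patterns in the data, so the first step when building X, the vector of predictor features, is normally to drop them. In this dataset none of the remaining columns is an identifier, so once the target has been removed, all the remaining columns are used as predictors: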

In [11]:
# The target was already popped out of `house`, so what is left are the predictors.
# Note that X is another name for the same DataFrame object, not a copy.
X = house
X.head(n=5)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,8450,65.0,856,3,0,2,0,0
1,9600,80.0,1262,3,0,2,298,0
2,11250,68.0,920,3,0,2,0,0
3,9550,60.0,756,3,0,3,0,0
4,14260,84.0,1145,4,0,3,192,0


### 3. Train-test split
Before performing a deeper exploration, we will split the dataset into 2 chunks: train and test. We will use the train set to find patterns in the data and create a model. The test set will remain untouched, unseen, unexplored. It will be the "reality check" for our model, it will let us know whether our model is able to generalize.

Here, we're using the function train_test_split to shuffle the observations randomly and create the train and test sets for both the X and the y objects at the same time, ensuring that they are shuffled the same way:

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

In [13]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
772,7819,94.0,1029,3,0,2,144,0
157,12003,92.0,774,4,0,3,0,0
360,7540,,888,2,0,2,0,192
744,5395,41.0,1337,2,0,2,96,0
150,10356,120.0,969,3,0,2,0,0


In [14]:
# Peek at the first training labels; the index values should match X_train's.
y_train.head()

772    0
157    1
360    0
744    0
150    0
Name: Expensive, dtype: int64

In [17]:
# Peek at the first rows of the held-out test features.
X_test.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
1253,17542,,1192,3,0,2,220,0
202,7000,50.0,617,2,0,1,0,0
213,13568,43.0,990,3,0,2,224,0
1331,10780,55.0,911,3,0,2,0,0
120,21453,,938,1,0,2,0,0


In [18]:
# Peek at the first test labels; the index values should match X_test's.
y_test.head()

1253    1
202     0
213     0
1331    0
120     0
Name: Expensive, dtype: int64

In [19]:
# (rows, columns) of the training features.
X_train.shape

(1168, 8)

In [20]:
# (rows, columns) of the test features.
X_test.shape

(292, 8)

### 4. Iteration 0 - An intuition-based model
Machine Learning models should never be the first option to solve any problem. Machine Learning adds complexity to a business, and it should only be implemented if it represents a clear advantage compared to a simple working solution that the company has built following simple rules, common sense or gained expertise.

Therefore, the first step when solving any problem should be to create a simple (even dumb) working solution and learn to evaluate it. In our example: how well can we predict whether a house is expensive with an extremely simple rule?

Let's start by assuming that nobody buys an expensive house (after all, not everyone has that much money), i.e. that every house is not expensive. We will call this "model" the "pessimistic" one, and use it to make predictions for both the train and the test set:

y_train.value_counts()

In [21]:
# Count how many training labels fall into each class (0 vs 1).
y_train.value_counts()

0    989
1    179
Name: Expensive, dtype: int64

In [22]:
# Number of training observations.
len(y_train)

1168

Our predictions for the train set will be just 1168 zeros (not expensive):

In [23]:
# Baseline "pessimistic" prediction: a constant 0 for every training row,
# indexed exactly like y_train so the two Series line up.
pred_pessimistic_train = pd.Series(data=0, index=y_train.index)

In [25]:
# Peek at the first baseline predictions for the train set.
pred_pessimistic_train.head()

772    0
157    0
360    0
744    0
150    0
dtype: int64

Now: how good are those predictions? We will compare them with the true values and find out the percentage of correctly predicted houses. This metric is called accuracy:

In [26]:
from sklearn.metrics import accuracy_score

# Share of training rows where the all-zeros baseline agrees with the true label.
train_accuracy = accuracy_score(y_true=y_train, y_pred=pred_pessimistic_train)

# Display the score rounded to two decimals.
round(train_accuracy, 2)

0.85

Okay, we have about 85% accuracy on our train set. Let's find out how it would have done on our test set:

In [27]:
# Number of test observations.
len(y_test)

292

In [28]:
# Same constant-0 baseline for the test set, indexed exactly like y_test.
pred_pessimistic_test = pd.Series(data=0, index=y_test.index)

In [29]:
# Peek at the first baseline predictions for the test set.
pred_pessimistic_test.head()

1253    0
202     0
213     0
1331    0
120     0
dtype: int64

In [30]:
# Share of test rows where the all-zeros baseline agrees with the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=pred_pessimistic_test)

# Display the score rounded to two decimals.
round(test_accuracy, 2)

0.87

The performance is pretty much the same. 