In [1]:
# Tracks whether Google Drive has been mounted by mount_drive() in this session.
mounted = False

# Detect whether the notebook is running on Google Colab by attempting to
# import the Colab-only package. `drive` is only bound when the import succeeds.
try:
    import google.colab
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Catch only a failed import: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    IN_COLAB = False

import pandas

def mount_drive():
    """Mount Google Drive at ``/content/drive`` and flag it as mounted.

    The mount is requested with ``force_remount=True``. Afterwards the
    module-level ``mounted`` flag is set to ``True`` so that callers can
    skip mounting again.
    """
    global mounted
    mount_point = "/content/drive"
    drive.mount(mount_point, force_remount=True)
    mounted = True

def open_dataset(filepath):
    """Read a CSV file into a pandas DataFrame.

    Outside Colab, ``filepath`` is passed to ``pandas.read_csv`` unchanged.
    On Colab, ``filepath`` is placed under the project folder on Google
    Drive, and the drive is mounted first if it has not been mounted yet.

    Parameters
    ----------
    filepath : path of the CSV file (relative to the Drive project folder
        when running on Colab).

    Returns
    -------
    The object returned by ``pandas.read_csv``.
    """
    if not IN_COLAB:
        return pandas.read_csv(filepath)

    drive_path = '/content/drive/My Drive/UNI/CE101/{}'.format(filepath)
    if not mounted:
        mount_drive()
    return pandas.read_csv(drive_path)

# Load the training and test sets through open_dataset(), which resolves the
# path (and mounts Google Drive) when running on Colab.
train_dataset = open_dataset("train.csv")
test_dataset = open_dataset("test.csv")

# Show info about the variables contained within both datasets.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line per dataset.
train_dataset.info()
test_dataset.info()

def _missing_value_counts(dataset):
    """Return the number of null values per column, keeping only columns
    that have at least one missing value."""
    null_counts = dataset.isnull().sum()
    return null_counts[null_counts > 0]


# Determine the missing values in the test dataset and show which columns
# have at least one missing value.
test_missing_val = _missing_value_counts(test_dataset)
print('Data missing from test dataset:\n', test_missing_val)

# Same report for the train dataset. `missing_val` ends up holding the
# filtered train counts, exactly as before.
missing_val = _missing_value_counts(train_dataset)
print('Data missing from train dataset:\n', missing_val)

# Bar chart of missing values per column in the train dataset. The Axes is
# kept in a variable instead of being printed (printing only shows its repr),
# and the plot is skipped when nothing is missing because an empty Series
# cannot be plotted.
if not missing_val.empty:
    missing_ax = missing_val.plot.bar(title='Missing values per column (train dataset)')
    missing_ax.set_xlabel('Column')
    missing_ax.set_ylabel('Number of missing values')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [None]:

# Summary statistics (describe()) for the train dataset, then the test dataset.
train_summary = train_dataset.describe()
print(train_summary)
test_summary = test_dataset.describe()
print(test_summary)


# Target variable the model will determine; called y by convention.
y = train_dataset["SalePrice"]

# Feature columns to select from the datasets. The list is not complete:
# all pre-processing of the data should be finished before features are
# finalised.
sales_features = ['Id', 'BedroomAbvGr', '1stFlrSF']
X = train_dataset[sales_features]
X_test = test_dataset[sales_features]

# Basic description of the selected features, train first and test second.
selected_train_summary = X.describe()
selected_test_summary = X_test.describe()
print(selected_train_summary, '\n', selected_test_summary)

In [None]:
# Preview the first few rows of the feature frame X as the cell's output.
X.head()

In [None]:
# Flush and unmount Google Drive, but only on Colab and only if this notebook
# mounted it through mount_drive().
if IN_COLAB and mounted:
    drive.flush_and_unmount()
    # Reset the flag so a later open_dataset() call mounts the drive again
    # instead of reading from a path that is no longer mounted.
    mounted = False

In [26]:
# Template showing how to validate a model with train_test_split.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Sample feature columns for the split; these can be changed later.
sample_features = ['LotArea', 'YearBuilt', 'BedroomAbvGr', 'FullBath']
predict_val = train_dataset.SalePrice
X = train_dataset[sample_features]
y = predict_val

# Split the data into training and validation parts. random_state is fixed
# so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# A DecisionTreeRegressor is used for test purposes only; it can later be
# replaced with the proper regressor type.
price_model = DecisionTreeRegressor(random_state = 1)
price_model.fit(train_X, train_y)

# Predict prices for the validation portion, which was not used in training.
predicted_prices = price_model.predict(val_X)

# Mean absolute error between actual and predicted prices. The arguments
# follow scikit-learn's documented order (y_true, y_pred).
mae = mean_absolute_error(val_y, predicted_prices)
# With better-chosen features the MAE should get smaller.
print("The mean absolute error is:", mae)
print("\nThe predicted values:", predicted_prices[0:5])
print("\nThe actual sales prices:\n", val_y.head(), sep="")

# This is how models should be validated: separate the dataset and check by
# how much the predictions are off on the held-out part.

The mean average value is: 40069.08127853882

The predicted values: [230000. 136500. 130000.  92000. 155000.]

The actual sales prices:
258     231500
267     179500
288     122000
649      84500
1233    142000
Name: SalePrice, dtype: int64
