In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut


In [2]:
df_train = pd.read_csv("price_houses_full_train.csv")
df_test = pd.read_csv("price_houses_full_test.csv")

In [3]:
# print(df_train.info())

In [6]:
def toNumeric(x):
  #return x.map({"no":0,"yes":1})
  return x.map({"no":0,"yes":1, "furnished": 1, "semi-furnished": 0.5, "unfurnished": 0})


def convert_binary(data): 
    for column in list(data.select_dtypes(['object']).columns):
        data[[column]] = data[[column]].apply(toNumeric)
    return data

def convert_threevalues(df):
    furnishingstatus_map = {
    "furnished": 1,
    "semi-furnished": 0.5,
    "unfurnished": 0
    }
    
    df['furnishingstatus'] = df['furnishingstatus'].map(furnishingstatus_map)
    return df
    
data_size = len(df_train)

def train_and_test(data_train, data_test):
    #scaler = MinMaxScaler()
    scaler = MinMaxScaler(feature_range=[0,10000])


    scaled_train_x = data_train[['bedrooms','bathrooms','area','parking']]
    scaled_test_x = data_test[['bedrooms','bathrooms','area','parking']]
    train_binary_x = data_train[['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']]
    test_binary_x = data_test[['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']]
    train_values_y = data_train[['furnishingstatus']]
    test_values_y = data_test[['furnishingstatus']]
#     print(train_values_x.info())
#     print(train_values_x.head())

    train_binary_encoded_x = convert_binary(train_binary_x)
    test_binary_encoded_x = convert_binary(test_binary_x)
    train_status_encoded_y = convert_threevalues(train_values_y)
    test_status_encoded_y = convert_threevalues(test_values_y)
    print(train_status_encoded_y)
#     print(train_status_encoded_x.info())
#     print(train_status_encoded_x.head())

    train_full_x = pd.concat([scaled_train_x,train_binary_encoded_x],axis=1)  #Result is an array
    test_full_x = pd.concat([scaled_test_x,test_binary_encoded_x],axis=1)

#     print(train_full_x)
    scaled_train_y = data_train[['price']]
    scaled_test_y = data_test[['price']]

    scaler.fit(train_full_x)
    train_full_x = scaler.transform(train_full_x)
    scaled_test_x = scaler.transform(test_full_x)

    scaled_regression = LinearRegression()
    scaled_regression.fit(train_full_x,scaled_train_y)

    scaled_predictions = scaled_regression.predict(scaled_test_x)
    
#     print(scaled_regression.coef_)
#     print(scaled_regression.intercept_)
#     print("MAE", mean_absolute_error(scaled_test_y,scaled_predictions))
#     print("MSE", mean_squared_error(scaled_test_y,scaled_predictions))
    
    return mean_absolute_error(scaled_test_y, scaled_predictions), mean_squared_error(scaled_test_y, scaled_predictions)

# train_full_x, test_full_x, train_status_encoded_y, test_status_encoded_y = train_and_test(df_train, df_train)

In [7]:
# ##LeaveOneOut approach
# loo = LeaveOneOut()

# # print(type(scaled_test_x))
# # print(scaled_test_x)
# #For x
# for x_train, x_test in loo.split(train_full_x, y = None, groups=5):
#     print("train index of x:", x_train , "test index of x:", x_test)
#     print("train data of x:", train_full_x[x_train] , "test index of x:", train_full_x[x_test])


# #For y
# # n = 0
# # for y_train, y_test in loo.split(train_status_encoded_y):
# #     if n < 5:
# #         print("train index of y:", y_train , "test index of y:", y_test)
# #         print("train data of y:", train_status_encoded_y.loc[y_train] , "test index of y:", train_status_encoded_y.loc[y_test])
# #         n += 1
# #         print("n = ", n)
# #     else:
# #         break

In [8]:
#Leave-one-out approach
def split_train(data_train, nfolds):
    df_train_to_return = []
    df_test_to_return = []
    n_size = int((len(data_train) / nfolds)+1)
#     print(n_size)
    start = 0
    for l_inc in range(nfolds):
        end = start + n_size
        if end > len(data_train):
            end = len(data_train)
        test_filtered = data_train.iloc[start:end,:]
        train_filtered = data_train
        for l_index in range(start,end):
            train_filtered = train_filtered.drop([l_index])
        start = end
#         print("********************** head test", test_filtered.head())
#         print("********************** head train", train_filtered.head())
#         print("info test", len(test_filtered))
#         print("info train", len(train_filtered))
        df_train_to_return.append(train_filtered)
        df_test_to_return.append(test_filtered)

    return df_train_to_return, df_test_to_return

# splitted_train, splitted_test = split_train(df_train,5)
# print(type(splitted_train))

In [9]:
num_folds = 5
for index in range(num_folds):
    splitted_train, splitted_test = split_train(df_train, num_folds)
    MAE, MSE = train_and_test(splitted_train[index], splitted_test[index])
    print("MAE{0}:{1}".format(index+1, MAE))
    print("MSE{0}:{1}".format(index+1, MSE))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a Da

     furnishingstatus
99                0.5
100               0.5
101               0.5
102               0.0
103               1.0
..                ...
486               1.0
487               0.5
488               0.5
489               1.0
490               0.5

[392 rows x 1 columns]
MAE1:943888.463085622
MSE1:1630817270078.775


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a Da

     furnishingstatus
0                 0.5
1                 0.5
2                 0.5
3                 0.5
4                 1.0
..                ...
486               1.0
487               0.5
488               0.5
489               1.0
490               0.5

[392 rows x 1 columns]
MAE2:853319.1223195628
MSE2:1148331553373.4631


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a Da

     furnishingstatus
0                 0.5
1                 0.5
2                 0.5
3                 0.5
4                 1.0
..                ...
486               1.0
487               0.5
488               0.5
489               1.0
490               0.5

[392 rows x 1 columns]
MAE3:843033.1884258891
MSE3:1368543331232.6318


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a Da

     furnishingstatus
0                 0.5
1                 0.5
2                 0.5
3                 0.5
4                 1.0
..                ...
486               1.0
487               0.5
488               0.5
489               1.0
490               0.5

[392 rows x 1 columns]
MAE4:872499.9564211814
MSE4:1213320957375.739
     furnishingstatus
0                 0.5
1                 0.5
2                 0.5
3                 0.5
4                 1.0
..                ...
391               0.0
392               1.0
393               0.5
394               0.0
395               0.5

[396 rows x 1 columns]
MAE5:782005.4298135951
MSE5:1227001320938.3005


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[[column]] = data[[column]].apply(toNumeric)
A value is trying to be set on a copy of a slice from a Da

In [None]:
# mainroad_map = {
#     "yes": 1,
#     "no": 0
# }

# guestroom_map = {
#     "yes": 1,
#     "no": 0
# }

# basement = {
#     "yes": 1,
#     "no": 0
# }

# hotwaterheating_map = {
#     "yes": 1,
#     "no": 0
# }
# airconditioning_map = {
#     "yes": 1,
#     "no": 0
# }
# prefarea_map = {
#     "yes": 1,
#     "no": 0
# }

# furnishingstatus_map = {
#     "furnished": 2,
#     "semi-furnished": 1,
#     "unfurnished": 0
# }

# df_train['mainroad'] = df_train['mainroad'].map(mainroad_map)
# df_train['guestroom'] = df_train['guestroom'].map(guestroom_map)
# df_train['basement'] = df_train['basement'].map(guestroom_map)
# df_train['hotwaterheating'] = df_train['hotwaterheating'].map(hotwaterheating_map)
# df_train['airconditioning'] = df_train['airconditioning'].map(airconditioning_map)
# df_train['prefarea'] = df_train['prefarea'].map(prefarea_map)
# df_train['furnishingstatus'] = df_train['furnishingstatus'].map(furnishingstatus_map)

# df_test['mainroad'] = df_test['mainroad'].map(mainroad_map)
# df_test['guestroom'] = df_test['guestroom'].map(guestroom_map)
# df_test['basement'] = df_test['basement'].map(guestroom_map)
# df_test['hotwaterheating'] = df_test['hotwaterheating'].map(hotwaterheating_map)
# df_test['airconditioning'] = df_test['airconditioning'].map(airconditioning_map)
# df_test['prefarea'] = df_test['prefarea'].map(prefarea_map)
# df_test['furnishingstatus'] = df_test['furnishingstatus'].map(furnishingstatus_map)


# X_train, X_test, y_train, y_test = df_train.iloc[:, 1:], df_test.iloc[:, 1:], df_train.iloc[:, 0], df_test.iloc[:, 0]

# reg = LinearRegression().fit(X_train, y_train)

# ##predict
# prediction = reg.predict(X_test)

# coef = reg.coef_
# intercept = reg.intercept_

# print("coef:", coef)
# print('intercept', intercept)
# x = np.arange(2275000, 12250000)
# y = x
# plt.figure(figsize = (20,8))
# plt.scatter(y_test, prediction, c="green", marker = "o")
# plt.plot(x, y, c="blue")
# plt.show()