In [29]:
import pandas as pd
# Read 'houses_train.csv' (path relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('houses_train.csv')

In [36]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [234]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

# Preprocessing

In [142]:
# Columns left out of the feature matrix: the target ('price') plus
# 'Unnamed: 0', 'region' and 'url'.
non_feature_columns = ['Unnamed: 0', 'region', 'url', 'price']

# Index.difference returns the remaining column names in sorted order,
# so the features end up alphabetically ordered.
feature_columns = df.columns.difference(non_feature_columns)

X = df[feature_columns]
y = df['price']

In [143]:
# Preview the feature matrix (target and excluded columns are gone).
X.head()

Unnamed: 0,area,building_type,ceiling_height,condition,district,floor,max_floor,num_bathrooms,num_rooms,street
0,75.0,stone,3.0,newly repaired,Arabkir,3,5,1,2,Mamikoniants St
1,139.0,stone,2.8,newly repaired,Arabkir,1,5,1,3,Sundukyan St
2,80.0,stone,2.6,good,Erebuni,3,5,1,3,Tigran Mets Ավե (Erebuni)
3,90.0,panel,2.6,good,Nor Norq,6,9,1,4,Vilnyus St
4,86.0,other,2.8,good,Achapnyak,3,5,1,3,Halabyan St


In [271]:
# Hold out 5% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=11,
)

In [272]:
# Names of the object-dtype columns of the training split; these are the
# columns that get one-hot encoded below.
cat_columns = X_train.select_dtypes(include=['object']).columns

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; with handle_unknown='ignore' a category that was not seen during
# fit is encoded as all zeros instead of raising.
# NOTE: `sparse=False` was renamed to `sparse_output=False` in scikit-learn 1.2
# (old name removed in 1.4) -- switch the keyword when upgrading scikit-learn.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.fit(X_train[list(cat_columns)])


def one_hot_encode(frame, encoder, columns):
    """Return a copy of `frame` with `columns` replaced by their one-hot encoding.

    The encoded columns are built as a single DataFrame and concatenated once.
    Adding them one by one to a frame produced by the split is what triggered the
    pandas warnings about fragmented frames / setting values on a copy; building
    and joining the block in one go avoids them. The non-encoded columns keep
    their order and the encoded columns are appended at the end.
    """
    columns = list(columns)
    encoded = pd.DataFrame(
        encoder.transform(frame[columns]),
        columns=encoder.get_feature_names_out(),
        index=frame.index,  # keep row alignment with the original frame
    )
    return pd.concat([frame.drop(columns=columns), encoded], axis=1)


X_train = one_hot_encode(X_train, ohe, cat_columns)
X_val = one_hot_encode(X_val, ohe, cat_columns)

In [275]:
# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits (both names are rebound to the transformed data).
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

MinMaxScaler()

# KNN

In [277]:
from sklearn.neighbors import KNeighborsRegressor

# Hyper-parameters explored for k-nearest-neighbours regression.
knn_param_grid = {
    'n_neighbors': range(1, 10),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

knn = KNeighborsRegressor()
gs = GridSearchCV(knn, knn_param_grid)
gs.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 10), 'p': [1, 2],
                         'weights': ['uniform', 'distance']})

In [278]:
# Show the KNN configuration selected by the grid search.
gs.best_estimator_

KNeighborsRegressor(n_neighbors=8, p=1, weights='distance')

In [279]:
# Mean cross-validation score of the selected KNN model on the training split
# (default cv and scoring settings, since none are passed).
cross_val_score(gs.best_estimator_, X_train, y_train).mean()

0.6580675087637411

# Ridge

In [280]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths for ridge regression.
ridge_param_grid = {'alpha': [0.9, 0.95, 1, 2]}

r = Ridge()
# `gs` is rebound here: from this point on it refers to the Ridge search, not KNN.
gs = GridSearchCV(r, ridge_param_grid)
gs.fit(X_train, y_train)

GridSearchCV(estimator=Ridge(), param_grid={'alpha': [0.9, 0.95, 1, 2]})

In [281]:
# Show the Ridge configuration selected by the grid search.
# NOTE(review): if the selected alpha is the largest value in the grid, consider
# extending the grid upwards.
gs.best_estimator_

Ridge(alpha=2)

In [282]:
# Mean cross-validation score of the selected Ridge model on the training split
# (default cv and scoring settings, since none are passed).
cross_val_score(gs.best_estimator_, X_train, y_train).mean()

0.7410511039217285

In [None]:
# Ridge has the better mean cross-validation score (about 0.74 vs. 0.66 for KNN), so we choose it.

In [283]:
# Score the selected Ridge model on the held-out validation split.
gs.best_estimator_.score(X_val, y_val)

0.8082483165047498

In [284]:
# Predictions of the selected Ridge model for the validation split.
# NOTE(review): y_pred is not used by any later cell shown in this notebook.
y_pred = gs.best_estimator_.predict(X_val)

In [None]:
# Let's save the model and the preprocessors so that the final prediction function can reuse them.

In [None]:
# Persist the selected model and the fitted preprocessors. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump is
# done (a bare open(...) would leave it open until garbage collection).
with open('ridge.sav', 'wb') as model_file:
    pickle.dump(gs.best_estimator_, model_file)

with open('ohe.sav', 'wb') as ohe_file:
    pickle.dump(ohe, ohe_file)

with open('scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [303]:
# We can also build a pipeline that performs all the steps at once (we will not use it as the final model here).

In [260]:
from sklearn.pipeline import make_pipeline

# By this point X_train has been replaced by the scaler's output (an array, not
# a DataFrame), so it has no select_dtypes. Recreate the raw training split with
# the same test_size/random_state as above to get the unprocessed DataFrame back;
# y_train is unchanged and still matches these rows.
X_train_raw = train_test_split(X, y, test_size=0.05, random_state=11)[0]

# Only the numeric columns go into this pipeline (no one-hot encoding step here).
X_train_num = X_train_raw.select_dtypes(exclude=['object'])

pl = make_pipeline(MinMaxScaler(), Ridge(alpha=2))
pl.fit(X_train_num, y_train)
pl.score(X_train_num, y_train)

In [295]:
def final_predict(final_test_df):
    """Predict prices for raw listings using the artifacts saved by this notebook.

    Loads the fitted model, one-hot encoder and scaler from 'ridge.sav',
    'ohe.sav' and 'scaler.sav' in the current working directory and applies the
    same steps as in training: select the feature columns, one-hot encode the
    object-dtype columns, scale, predict.

    Parameters
    ----------
    final_test_df : pandas.DataFrame
        Raw listings. The columns 'Unnamed: 0', 'region', 'url' and 'price' are
        ignored when present. The frame itself is not modified.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns for the prepared features.
    """
    # SECURITY: pickle.load can execute arbitrary code -- only load files that
    # were written by this notebook, never files from an untrusted source.
    # `with` closes each file handle as soon as the object is loaded.
    with open('ridge.sav', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open('ohe.sav', 'rb') as ohe_file:
        ohe = pickle.load(ohe_file)
    with open('scaler.sav', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Same exclusions as in training; 'price' is dropped as well so that a frame
    # still containing the target does not leak it into the features.
    feature_columns = final_test_df.columns.difference(
        ['Unnamed: 0', 'region', 'url', 'price']
    )
    X_test = final_test_df[feature_columns]

    # Build the encoded block once and concatenate it, instead of assigning the
    # columns one by one onto a slice of the caller's frame and dropping in place.
    cat_columns = list(X_test.select_dtypes(include=['object']).columns)
    encoded = pd.DataFrame(
        ohe.transform(X_test[cat_columns]),
        columns=ohe.get_feature_names_out(),
        index=X_test.index,
    )
    X_test = pd.concat([X_test.drop(columns=cat_columns), encoded], axis=1)

    return loaded_model.predict(scaler.transform(X_test))

In [296]:
from sklearn.metrics import r2_score

df_test = pd.read_csv('houses_test.csv')
y_test_true = df_test['price']
y_test_pred = final_predict(df_test.drop('price', axis=1))

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric, but
# R^2 is not, so swapping the arguments gives a different (wrong) value.
# The RMSE is computed as sqrt(MSE) so it does not depend on the `squared=`
# keyword, which newer scikit-learn releases no longer accept.
test_rmse = mean_squared_error(y_test_true, y_test_pred) ** 0.5
test_r2 = r2_score(y_test_true, y_test_pred)

# Display both metrics (only the last expression of a cell is shown).
pd.Series({'rmse': test_rmse, 'r2': test_r2})

In [None]:
# I will share the test data with you soon, so you can check which result your final function gives.