In [None]:
!pip install dill

In [None]:
import numpy as np
import pandas as pd
import dill
import random

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import warnings
# Silence every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

# Загрузка данных

In [None]:
# Absolute paths of the input CSV files; both are read with pd.read_csv in later cells.
# NOTE(review): the /content prefix looks like a Google Colab upload directory —
# confirm or adjust before running elsewhere.
TRAIN_DATASET_PATH = '/content/train.csv'
TEST_DATASET_PATH = '/content/test.csv'

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности
* **Social_1, Social_2, Social_3** - социальные показатели местности
* **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровья
* **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры


In [None]:
# Load the training CSV and preview its last rows.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.tail()

**Задача:** разработать модель, которая будет предсказывать цену на недвижимость.

In [None]:
# Data without the target variable.
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.tail()

# **Разделяем данные на train и test и сохраним на диск**

In [None]:
# Separate the feature columns from the 'Price' target column.
X = train_df.drop(columns='Price')
y = train_df['Price']

In [None]:
class DataPreprocessing:
    """Clean raw apartment data: flag and repair outliers, fill missing values.

    Statistics (numeric column medians and the 97.5% quantile of
    ``KitchenSquare``) are learned by :meth:`fit` and reused by
    :meth:`transform`, so the same fitted instance can be applied to several
    data sets.
    """

    def __init__(self):
        """Create an unfitted preprocessor; statistics are set by :meth:`fit`."""
        self.medians = None
        self.kitchen_square_quantile = None

    def fit(self, X):
        """Learn the statistics used by :meth:`transform`.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``KitchenSquare`` column.

        Returns
        -------
        DataPreprocessing
            ``self``, so calls can be chained.
        """
        # numeric_only=True: pandas >= 2.0 raises TypeError when median() meets
        # a non-numeric column; older versions silently skipped such columns.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Adds the indicator columns ``Rooms_outlier``, ``HouseFloor_outlier``,
        ``HouseYear_outlier`` and ``LifeSquare_nan``, drops ``Healthcare_1``
        when present, and fills remaining gaps with the fitted medians.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.medians is None:
            raise RuntimeError("DataPreprocessing.transform() called before fit()")

        # Work on a copy so the caller's DataFrame (possibly a slice of
        # another frame) is never modified in place.
        X = X.copy()

        # Rooms: 0 or >= 6 rooms is treated as an outlier.
        no_rooms = X['Rooms'] == 0
        too_many_rooms = X['Rooms'] >= 6
        X['Rooms_outlier'] = 0
        X.loc[no_rooms | too_many_rooms, 'Rooms_outlier'] = 1
        X.loc[no_rooms, 'Rooms'] = 1
        X.loc[too_many_rooms, 'Rooms'] = self.medians['Rooms']

        # KitchenSquare: missing or above the fitted quantile -> median;
        # anything below 3 is raised to 3.
        bad_kitchen = X['KitchenSquare'].isna() \
            | (X['KitchenSquare'] > self.kitchen_square_quantile)
        X.loc[bad_kitchen, 'KitchenSquare'] = self.medians['KitchenSquare']
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3

        # HouseFloor, Floor: flag rows before repairing them.
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        # Re-evaluated after the zero fix: a flat cannot be above the top floor.
        floor_above_top = X['Floor'] > X['HouseFloor']
        X.loc[floor_above_top, 'HouseFloor'] = X.loc[floor_above_top, 'Floor']

        # HouseYear: years in the future are clipped to the current year.
        current_year = datetime.now().year
        future_year = X['HouseYear'] > current_year
        X['HouseYear_outlier'] = 0
        X.loc[future_year, 'HouseYear_outlier'] = 1
        X.loc[future_year, 'HouseYear'] = current_year

        # Healthcare_1 is dropped entirely when present.
        if 'Healthcare_1' in X.columns:
            X = X.drop(columns='Healthcare_1')

        # LifeSquare: remember which rows were missing, then estimate them as
        # Square - KitchenSquare - 3 where both inputs are known.
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        can_estimate = X['LifeSquare'].isna() \
            & X['Square'].notna() \
            & X['KitchenSquare'].notna()
        X.loc[can_estimate, 'LifeSquare'] = (
            X.loc[can_estimate, 'Square'] - X.loc[can_estimate, 'KitchenSquare'] - 3
        )

        # Whatever is still missing gets the fitted column median.
        X = X.fillna(self.medians)

        return X

In [None]:
# Count the missing values in every column of the training data.
train_df.isna().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     4798
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=21)

# Write each split to its own CSV in the working directory, without the
# index column (test files first, then train files).
for csv_name, split in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    split.to_csv(csv_name, index=None)

# **Выбираем показатели и собираем pipeline**



Выбираем показатели, по которым будем строить модель, и заводим новые показатели, которые могут коррелировать с целевым показателем.

In [None]:
# Columns chosen as model features (the same three are selected by the
# per-feature pipelines built in a later cell).
feature_names = ['Rooms', 'Square', 'KitchenSquare']


# Name of the target column.
target_name = 'Price'

Соберем кусок, ответственный за feature engineering

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks one entry out of its input by key.

    ``transform`` returns ``X[key]`` (single brackets); assuming ``X`` is a
    pandas DataFrame, that is the column itself rather than a one-column frame.
    """
    def __init__(self, key):
        # Stored under the same name as the argument, as BaseEstimator expects.
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured key."""
        column = self.key
        return X[column]

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks one entry out of its input by key.

    ``transform`` returns ``X[[key]]`` (list indexing); assuming ``X`` is a
    pandas DataFrame, the result stays two-dimensional (a one-column frame).
    """
    def __init__(self, key):
        # Stored under the same name as the argument, as BaseEstimator expects.
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by a one-element list holding the key."""
        columns = [self.key]
        return X[columns]

In [None]:
# Build one single-step selector pipeline per feature and wrap each in a
# one-element list of (name, transformer) pairs for the FeatureUnion.
Rooms_t = Pipeline([('selector', NumberSelector(key='Rooms'))])
Rooms_tr = [('Rooms', Rooms_t)]

Square_t = Pipeline([('selector', NumberSelector(key='Square'))])
Square_tr = [('Square', Square_t)]

KitchenSquare_t = Pipeline([('selector', NumberSelector(key='KitchenSquare'))])
KitchenSquare_tr = [('KitchenSquare', KitchenSquare_t)]

# Combine the per-feature outputs, in this order, into one feature matrix.
feats = FeatureUnion([*Rooms_tr, *Square_tr, *KitchenSquare_tr])
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

array([[ 1.        , 43.38079757,  1.        ],
       [ 3.        , 83.5963648 ,  1.        ],
       [ 2.        , 53.61058324,  8.        ],
       ...,
       [ 2.        , 45.05389835,  5.        ],
       [ 1.        , 40.33798661,  8.        ],
       [ 3.        , 63.54359087,  6.        ]])

Добавляем модель в pipeline

In [None]:
%%time
classifier = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression())
])

CPU times: user 15 µs, sys: 3 µs, total: 18 µs
Wall time: 19.6 µs


In [None]:
# Fit the whole pipeline (feature selection + estimator) on the training split.
classifier.fit(X_train, y_train)

ValueError: ignored

In [None]:
# Cross-validation on the training split (6 folds, estimator's default scorer).
cv_scores = cross_val_score(classifier, X_train, y_train, cv=6)
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print ('CV score is {}+-{}'.format(cv_score,cv_score_std))

# Fit the pipeline on the full training split.
classifier.fit(X_train, y_train)
# predict() returns a 1-D array of predictions, so it must not be indexed
# with [:, 1] (that pattern belongs to predict_proba of a classifier).
y_score = classifier.predict(X_test)

ValueError: ignored

In [None]:
# Inspect the steps of the fitted pipeline. The pipeline object built in this
# notebook is named `classifier`; no `pipeline` variable is defined.
classifier.steps

[('features',
  FeatureUnion(transformer_list=[('Rooms',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Rooms'))])),
                                 ('Square',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Square')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('KitchenSquare',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='KitchenSquare')),
                                                  ('standard',
                                                   StandardScaler())]))])),
 ('classifier', LogisticRegression(C=10, random_state=42))]

In [None]:
## Save the model (the whole pipeline)

# The pipeline object built in this notebook is named `classifier`; no
# `pipeline` variable is defined. The output file name is kept unchanged for
# whatever loads it later.
with open("logreg_pipeline.dill", "wb") as f:
    dill.dump(classifier, f)

# **Flask**

In [None]:
!pip install flask-ngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
import json
import os
import urllib.request

from flask import Flask, request, jsonify
from flask import redirect, render_template, url_for
from flask_ngrok import run_with_ngrok

In [None]:
cat /root/.ngrok2/ngrok.yml

cat: /root/.ngrok2/ngrok.yml: No such file or directory


In [None]:
class ClientDataForm(FlaskForm):
    """Web form with the three apartment attributes sent to the model.

    Every field is a required text field.
    """
    # NOTE(review): FlaskForm, StringField and DataRequired are not imported in
    # the visible notebook cells — presumably they come from flask_wtf /
    # wtforms; confirm and add the import.
    Rooms = StringField('Rooms', validators=[DataRequired()])
    Square = StringField('Square', validators=[DataRequired()])
    KitchenSquare = StringField('KitchenSquare', validators=[DataRequired()])

In [None]:
app = Flask(__name__)
app.config.update(
    CSRF_ENABLED=True,
    # Take the signing key from the SECRET_KEY environment variable when it is
    # set; the previous hard-coded value remains only as a fallback so the
    # notebook still runs unchanged. Do not rely on the fallback outside a demo.
    SECRET_KEY=os.environ.get('SECRET_KEY', 'you-will-never-guess'),
)

In [None]:
def get_prediction(Rooms, Square, KitchenSquare, url="http://0.0.0.0:8180/predict"):
    """Ask the prediction service for a price estimate.

    The three values are sent as a JSON object with the keys ``Rooms``,
    ``Square`` and ``KitchenSquare``; the service is expected to answer with a
    JSON object that has a ``predictions`` key.

    Parameters
    ----------
    Rooms, Square, KitchenSquare
        Apartment attributes; any JSON-serialisable values.
    url : str, optional
        Endpoint of the prediction service. Defaults to the address that was
        previously hard-coded.

    Returns
    -------
    The value stored under ``predictions`` in the service's JSON reply.

    Raises
    ------
    urllib.error.URLError
        If the service cannot be reached.
    KeyError
        If the reply has no ``predictions`` key.
    """
    payload = json.dumps({'Rooms': Rooms,
                          'Square': Square,
                          'KitchenSquare': KitchenSquare}).encode('utf-8')  # body must be bytes
    # Content-Length is filled in by urllib for a bytes body, so only the
    # content type has to be set explicitly.
    req = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json; charset=utf-8'},
    )
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())['predictions']

In [None]:
@app.route("/")
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

In [None]:
@app.route('/predicted/<response>')
def predicted(response):
    """Render the result page for a prediction passed in the URL.

    ``response`` is the URL path segment; it is decoded as JSON when possible.
    A segment that is not valid JSON is passed to the template as the raw
    string instead of failing the request.
    """
    try:
        response = json.loads(response)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; keep the raw text.
        pass
    print(response)
    return render_template('predicted.html', response=response)

In [None]:
@app.route('/predict_form', methods=['GET', 'POST'])
def predict_form():
    """Show the input form (GET) or request a prediction and redirect (POST).

    On POST the three form fields are forwarded to ``get_prediction`` and the
    JSON-encoded result is passed to the ``predicted`` view through the URL.
    If the prediction service cannot be reached, a JSON error object is passed
    instead.
    """
    form = ClientDataForm()
    data = dict()
    if request.method == 'POST':
        data['Rooms'] = request.form.get('Rooms')
        data['Square'] = request.form.get('Square')
        data['KitchenSquare'] = request.form.get('KitchenSquare')

        try:
            # json.dumps (not str) so that the `predicted` view, which decodes
            # its argument with json.loads, always receives valid JSON.
            response = json.dumps(get_prediction(data['Rooms'],
                                                 data['Square'],
                                                 data['KitchenSquare']))
            print(response)
        except OSError:
            # urllib reports an unreachable service as URLError, which — like
            # ConnectionError — is an OSError subclass.
            response = json.dumps({"error": "ConnectionError"})
        return redirect(url_for('predicted', response=response))
    return render_template('form.html', form=form)

In [None]:
if __name__ == '__main__':
    # debug=True on a publicly bound host (0.0.0.0) exposes the interactive
    # Werkzeug debugger, which lets anyone who can reach the port run code;
    # keep debug mode off here.
    app.run(host='0.0.0.0', port=8181, debug=False)