# Считаем наши датафреймы

In [130]:
import pandas as pd
import numpy as np

In [131]:
# List the files in the input data directory with human-readable sizes.
ls -lh ../input/

In [205]:
# Read the training and test tables from the input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Keep the test identifiers aside; they are needed to build the submission file.
test_df_id = test_df['Id'].values

# **Уберем ненужные фичужки и сгенерим бинарные из колонки 'features'**

In [133]:
def get_features(feature_list):
    return [feature for feature in feature_list.split('\'') if feature !=', ' and feature != ']' and feature != '[' and feature != '[]']
    

from collections import Counter
# Columns removed from both frames before modelling.
drop_columns = [
    'building_id', 'created', 'description', 'display_address', 'latitude',
    'longitude', 'manager_id', 'photos', 'street_address', 'listing_id',
]

# Drop those columns in place, then convert each raw 'features' string
# into a list of feature names.
train_df.drop(labels=drop_columns, axis='columns', inplace=True)
train_df['features'] = train_df['features'].map(get_features)

test_df.drop(labels=drop_columns, axis='columns', inplace=True)
test_df['features'] = test_df['features'].map(get_features)

In [134]:
# Flatten the per-listing feature lists of the training set into one list.
# Duplicates are kept on purpose: they are what gets counted below.
unique_features = [
    feature_name
    for listing_features in train_df['features']
    for feature_name in listing_features
]

# Names of the 47 most frequent features, most frequent first.
new_features = [name for name, _count in Counter(unique_features).most_common()[:47]]

# Create a zero-filled column in the training frame for each of them.
for new_feature in new_features:
    train_df[new_feature] = 0

In [135]:
def find_feature(features, feature):
    """Return 1 when ``feature`` is contained in ``features``, otherwise 0."""
    return int(feature in features)

def fill_new_features(df, features=None):
    """Add one 0/1 indicator column per feature name to ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'features'`` column whose cells are collections of
        feature names. It is modified in place.
    features : iterable of str, optional
        Feature names to create columns for. When omitted, the notebook-level
        ``new_features`` list is used, which is what existing callers rely on.

    Returns
    -------
    None
    """
    if features is None:
        features = new_features

    # Iterate the 'features' column directly rather than calling
    # DataFrame.apply(axis=1), which constructs a Series for every row just to
    # read a single cell.
    listings = df['features']
    for name in features:
        df[name] = [int(name in listing) for listing in listings]
        
# Populate the indicator columns in both frames (each frame is modified in place).
fill_new_features(train_df)
fill_new_features(test_df)

In [136]:
def add_2_more_features(df):
    """Add two price-per-room ratio columns to ``df`` in place.

    Creates ``'bedroom price'`` and ``'bathroom price'`` as ``price`` divided
    by the respective room count plus 0.1; the offset keeps the denominator
    non-zero when the count is 0.
    """
    price = df['price']
    ratio_columns = (
        ('bedroom price', 'bedrooms'),
        ('bathroom price', 'bathrooms'),
    )
    for target_column, count_column in ratio_columns:
        df[target_column] = price / (df[count_column] + 0.1)

# Add the ratio columns, then discard the list-valued 'features' column,
# which has already been expanded into indicator columns.
add_2_more_features(train_df)
train_df.drop(labels=['features'], axis='columns', inplace=True)

add_2_more_features(test_df)
test_df.drop(labels=['features'], axis='columns', inplace=True)

In [137]:
# Display summary statistics for the processed training frame.
train_df.describe()

# Выносим отдельно переменную Target

In [138]:
# Separate the label column from the model inputs.
y_train = train_df['TARGET']
X_train = train_df.drop(labels=['TARGET'], axis='columns')

# The test frame is used as-is for prediction.
X_test = test_df

In [139]:
def label_to_num(label):
    """Encode a label as an integer: 'low' -> 0, 'medium' -> 1, anything else -> 2."""
    return 0 if label == 'low' else (1 if label == 'medium' else 2)

# Replace the string labels with their integer codes and keep them as a NumPy array.
y_train = y_train.map(label_to_num).values

# Фитаем регрессию

In [140]:
from sklearn.linear_model import LogisticRegression

In [141]:
# One-vs-rest logistic regression with an L2 penalty and explicit per-class weights.
lg = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='ovr',
    class_weight={0: 0.33, 1: 1., 2: 2.9},
)

In [145]:
from sklearn.model_selection import cross_val_score

# Baseline: mean accuracy over 5 cross-validation folds on the original feature set.
cross_val_score(
    lg,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
).mean()

# **Заюзаем SFS для лучших результатов**

In [204]:
from mlxtend.feature_selection import SequentialFeatureSelector

# Sequential feature selection with forward=False, reducing the inputs to 45
# features and scoring candidate subsets by accuracy.
selector = SequentialFeatureSelector(
    LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        multi_class='ovr',
        class_weight={0: 0.33, 1: 1., 2: 2.9},
    ),
    k_features=45,
    forward=False,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

selector.fit(X_train, y_train)

In [207]:
# Names of the columns retained by the fitted selector.
train_features = selector.k_feature_names_

In [217]:
# Mean 5-fold cross-validated accuracy using only the columns kept by the selector.
cross_val_score(
    LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        multi_class='ovr',
        class_weight={0: 0.33, 1: 1., 2: 2.9},
        max_iter=1000,
    ),
    X_train[list(train_features)],
    y_train,
    cv=5,
    scoring='accuracy',
).mean()

**Вот такой результат дал SFS. Теперь проверим, что он не ограничил фичи до того, чтобы просто выдать больший класс *low***.

In [244]:
# Positions of the rows to keep: every row whose label code is above 0,
# plus any row located within the first 1000 positions.
indexes = [
    position
    for position, code in enumerate(y_train)
    if code > 0 or position < 1000
]

# Mean 5-fold cross-validated accuracy on that subset, selected columns only.
cross_val_score(
    LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        multi_class='ovr',
        class_weight={0: 0.33, 1: 1., 2: 2.9},
        max_iter=1000,
    ),
    X_train[list(train_features)].iloc[indexes],
    y_train[indexes],
    cv=5,
    scoring='accuracy',
).mean()

In [None]:
def num_to_label(num):
    """Decode an integer class code: 0 -> 'low', 1 -> 'medium', anything else -> 'high'."""
    return 'low' if num == 0 else ('medium' if num == 1 else 'high')

# Train the final model on the selected columns and predict the test labels.
# max_iter=1000 mirrors the estimator used in the cross-validation cells for the
# selected features, so the model that produces the submission is configured
# like the one that was evaluated.
lg = LogisticRegression(multi_class='ovr', solver='lbfgs', penalty='l2',
                        class_weight={0: 0.33, 2: 2.9, 1: 1.}, max_iter=1000)
lg.fit(X_train[list(train_features)], y_train)
y_pred = lg.predict(X_test[list(train_features)])

# Convert the numeric class codes back to their string labels.
y_pred = np.array([num_to_label(code) for code in y_pred])

In [245]:
from collections import Counter
# Show how many test rows received each predicted label, most frequent first.
Counter(y_pred).most_common()

# Делаем сабмит

In [248]:
# Assemble the submission table (test identifiers with predicted labels)
# and write it to CSV without the index column.
submit = pd.DataFrame({'Id': test_df_id, 'TARGET': y_pred})
submit.to_csv("submit.csv", index=False)

In [254]:
# Number of submission rows whose 'Id' value equals their 'TARGET' value.
# NOTE(review): this compares the identifier column with the predicted labels;
# confirm that this is the intended correctness check.
correct_labels = len(submit.loc[submit['Id'] == submit['TARGET']])

In [257]:
# Report correct_labels as a percentage of the number of rows in the test frame.
print(f'Точность на тесте: {correct_labels / test_df.shape[0] * 100} %')