# <center>Import libraries</center>

In [None]:
import os
import zipfile
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
from subprocess import check_output
import warnings 
from IPython.display import display
from pandas.api.types import CategoricalDtype
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
# Suppress every warning category for the rest of the session.
# Note that this also hides deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')
print('Setup Complet!')

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old spellings, so use whichever name this install provides
# (falling back to the default style if neither exists).
whitegrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    'default',
)
plt.style.use(whitegrid_style)

# Figure-wide defaults: automatic tight layout, bold and slightly larger
# axis labels and titles.
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=14,
    titlepad=10,
)

print('Setup Complet!')

# <center>Data Cleaning</center>

In [None]:
# Show the files shipped in the competition's input directory.
input_listing = check_output(["ls", "../input/sberbank-russian-housing-market"])
print(input_listing.decode("utf8"))

In [None]:
# Unpack the zipped CSV for the chosen split into the working directory.
dataset = 'train'
archive_path = f'../input/sberbank-russian-housing-market/{dataset}.csv.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('.')

In [None]:
# Confirm that the extracted CSV is present in the working directory.
extracted_listing = check_output(["ls", "train.csv"])
print(extracted_listing.decode("utf8"))

In [None]:
# Load the extracted training set into `df` and preview its first rows.
df = pd.read_csv('./train.csv')
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Print the frame summary (index range, column dtypes, memory usage).
df.info()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
# Report (rows, columns) followed by the total number of missing cells.
total_missing = df.isnull().sum().sum()
print(df.shape)
print(total_missing)

In [None]:
# Columns with more than 4000 missing values, listed with their null counts
# (largest first). The nulls are counted once and reused for both steps.
null_counts = df.isnull().sum()
col_with_nan = list(null_counts[null_counts > 4000].index)
null_counts[col_with_nan].sort_values(ascending=False)

In [None]:
# Top 50 features by Pearson correlation with the target 'price_doc'.
# Restrict to numeric columns first: the frame still holds string columns,
# and DataFrame.corr() raises on them in pandas >= 2.0 instead of silently
# skipping them.
corr_features = (
    df.select_dtypes('number')
    .corr()['price_doc']
    .sort_values(ascending=False)
    .head(50)
)
corr_features

In [None]:
# Correlation heatmap of the sparse columns together with the target.
plt.figure(figsize=(18, 10))
sparse_corr = df[col_with_nan + ['price_doc']].corr()
sns.heatmap(sparse_corr);

In [None]:
# Drop every column flagged in `col_with_nan` except 'num_room'.
droped_col = [name for name in col_with_nan if name != 'num_room']
df.drop(columns=droped_col, inplace=True)

In [None]:
# Number of columns removed by the drop above.
print(len(droped_col))

In [None]:
# The sparsest features have been dropped. Repeat the scan on what is left,
# this time flagging every column that still has at least one missing value,
# and list those columns with their null counts (largest first).
null_counts = df.isnull().sum()
col_with_nan = list(null_counts[null_counts > 0].index)
null_counts[col_with_nan].sort_values(ascending=False)

In [None]:
# Annotated correlation heatmap of the remaining null-bearing columns
# together with the target.
plt.figure(figsize=(20, 12))
remaining_corr = df[col_with_nan + ['price_doc']].corr()
sns.heatmap(remaining_corr, annot=True);

In [None]:
# Drop every remaining null-bearing column except 'num_room'.
droped_col = [name for name in col_with_nan if name != 'num_room']
df.drop(columns=droped_col, inplace=True)

In [None]:
# Preview the frame after the column drops.
df.head()

In [None]:
# Frame summary after the column drops.
df.info()

In [None]:
# With the null-bearing columns gone, look at the remaining features.
# Start with the categorical ones: every column stored with object dtype.
column_dtypes = df.dtypes
cat_vars = column_dtypes[column_dtypes == 'O'].index.tolist()
print(cat_vars)

In [None]:
# Display only the object-dtype columns.
df[cat_vars]

In [None]:
## we don't need the id and times
# Print whether every 'id' is distinct. A bare comparison in the middle of a
# cell is evaluated and thrown away, so the result has to be printed to be seen.
print(df['id'].nunique() == df.shape[0])
df.drop(columns=['id', 'timestamp'], inplace=True)

In [None]:
# Distinct raw values of 'sub_area'.
df.sub_area.unique()

In [None]:
# Normalise 'sub_area': trim surrounding whitespace, then lower-case,
# and show the resulting distinct values.
df['sub_area'] = df['sub_area'].str.strip().str.lower()
df['sub_area'].unique()

In [None]:
# Distinct values of 'product_type'.
df.product_type.unique()

In [None]:
# Distinct values of 'ecology'.
df.ecology.unique()

In [None]:
# Recompute the list of object-dtype columns on the current frame.
column_dtypes = df.dtypes
cat_vars = column_dtypes[column_dtypes == 'O'].index.tolist()

In [None]:
def check_unique_values(var):
    """Print the distinct values of column ``var`` of the global ``df``.

    The columns 'ecology', 'product_type' and 'sub_area' are skipped:
    nothing is printed for them.
    """
    if var in ('ecology', 'product_type', 'sub_area'):
        return
    print(df[var].unique())


for var in cat_vars:
    check_unique_values(var)

In [None]:
# The categorical values show no inconsistent data entry.
# Next, collect the numerical variables: every column whose dtype is not object.
column_dtypes = df.dtypes
num_vars = column_dtypes[column_dtypes != 'O'].index.tolist()
len(num_vars)
# That is a large number of numerical features 🥶

In [None]:
# Nominal variables;
nom_fea = ['sub_area']
df[nom_fea] = df[nom_fea].astype('category')

# Ordinal variables: each feature maps to its levels, lowest first.
# The yes/no flags all share one two-level scale, so they are generated from a
# single list. Typing ['no' 'yes'] by hand without the comma makes Python fuse
# the adjacent literals into the single level 'noyes', which turns every value
# of that column into NaN when it is cast.
binary_fea = [
    'culture_objects_top_25',
    'thermal_power_plant_raion',
    'incineration_raion',
    'oil_chemistry_raion',
    'radiation_raion',
    'railroad_terminal_raion',
    'big_market_raion',
    'nuclear_reactor_raion',
    'detention_facility_raion',
    'water_1line',
    'big_road1_1line',
    'railroad_1line',
]
orderd_levels = {
    # NOTE(review): the raw column is expected to spell its missing level
    # 'no data' -- confirm against df.ecology.unique().
    'ecology': ['no data', 'poor', 'satisfactory', 'good', 'excellent'],
    'product_type': ['Investment', 'OwnerOccupier'],
    **{flag: ['no', 'yes'] for flag in binary_fea},
}
# Prepend an explicit 'None' level so it takes the lowest position on every scale.
orderd_levels = {key: ['None'] + value for key, value in orderd_levels.items()}


# encoding processing
def encode(df):
    """Cast every feature listed in ``orderd_levels`` to an ordered categorical.

    Args:
        df: DataFrame containing all columns named in ``orderd_levels``.
            It is modified in place.

    Returns:
        The same DataFrame. Values that are not among a feature's declared
        levels become NaN.
    """
    for name, levels in orderd_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))
    return df


df = encode(df)

In [None]:
# Replace the remaining missing values in every numeric column with 0.
for numeric_col in df.select_dtypes('number').columns:
    df[numeric_col] = df[numeric_col].fillna(0)

# <center>Evaluate Dataset, RMSLE</center>

In [None]:
# Baseline: 5-fold cross-validated RMSLE of a default XGBoost regressor.
X = df.copy()
y = X.pop('price_doc')
model = XGBRegressor()

# Label-encode the categorical columns with their integer category codes
# (pandas assigns -1 to missing values). This is not one-hot encoding.
for col in X.select_dtypes('category'):
    X[col] = X[col].cat.codes

# RMSLE is defined on log(1 + y), so fit on the log1p-transformed target.
# The RMSE of those predictions is then the RMSLE on the original scale.
log_y = np.log1p(y)
score = cross_val_score(
    model, X, log_y, cv=5, scoring='neg_mean_squared_error'
)
# scikit-learn reports the negated MSE; flip the sign before the square root.
score = -1 * score.mean()
baseline_score = np.sqrt(score)
print(f'Baseline score: {baseline_score:.5f} RMSLE')

# <center>Coming Soon..........</center>