In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Domain Background

Although the housing market is relatively stable in Russia, the country’s volatile economy makes forecasting prices as a function of apartment characteristics a unique challenge. Complex interactions between housing features such as a number of bedrooms and location are enough to make pricing predictions complicated. Adding an unstable economy to the mix means Sberbank and their customers need more than simple regression models in their arsenal.

# Datasets and Inputs

The basis for the investigation is a large number of economic indicators for pricing and prices themselves (train.csv and test.csv). Macroeconomic variables are collected in a separate file for transaction dates (macro.csv). In addition, the detailed description of variables is provided (data_dictionary.txt).

Due to the large number of features, we have chosen to analyze the following independent variables:

1. the dollar rate, which traditionally affects the Russian real estate market;
2. the distance in km from the Kremlin (the closer to the center of the city, the more expensive);
3. indicators characterizing the availability of urban infrastructure nearby (schools, medical and sports centers, supermarkets, etc.) ;
4. indicators of a particular living space (number of rooms, floor, etc.);
5. proximity to transport nodes (for example, to the metro);
6. indicators of population density and employment in the region of housing accommodation.

All these economic indicators have a strong influence on price formation and can be used as a basic set for regression analysis. 
Examples of numerical variables: the distance to the metro, the distance to the school, the dollar rate at the transaction moment, the area of the living space. 
Examples of categorical variables: neighborhoods, the nearest metro station, the number of rooms.


In [None]:
# Hand-picked candidate columns, kept in two lists that are later
# concatenated into the initial feature selection.
X_list_num = [
    'full_sq',
    'num_room',
    'area_m',
    # columns whose names end in _km / _min_walk / _walk_min / _avto
    'kremlin_km',
    'big_road2_km',
    'big_road1_km',
    'workplaces_km',
    'stadium_km',
    'swim_pool_km',
    'fitness_km',
    'detention_facility_km',
    'cemetery_km',
    'radiation_km',
    'oil_chemistry_km',
    'theater_km',
    'exhibition_km',
    'museum_km',
    'park_km',
    'public_healthcare_km',
    'metro_min_walk',
    'metro_km_avto',
    'bus_terminal_avto_km',
    'public_transport_station_min_walk',
    'railroad_station_walk_min',
    'railroad_station_avto_km',
    'kindergarten_km',
    'school_km',
    'preschool_km',
    'university_km',
    'additional_education_km',
    'shopping_centers_km',
    'big_market_km',
    # remaining columns (population / raion-level names)
    'ekder_all',
    'work_all',
    'young_all',
    'ID_metro',
    'office_raion',
    'sport_objects_raion',
    'raion_popul',
    'healthcare_centers_raion',
    'school_education_centers_raion',
    'preschool_education_centers_raion',
]

X_list_cat = [
    'sub_area',
    'ecology',
    'big_market_raion',
    'railroad_terminal_raion',
    'timestamp',
    'product_type',
]

In [None]:
# Unpack each competition archive into the working directory. The context
# manager closes the zip handle as soon as the CSV member has been extracted.
input_dir = '../input/sberbank-russian-housing-market'
for csv_name in ('train.csv', 'test.csv', 'macro.csv'):
    with zipfile.ZipFile(f'{input_dir}/{csv_name}.zip', 'r') as archive:
        archive.extract(csv_name)

# train/test: split on commas and swallow the whitespace around them.
# A regex separator is only supported by the python parser engine, so it is
# requested explicitly; the raw string keeps `\s` from being parsed as a
# (invalid) string escape.
train = pd.read_csv('./train.csv', sep=r'\s*,\s*', engine='python')
test = pd.read_csv('./test.csv', sep=r'\s*,\s*', engine='python')
macro_train = pd.read_csv('./macro.csv')

In [None]:
# Display the DataFrame that was read from train.csv.
train

In [None]:
# Display the DataFrame that was read from macro.csv.
macro_train

# Analysis of Macro.csv 

## Check NaN values in Macro.csv

In [None]:
# Names of the macro columns whose dtype is not numeric.
cat_features_macro = [
    column_name
    for column_name in macro_train.columns
    if not is_numeric_dtype(macro_train[column_name])
]
cat_features_macro

In [None]:
# Treat empty strings as missing values so they are counted as NaN below.
macro_train = macro_train.replace('', np.nan)

# Per-column NaN counts, computed once for the whole frame.
macro_na_counts = macro_train.isnull().sum()

# (column, NaN count) for every column that has at least one missing value.
empty_col_list = [(col, count) for col, count in macro_na_counts.items() if count > 0]

# Drop the columns that have more than 658 missing values.
# NOTE(review): this used to be described as "more than 30% blank", but the
# cut-off is a fixed row count -- confirm it against len(macro_train).
macro_train = macro_train.drop(columns=macro_na_counts[macro_na_counts > 658].index)

# Remember the surviving macro columns; they are re-selected after the merge.
macro_list = list(macro_train.columns)

# Drop rows in which every value is missing. `thresh` must not be passed
# together with `how`: newer pandas releases raise TypeError when both
# arguments are supplied, even if `thresh` is None.
macro_train = macro_train.dropna(axis=0, how='all')
# Keep only the rows that have at least 44 non-missing values.
macro_train = macro_train.dropna(axis=0, thresh=44)

# Attach the macro indicators to every transaction by date, then keep only
# the macro columns plus the target (price_doc stays the last column).
macro_train = train.merge(macro_train, how='left', on='timestamp')
macro_train = macro_train[macro_list + ["price_doc"]]
macro_train


In [None]:
# Pearson correlation of every numeric macro column with the target.
# Non-numeric columns (e.g. the timestamp string) are excluded explicitly,
# because pandas 2.x no longer drops them silently inside corr().
pearson_macro = macro_train.select_dtypes(include=["number", "bool"]).corr(method='pearson')

# Correlation of each feature with price_doc, without price_doc itself.
macro_corr_with_prices = pearson_macro["price_doc"].drop("price_doc")

# Show the correlations ordered by absolute value, strongest first.
# Sorting by label avoids indexing a label-indexed Series with integer
# positions (deprecated) and places NaN correlations at the end.
macro_corr_with_prices.loc[macro_corr_with_prices.abs().sort_values(ascending=False).index]

In [None]:
# Names of the 32 macro features with the largest absolute correlation with
# price_doc, strongest first. Sorting by label (instead of feeding argsort
# positions back into the Series) keeps NaN correlations out of the top.
top32_macro_features = (
    macro_corr_with_prices.abs()
    .sort_values(ascending=False)
    .index[:32]
    .tolist()
)
print('The most correlated with prices:\n', top32_macro_features)

In [None]:
# Correlation values of the 32 macro features most correlated (in absolute
# value) with price_doc, strongest first. Label-based selection replaces the
# deprecated positional indexing with argsort output.
macro_corr_with_prices.loc[macro_corr_with_prices.abs().sort_values(ascending=False).index[:32]]

In [None]:
# One-row frame holding the NaN count of each selected macro feature.
# Built straight from the counts so it adapts to the length of the feature
# list (no hard-coded reshape) and keeps plain, single-level column labels.
macro_train_na_data = macro_train[top32_macro_features].isnull().sum().to_frame().T

# Show only the features that actually contain missing values.
macro_train_na_data.loc[:, (macro_train_na_data != 0).all()]

In [None]:
# Drop the variables with too many NaN values and the categorical ones,
# which do not look useful.
drop_features = [
    'incidence_population',
    'unprofitable_enterpr_share',
    'profitable_enterpr_share',
    'fin_res_per_cap',
    'construction_value',
    'grp',
    'provision_doctors',
]

# Keep the top-32 features plus target and date, then remove the rejected ones.
macro_columns_kept = top32_macro_features + ["price_doc", "timestamp"]
macro_train = macro_train.loc[:, macro_columns_kept].drop(columns=drop_features)
macro_train

## Correlation Map of the features used in macro.csv

In [None]:
# Display the correlation matrix of the features used in macro.
# The matrix is computed once (instead of three times) and only on numeric
# columns, since the frame also carries the timestamp string column.
macro_corr = macro_train.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(18, 12))
sns.heatmap(macro_corr, cmap="mako",
            xticklabels=macro_corr.columns.values,
            yticklabels=macro_corr.columns.values)
plt.title("Correlation Matrix (Features used in Macro) ", fontsize=20);

## Numeric exploration of the features

In [None]:
# Summary statistics (one column per numeric feature) for the macro frame.
row = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

numeric_macro_columns = [
    name for name in macro_train.columns if is_numeric_dtype(macro_train[name])
]
numeric_explore_macro = pd.DataFrame(
    {name: macro_train[name].describe().reindex(row) for name in numeric_macro_columns},
    index=row,
)

numeric_explore_macro

# Analysis of Train.csv

## Analysis of target variable (price) in Train data

In [None]:
# Summary statistics of the target column price_doc in the training frame.
train['price_doc'].describe()

In [None]:
# Two side-by-side density histograms of the target: raw prices on the left,
# log-prices on the right.
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))

# Left panel. Title and labels are set on ax1 explicitly: plt.title() would
# act on the most recently created axes (the right-hand one).
# histplot(kde=True, stat='density') replaces the deprecated distplot.
sns.histplot(train['price_doc'].values, bins=100, kde=True, stat='density',
             color='darkblue', ax=ax1)
ax1.set_title('Distribution plot of price_doc', fontsize=18)
ax1.set_xlabel("Prices")
ax1.set_ylabel("Distribution")

# Right panel: same plot on the natural logarithm of the prices.
sns.histplot(np.log(train['price_doc'].values), bins=100, kde=True, stat='density',
             color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")
ax2.set_ylabel("Distribution")

plt.suptitle('Sberbank Russian Housing Data');

In [None]:
# Print a small table of descriptive statistics for the target column.
prices = train['price_doc']

print("Sberbank Russian Housing Dataset Statistics: \n")

price_summary = [
    ("Number of houses = ", len(prices)),
    ("Minimum house price = ", np.min(prices)),
    ("Maximum house price = ", np.max(prices)),
    ("Mean house price = ", format(np.mean(prices), ".2f")),
    ("Median house price = ", format(np.median(prices), ".2f")),
    ("Standard deviation of house prices =", format(np.std(prices), ".2f")),
]
for label, value in price_summary:
    print(label, value)

## Correlation map

In [None]:
# Display the correlation matrix of all numeric features.
# The matrix is computed once (it is expensive on the full frame) and only on
# numeric columns, because pandas 2.x raises on string columns inside corr().
train_corr = train.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(20, 20))
sns.heatmap(train_corr, cmap='viridis',
            xticklabels=train_corr.columns.values,
            yticklabels=train_corr.columns.values)
plt.title("Correlation Matrix (All Features)", fontsize=20);

In [None]:
# Pearson correlation of every numeric training column with the target.
# Non-numeric columns are excluded explicitly, because pandas 2.x no longer
# drops them silently inside corr().
pearson = train.select_dtypes(include=["number", "bool"]).corr(method='pearson')

# Correlation of each feature with price_doc, without price_doc itself.
corr_with_prices = pearson["price_doc"].drop("price_doc")

# Show the correlations ordered by absolute value, strongest first.
# Sorting by label avoids indexing a label-indexed Series with integer
# positions (deprecated) and places NaN correlations at the end.
corr_with_prices.loc[corr_with_prices.abs().sort_values(ascending=False).index]

In [None]:
# Names of the 50 features with the largest absolute correlation with
# price_doc, strongest first. Sorting by label (instead of feeding argsort
# positions back into the Series) keeps NaN correlations out of the top.
top50_features = (
    corr_with_prices.abs()
    .sort_values(ascending=False)
    .index[:50]
    .tolist()
)
print('The most correlated with prices:\n', top50_features)


In [None]:
# Display the correlation matrix of the top 50 features plus the target.
# The matrix is computed once and reused for the data and both tick labels.
top50_corr = train[top50_features + ["price_doc"]].corr()

plt.figure(figsize=(18, 12))
sns.heatmap(top50_corr, cmap="mako",
            xticklabels=top50_corr.columns.values,
            yticklabels=top50_corr.columns.values)
plt.title("Correlation Matrix (Top 50) ", fontsize=20);

In [None]:
# Print the correlation of every hand-picked feature that is not already in
# the top 50, and discard the ones whose absolute correlation is below 0.1.
selected_features = X_list_num + X_list_cat

# NOTE(review): elements are removed from the list while it is being
# iterated, so the entry that follows each removed feature is never visited
# (neither printed nor filtered). The iteration is left as it was because
# changing which features survive changes len(features_used), which later
# cells may depend on -- fix both together.
for x in selected_features:
    if x in top50_features:
        continue
    try:
        corr_value = corr_with_prices[x]
    except KeyError:
        # No correlation is available for this column (it is not part of the
        # correlation table, e.g. a non-numeric column): keep it as is.
        continue
    print(f"{x}: {corr_value}")
    if abs(corr_value) < 0.1:
        selected_features.remove(x)

As seen above, although these features are not among the top 50, their correlation with the price is still relatively high. Thus we shall consider all features that have an absolute correlation of more than 0.1 in our model.

In [None]:
# Union of the hand-picked and the top-50 features without duplicates.
# dict.fromkeys keeps first-seen order, so the column order is the same on
# every run (a set would reorder the names between kernel sessions).
features_used = list(dict.fromkeys(selected_features + top50_features))
print(len(features_used))
features_used

In [None]:
# Display the correlation matrix of features_used plus the target.
# features_used also contains non-numeric columns, so the numeric ones are
# selected explicitly (pandas 2.x raises on strings inside corr()); the
# matrix is computed once and reused for the tick labels.
features_used_corr = (
    train[features_used + ["price_doc"]]
    .select_dtypes(include=["number", "bool"])
    .corr()
)

plt.figure(figsize=(18, 12))
sns.heatmap(features_used_corr, cmap="magma",
            xticklabels=features_used_corr.columns.values,
            yticklabels=features_used_corr.columns.values)
plt.title("Correlation Matrix (Features Used)", fontsize=20);

## Determine amount of NA values in selected features

In [None]:
# One-row frame holding the NaN count of each used feature in the training
# data. Built straight from the counts so it follows len(features_used)
# (no hard-coded reshape) and keeps plain, single-level column labels.
train_na_data = train[features_used].isnull().sum().to_frame().T

# Show only the features that actually contain missing values.
train_na_data.loc[:, (train_na_data != 0).all()]

In [None]:
# One-row frame holding the NaN count of each used feature in the test data.
# Built straight from the counts so it follows len(features_used)
# (no hard-coded reshape) and keeps plain, single-level column labels.
test_na_data = test[features_used].isnull().sum().to_frame().T

# Show only the features that actually contain missing values.
test_na_data.loc[:, (test_na_data != 0).all()]

## Data Exploration of numerical features used

In [None]:
# Summary statistics (one column per numeric feature) for features_used.
row = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

numeric_features_used = [
    name for name in features_used if is_numeric_dtype(train[name])
]
numeric_explore = pd.DataFrame(
    {name: train[name].describe().reindex(row) for name in numeric_features_used},
    index=row,
)

numeric_explore

## Data Exploration of categorical feature used

### Feature: Timestamp

In [None]:
# Derive integer date features by slicing the timestamp strings:
# characters 0-3 followed by characters 5-6 -> year_month,
# characters 5-6 alone -> month.
timestamp_text = train['timestamp']
month_text = timestamp_text.str[5:7]

train['year_month'] = (timestamp_text.str[:4] + month_text).astype(int)
train['month'] = month_text.astype(int)

# The raw timestamp column is removed from the frame in place.
train.drop(columns='timestamp', inplace=True)

In [None]:
# Bar plot of price_doc per year_month (seaborn's default estimator).
# Only one title is set: the earlier 'Prices vs year_month' title was
# overwritten immediately and never shown.
plt.figure(figsize=(10, 10))
sns.barplot(x='year_month', y='price_doc', data=train)
plt.xticks(rotation='vertical')
plt.title("Effect of year_month on prices", fontsize=20);

As seen, Russian housing prices show an increasing trend over the years.

In [None]:
# Bar plot of price_doc per calendar month, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=train, x='month', y='price_doc', ax=ax)
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_title("Effect of month on prices", fontsize=20);

In the second half of the year (and especially in October and November) prices are lower.

### Feature: sub_area

In [None]:
# Horizontal bar plot of the median price_doc for every sub_area,
# drawn on an explicit (tall) Axes.
fig, ax = plt.subplots(figsize=(10, 30))
sns.barplot(data=train, x='price_doc', y='sub_area', orient='h',
            palette='light:#5A9', estimator=np.median, ax=ax)
ax.set_title('Prices depending on sub-area', fontsize=18)

Housing prices vary greatly depending on the sub-area.

### Features: big_market_raion, railroad_terminal_raion, ecology, product_type

In [None]:
# Count plots for four categorical features, laid out on a 2 x 2 grid
# (filled row by row in the order of categorical_feats).
categorical_feats = ['ecology', 'big_market_raion', 'railroad_terminal_raion', 'product_type']
nr_rows, nr_cols = 2, 2

fig, axs = plt.subplots(nr_rows, nr_cols, figsize=(nr_cols * 5, nr_rows * 3))

for panel, feature_name in zip(axs.ravel(), categorical_feats):
    sns.countplot(x=feature_name, data=train, ax=panel)

fig.suptitle('Countplots for categorical features', y=1.02, fontsize=18)
plt.tight_layout()

Most of the selected categorical features are binary in nature. For the features big_market_raion and railroad_terminal_raion, the data is heavily biased towards one class.