In [1]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
# Colab-only: the google.colab package is not available in a regular Jupyter install.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import preprocessing 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import seaborn as sns
from tqdm import tqdm
from IPython.display import display

# pandas display options
# Option names are spelled out in full: pandas resolves abbreviated names by
# pattern matching, which can stop working if a future release adds another
# option whose name contains the same text.
pd.set_option('display.max_columns', 150)  # show up to 150 columns before truncating
pd.set_option('display.max_rows', 300)  # show up to 300 rows before truncating
pd.set_option('display.float_format', '{:,.2f}'.format)  # thousands separator, 2 decimals

In [4]:
# Base folder holding the macro/train/test CSVs.
# Colab: the mounted Google Drive folder below; for a local run use e.g. "../data".
path = "/content/drive/MyDrive/CZ4041/Dataset/"

# Full path of each CSV file.
macro = os.path.join(path, "macro.csv")
train = os.path.join(path, "train.csv")
test = os.path.join(path, "test.csv")

# Dataset name -> CSV path.
datasets = {'macro': macro, 'train': train, 'test': test}

# Load every CSV into a DataFrame, keyed by dataset name.
# The loop variable is deliberately not called `path`: reusing that name would
# overwrite the base folder above with the path of the last CSV read.
dfs = {}
for dataset_name, csv_path in datasets.items():
    df = pd.read_csv(csv_path)
    dfs[dataset_name] = df

# Individual handles for when a single frame is needed.
df_macro = dfs['macro']
df_train = dfs['train']
df_test = dfs['test']

In [7]:
# Pairwise correlation matrix of the numeric/bool columns of the training frame.
# The frame is narrowed explicitly because pandas >= 2.0 no longer drops
# non-numeric columns inside corr() and raises if any are present.
corr_vals = df_train.select_dtypes(include=['number', 'bool']).corr()

# Absolute correlation of every column with the target column.
target_corr = corr_vals["price_doc"].abs()

# Single source of truth for the cut-off, so the filter and the printed
# message cannot drift apart.
corr_threshold = 0.07

# NOTE(review): despite the name, this selects the columns whose |corr| with
# price_doc is AT MOST the threshold (the weakly correlated ones) - confirm
# that this is the intended selection.
relevant_features = target_corr[target_corr <= corr_threshold]
print("Number of relevant features with corr <= {}: {}".format(corr_threshold, len(relevant_features)))
print(relevant_features)

Number of relevant features with corr <= 0.07: 68
material                                0.06
build_year                              0.00
kitch_sq                                0.03
school_quota                            0.01
additional_education_raion              0.06
culture_objects_top_25_raion            0.04
full_all                                0.03
male_f                                  0.03
female_f                                0.02
16_29_all                               0.02
16_29_male                              0.02
16_29_female                            0.02
raion_build_count_with_material_info    0.06
build_count_block                       0.03
build_count_wood                        0.04
build_count_frame                       0.03
build_count_panel                       0.02
build_count_foam                        0.01
build_count_slag                        0.02
build_count_mix                         0.03
raion_build_count_with_builddate_info   0.06
build