# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np
import datatable as dt

# Sizes used by the optional random-subsample read (currently disabled, see below).
# NOTE(review): presumably the approximate row count of train.csv and the
# desired sample size — confirm against the actual file.
nlinesfile = 3_140_000
nlinesrandomsample = 300_000

# Row numbers to skip when sub-sampling. Candidates start at 1, so line 0 is
# never selected. No seed is set, so the selection differs between runs.
lines2skip = np.random.choice(
    np.arange(1, nlinesfile + 1),
    size=nlinesfile - nlinesrandomsample,
    replace=False,
)

# Disabled alternative: read only a random sample of train.csv with
# pd.read_csv(<train.csv path>, skiprows=lines2skip).
# The full training file is read through datatable and converted to pandas.
train = dt.fread('../input/ubiquant-market-prediction/train.csv').to_pandas()
test = pd.read_csv('../input/ubiquant-market-prediction/example_test.csv')
sub = pd.read_csv('../input/ubiquant-market-prediction/example_sample_submission.csv')

# Rich preview of the three frames, in load order.
display(train, test, sub)

In [None]:
# Per-column count of missing values; only columns with at least one are shown.
None_values = train.isna().sum()
None_values.loc[None_values > 0]

In [None]:
# Number of distinct values in each identifier column.
print('row_id unique len: ', train['row_id'].unique().size)
print('time_id unique len: ', train['time_id'].unique().size)
print('investment_id unique len: ', train['investment_id'].unique().size)

In [None]:
# Cast both id columns to the default integer dtype, one column at a time so
# the frame itself is never copied.
for _id_col in ('time_id', 'investment_id'):
    train[_id_col] = train[_id_col].astype('int')

In [None]:
# Split the columns into numeric and non-numeric groups.
#
# The dtype comparison uses the builtin ``int`` / ``float``. The former
# ``np.int`` / ``np.float`` were plain aliases of these builtins and were
# removed in NumPy 1.24, where they raise AttributeError.
# The match is exact: only columns whose dtype equals the default integer or
# float dtype are picked up; any other dtype ends up in CAT_FEATURES.
NUM_FEATURES = list(train.loc[:, train.dtypes == int].columns)
NUM_FEATURES_2 = list(train.loc[:, train.dtypes == float].columns)
NUM_FEATURES.extend(NUM_FEATURES_2)

FEATURES = list(train.columns)
CAT_FEATURES = [feature for feature in FEATURES if feature not in NUM_FEATURES]

# 'target' is the label, not a feature; 'row_id' is an identifier and becomes
# the first entry of the ignore list instead of a categorical feature.
NUM_FEATURES.remove('target')
CAT_FEATURES.remove('row_id')
IGNORE_COLS = ['row_id']
print(CAT_FEATURES)
print(NUM_FEATURES)

# Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silences every Python warning from this point on, for the whole session.
warnings.simplefilter("ignore")

# Distribution of the target over 100 bins.
fig, ax = plt.subplots(figsize=(24, 8))
sns.distplot(train['target'], bins=100, label='Target', ax=ax)
ax.legend();

In [None]:
# Distribution of time_id over 100 bins.
fig, ax = plt.subplots(figsize=(24, 8))
sns.distplot(train['time_id'], bins=100, label='time_id', ax=ax)

# Full-height red guides: dashed at 320 and 440, solid at 370, with a
# question-mark annotation placed just right of the solid line.
ax.axvline(x=320, ymin=0, ymax=1, linestyle='--', color='red')
ax.axvline(x=370, ymin=0, ymax=1, color='red')
ax.axvline(x=440, ymin=0, ymax=1, linestyle='--', color='red')
ax.text(x=371, y=0.001, s='?????', fontsize=18, color='r')
ax.legend();

In [None]:
# Distribution of investment_id over 100 bins.
fig, ax = plt.subplots(figsize=(24, 8))
sns.distplot(train['investment_id'], bins=100, label='investment_id', ax=ax)
ax.legend();

# P-Values

In [None]:
from scipy.stats import pearsonr

# p-value of the Pearson correlation between the target and every numeric
# feature, rounded to 4 decimals (element [1] of pearsonr's result).
p_values_list = [
    round(pearsonr(train.loc[:, 'target'], train.loc[:, c])[1], 4)
    for c in NUM_FEATURES
]

# One row per feature, single column named 'target'.
p_values_df = pd.DataFrame(data=p_values_list, index=NUM_FEATURES, columns=['target'])
def p_value_warning_background(cell_value):
    """Return the CSS style for one p-value cell.

    Values strictly greater than 0.05 get a light-coral background; every
    other value (including exactly 0.05 and NaN) gets no extra styling.
    """
    return 'background-color: lightcoral;' if cell_value > 0.05 else ''

# Keep only features whose p-value exceeds 0.03, then render the table with
# the per-cell highlighter applied (cells above 0.05 are coloured).
p_values_df_high = p_values_df.loc[p_values_df['target'] > 0.03]
p_values_df_high.style.applymap(p_value_warning_background)

In [None]:
# Features whose rounded p-value against the target is above 0.05.
# NOTE(review): p > 0.05 means the linear correlation with the target is NOT
# significant at the 5% level, which reads as the opposite of "important" —
# confirm the intended direction of this filter.
IMPORTANT_COLS = p_values_df.loc[p_values_df['target'] > 0.05].index.tolist()

# Correlations

In [None]:
import gc

gc.collect()

# Drop the id columns from the numeric feature list before laying out the
# grid. Rebuilding the list (rather than calling .remove) keeps this cell
# re-runnable: a second run is a no-op instead of a ValueError.
NUM_FEATURES = [f for f in NUM_FEATURES if f not in ('time_id', 'investment_id')]

n_cols = 15
# Ceiling division on the filtered list, so a trailing partial group of
# features still gets its own heatmap row (round() could silently drop it).
nrows = (len(NUM_FEATURES) + n_cols - 1) // n_cols

# squeeze=False always returns a 2-D array of axes, even for a single row;
# ravel() turns it back into the 1-D sequence indexed below.
fig, axes = plt.subplots(nrows, 1, figsize=(48, 24), squeeze=False)
axes = axes.ravel()
plt.subplots_adjust(hspace=3)

# One heatmap strip per group of up to n_cols features: absolute correlation
# of each feature in the group with the target (last row of the corr matrix,
# without the target-vs-target cell).
for i in range(nrows):
    feature_list = NUM_FEATURES[i*n_cols:(i+1)*n_cols].copy()
    feature_list.append('target')
    sns.heatmap(train.loc[:, feature_list].corr().iloc[-1:, :-1].abs(), annot=True, vmin=0, vmax=0.3, ax=axes[i]);

In [None]:
gc.collect()

# Pairwise correlation of the numeric columns only. pandas >= 2.0 no longer
# drops non-numeric columns (such as the string row_id) by default and raises
# instead, so ask for numeric-only explicitly. Older pandas releases without
# that keyword already skip non-numeric columns, hence the fallback.
try:
    all_feature_corr = train.corr(numeric_only=True)
except TypeError:
    all_feature_corr = train.corr()

# Select the target row by label rather than by position, so the result does
# not depend on the column order of the frame.
target_corr = all_feature_corr.loc['target', :]

# Flag every column whose absolute correlation with the target is below 0.025.
IGNORE_COLS.extend(list(target_corr[target_corr.abs() < 0.025].index))

In [None]:
gc.collect()

# Keep only the strict upper triangle of the correlation matrix so each pair
# of columns is inspected once. The mask is cast with the builtin ``bool``:
# ``np.bool`` was an alias of it and was removed in NumPy 1.24.
upper = all_feature_corr.where(np.triu(np.ones(all_feature_corr.shape), k=1).astype(bool))

# A column is dropped when its correlation with any earlier column is above 0.9.
# NOTE(review): the comparison is signed, so strongly negative correlations
# (below -0.9) are not flagged — confirm whether .abs() was intended.
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
IGNORE_COLS.extend(to_drop)

In [None]:
# The id columns must never be ignored. Filter them out in place instead of
# calling .remove: this strips every occurrence (the list is extended by
# several cells and may hold duplicates) and does not raise when the cell is
# re-run or a column was never flagged.
IGNORE_COLS[:] = [col for col in IGNORE_COLS if col not in ('time_id', 'investment_id')]

# Ignore Cols

In [None]:
# Columns flagged for exclusion so far; the list is extended by several cells
# and may therefore contain duplicates.
print(IGNORE_COLS)

In [None]:
# Number of distinct columns flagged for exclusion (duplicates collapsed).
len(set(IGNORE_COLS))

# Important Cols

In [None]:
# Features whose rounded p-value against the target exceeded 0.05.
print(IMPORTANT_COLS)

In [None]:
# Number of entries in IMPORTANT_COLS.
len(IMPORTANT_COLS)

# Important Note

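Keep in mind that the `corr` function only measures linear relationships, so deleting columns purely by correlation score is not strictly correct: a feature with a low correlation can still carry non-linear signal. In this work, we deleted columns according to their correlation score anyway because of the data size.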