In [70]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import math as m
from sklearn import svm

In [2]:
# bokeh plotting
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.layouts import row, column
from bokeh.models import PrintfTickFormatter
from bokeh.models import CustomJS, DateRangeSlider
from bokeh.models import Legend, ColumnDataSource, Label, LabelSet, Range1d
from bokeh.palettes import magma, viridis
output_notebook(hide_banner=True)


import matplotlib.pyplot as plt

# Utils

In [3]:
def plot_joint_return(values, mode = 'pyplot'):
    colors = ['goldenrod', 'purple', 'orange', 'olive', 'red', 'darkolivegreen', 'olivedrab', 'lightblue',\
              'lightblue', 'pink','blue']
    
    names = values.columns
    if mode == 'bokeh':
        for i in range(len(names)):
            for j in range(i+1, len(names)):
                    p = figure(plot_height=400, plot_width=400, title="")
                    ind_color = (i+j) % len(colors)
                    p.x(values[names[i]], values[names[j]], color=colors[ind_color], alpha=1, legend_label = names[i]+'/'+names[j], 
                           muted_color=colors[ind_color], muted_alpha=0.01)
                    p.legend.location = "top_left"
                    p.legend.click_policy = "mute"
                    show(p)
                    
    if mode == 'pyplot':
        n = len(names)
        num_row = max(int(m.ceil(0.25*(n-1)*n)), 1)
        num_col = 2
        
        fig, axs = plt.subplots(num_row, 2, figsize=(12,n*(n-1)))
        count = 0
        for i in range(n):
            for j in range(i+1, n):              
                row = count // num_col
                col = count % num_col
                ind_colors = count % len(colors)
                count += 1
                axs[row, col].scatter(values[names[i]], values[names[j]], s = 0.75, c = colors[ind_colors])
                axs[row, col].set_title(names[i] + "/" + names[j])
                axs[row, col].grid()
                axs[row, col].plot()

In [4]:
def plot_hist(array, bins, name, law = None, name_law = '', savefig = None):
    mean, std = np.mean(array), np.std(array)
    m, M = np.min(array), np.max(array)
    q10, q90 = np.quantile(array, q=0.1),np.quantile(array, q=0.9)
    
    dico = {
        'Mean' : mean,
        'Standard deviation' : std,
        'Minimum' : m,
        'Maximum' : M,
        'Quantile 0.1':q10,
        'Quantile 0.9':q90
    }
    
    N = array.shape[0]
    
    fig, ax = plt.subplots(figsize = (15,10))

    if(law!=None):
        x = np.arange(m, M)
        y = law(x, mean, std) # scale = 1/lambda = Esp(U)
        ax.plot(x,y, label = name_law)
        
    ax.hist(array, bins = bins , density = True)
    
    ax.set_title('Law of {} ; {} draws.'.format(name, N), fontsize=12)
    ax.set_xlabel(name, fontsize=10)
    ax.set_ylabel('quantity', fontsize=10)
    
    for key, value in dico.items():
        ax.plot([],[], '.', label = '{} : {}'.format(key, value), color = 'k')
        
    plt.legend(loc='upper left', fontsize='x-large', title_fontsize='x-small', ncol=1, bbox_to_anchor=(1.05, 1))

    if(savefig!=None):
        plt.savefig(savefig+'.png', dpi = 40)
    else:
        plt.show()

In [5]:
def plot_data(values, pred = None):
    p = figure(plot_height=450, plot_width=900, title="Log return", x_axis_type='datetime')
    
    colors = ['goldenrod', 'purple', 'orange', 'olive', 'olivedrab', 'lightblue',\
              'lightblue', 'pink', 'darkolivegreen', 'red', 'blue']
    names = values.columns[1:]
    box_colors = dict(zip(names, colors[0: len(names)]))
    
    for name in names:
        p.line(values[values.columns[0]], values[name], color=box_colors[name], alpha=1, legend_label = name,  muted_color=box_colors[name],
               muted_alpha=0.01)

    p.legend.location = "top_left"
    p.legend.click_policy = "mute"
    
    show(p)

# Drop

In [6]:
df = pd.read_csv("./_data/Исходные данные.csv", delimiter = ';', decimal=',').drop(columns= ['Unnamed: 0', 'nplv'])
df['DT'] = pd.to_datetime(df['DT'])

### Дроп пустых записей

In [7]:
df = df.loc[~df['t под током'].isnull()]

### Дроп колонок с большим числом пропусков

In [8]:
nuls = df.isnull().mean().to_frame().sort_values(by=0, ascending = False)
drop_col = nuls.index[0:10]

In [9]:
df = df.drop(columns = drop_col)

### Дроп сильно коррелированных данных 

In [10]:
pearson_corr = df.corr(method='pearson', min_periods=50)

In [11]:
pos_corr = np.abs(np.nan_to_num(pearson_corr.values, 0.0))
strong_corr = []
drop_col = []
for i in range(1, len(pos_corr)):
        if np.max(pos_corr[i, :i]) > 0.9:
            j = np.argmax(pos_corr[i, :i])
            strong_corr.append((i, j, pearson_corr.values[i, j]))
            drop_col.append(pearson_corr.columns[i])

In [12]:
print(np.array(strong_corr))

[[ 3.          1.          0.99324385]
 [21.          5.          0.91879086]
 [24.          9.          0.99571263]
 [25.          8.          0.93568965]
 [26.          7.          0.99844589]
 [27.          6.          0.99820447]
 [28.          2.          0.95936058]
 [45.         31.          0.95356955]
 [46.         32.          0.99067039]
 [50.         36.          0.9637086 ]
 [51.         37.          0.95663559]
 [53.         39.          0.90366969]
 [55.          9.          0.99386886]]


In [13]:
df = df.drop(columns = drop_col)

### Дроп колонок с одним значением

In [14]:
drop_col = []
for col in df.columns:
    if len(np.unique(df[col].values)) == 1:
        drop_col.append(col)

In [15]:
df = df.drop(columns = drop_col)

# Все марки стали

In [16]:
steel_grades = np.unique(df['МАРКА'].values)

In [17]:
grades_freq = df.groupby('МАРКА').count()['DT'].values
ind = np.argsort(grades_freq)
steel_grades = steel_grades[ind][::-1]

In [18]:
df_grades = [df[df['МАРКА'] == grade].reset_index(drop = True) for grade in steel_grades]

# Определение выбросов

### C помощью медианы и MAD

In [19]:
def median_outliers(df, threshold = 3.0, window_size = 200):
    median = np.nan_to_num(df.median().values, 0)
    values = np.nan_to_num(df.values - median, 0)
    mad = np.nan_to_num(df.mad().values, 0)
    mad = np.where( mad == 0.0, 1.0, mad)
    
    normal = np.where( np.max(np.abs(values) / mad, axis=1) < threshold)[0]
    outliers = np.where( np.max(np.abs(values) / mad, axis=1) >= threshold)[0]
    return normal, outliers

In [26]:
inds_outliers = median_outliers(df_grades[0][df.columns[3:]])

In [33]:
inds_outliers[1]

array([   0,    1,    2, ..., 4420, 4424, 4428])

### Критерий Граббса

In [45]:
def compute_test_grabs(df):
    mean = np.nan_to_num(df.mean().values, 0)
    std = np.nan_to_num(df.std().values, 1)
    values = np.nan_to_num(df.mean, 0)
    
    

In [60]:
mean = np.nan_to_num(df_grades[0][df_grades[0].columns[7]].mean(), 0)
std = np.nan_to_num(df_grades[0][df_grades[0].columns[7]].std(), 1)
values = np.nan_to_num(df_grades[0][df_grades[0].columns[7]].values - mean, 0)

In [77]:
g = np.sort( np.abs(values - mean) / std )

In [80]:
N = len(values)
t_crit = ss.t.ppf(0.1 / (2*N), N - 2)**2

stat = g > (N-1)*np.sqrt(t_crit / (N * (N - 2 + t_crit)))

In [82]:
stat.sum()

3380

In [83]:
N

4429

In [21]:
def get_col_with_nans(df0):
    nuls = df_grades[0].isnull().mean().to_frame().sort_values(by=0, ascending = False)
    size = np.sum(np.where(nuls > 0, 1, 0))
    return nuls[0:size]

In [23]:
col_with_nans = get_col_with_nans(df_grades[0])

In [31]:
df_grades[0].corr(method='pearson', min_periods=50)[['химшлак последний Al2O3', 'химшлак последний CaO', 
                                                     'химшлак последний R', 'химшлак последний SiO2']]

Unnamed: 0,химшлак последний Al2O3,химшлак последний CaO,химшлак последний R,химшлак последний SiO2
химшлак последний Al2O3,1.0,0.012699,0.414949,-0.448348
химшлак последний CaO,0.012699,1.0,0.416925,0.249888
химшлак последний R,0.414949,0.416925,1.0,-0.745729
химшлак последний SiO2,-0.448348,0.249888,-0.745729,1.0
