In [12]:
import pipelines_classes as our_ppl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import scipy.stats as stats
import json
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from functools import reduce
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import boxcox
from sklearn.model_selection import cross_validate
import category_encoders as ce
import math
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import clone

In [2]:
other_train = pd.read_csv('31/other_train.csv')
other_valid = pd.read_csv('31/other_valid.csv')

personal_train = pd.read_csv('31/personal_train.csv')
personal_valid = pd.read_csv('31/personal_valid.csv')

In [4]:
result_train = our_ppl.merge_and_deduplicate(personal_train, other_train, columns=['name','address'],deduplic=[1], func=our_ppl.func)

In [5]:
result_train.shape

(2237, 38)

In [13]:
true_false_cols = ['on antithyroid medication','query hyperthyroid','FTI measured',
                   'lithium','tumor','sick','TT4 measured','goitre','hypopituitary',
                   'TBG measured','TSH measured', 'T3 measured', 'on thyroxine',
                   'query on thyroxine', 'psych', 'I131 treatment', 'query hypothyroid',
                   'T4U measured', 'thyroid surgery', 'pregnant']

numeric_columns = ['TT4', 'T4U','TSH', 'T3', 'FTI','age']

all_numeric_columns = ['FTI', 'T3', 'T4U',  'TSH', 'TT4', 'age', 'capital-gain',
                       'capital-loss', 'education-num', 'fnlwgt', 'hours-per-week']

categorical_columns = ['occupation','sex','workclass', 'tumor']

all_categorical_columns = ['marital-status','education','native-country','occupation',
                       'race','referral source','relationship','sex','sick','tumor','workclass',
                       'pregnant',	'thyroid surgery', 'I131 treatment',  'psych', 'query hypothyroid']


encoder = ce.OneHotEncoder()
model_numeric = LinearRegression()
model_categorical = RandomForestClassifier(n_estimators = 100,
                    max_depth = 12, random_state = 8)


ppl = Pipeline([
    
    ('ExtraxtMedicalInfo',our_ppl.Extraction('medical_info')),
    ('AllNumObjectstoNumeric', our_ppl.ObjectToNumeric(all_numeric_columns)),
    
    
    ('ReplaceFalseCorrect',our_ppl.StringReplacer(true_false_cols,'f','f')),
    ('ReplaceTrueCorrect',our_ppl.StringReplacer(true_false_cols,'t','t')),
    
    ('ReplaceClassNegative',our_ppl.StringReplacer(['class'],'negativ','negative')),
    ('ReplaceClassIncreased',our_ppl.StringReplacer(['class'],'increase','increased')),
    ('ReplaceClassDecreased',our_ppl.StringReplacer(['class'],'decrease','decreased')),
    
    ('FindNans', our_ppl.StringReplacer(all_categorical_columns, '?', np.NaN)),
    
    # normalizacia
    ('Ln', our_ppl.LogNormalization(numeric_columns)),
    # odstranenie outlierov
    ('ReplaceExtremes',our_ppl.ReplaceOutliersWithPercentile(numeric_columns)),
    # doplnenie prazdnych hodnot modelmi
    ('ReplaceNumericNansWithModel',our_ppl.ReplaceNanWithModel(numeric_columns, LinearRegression(), numeric_columns)),
    ('ReplaceCategoricalNansWithModel', our_ppl.ReplaceCategoryNanWithModel(['occupation','sex','workclass','tumor'],
                                                               ['occupation','sex','workclass', 'tumor', 'marital-status'],
                                                                model_categorical, encoder)),
    # doplnenie prazdnych hodnot v pripade velkeho poctu kategorii, ktore predpokladame, ze z pohladu analyzy nezohravaju az taku dolezitu ulohu
    ('ReplaceWithMostFrequent', our_ppl.ReplaceMostFrequent(col_names=['native-country','FTI measured', 'relationship',
                                                               'query hypothyroid','hours-per-week']))
    
])

model =ppl.fit(result_train)
X_train = ppl.transform(result_train)

-- Extraction of  medical_info
-- Transform object to numeric for:  ['FTI', 'T3', 'T4U', 'TSH', 'TT4', 'age', 'capital-gain', 'capital-loss', 'education-num', 'fnlwgt', 'hours-per-week']
-- Replacing: ** f ** to:  f  , for:  ['on antithyroid medication', 'query hyperthyroid', 'FTI measured', 'lithium', 'tumor', 'sick', 'TT4 measured', 'goitre', 'hypopituitary', 'TBG measured', 'TSH measured', 'T3 measured', 'on thyroxine', 'query on thyroxine', 'psych', 'I131 treatment', 'query hypothyroid', 'T4U measured', 'thyroid surgery', 'pregnant']
-- Replacing: ** t ** to:  t  , for:  ['on antithyroid medication', 'query hyperthyroid', 'FTI measured', 'lithium', 'tumor', 'sick', 'TT4 measured', 'goitre', 'hypopituitary', 'TBG measured', 'TSH measured', 'T3 measured', 'on thyroxine', 'query on thyroxine', 'psych', 'I131 treatment', 'query hypothyroid', 'T4U measured', 'thyroid surgery', 'pregnant']
-- Replacing: ** negativ ** to:  negative  , for:  ['class']
-- Replacing: ** increase ** to:  incr



pocet hodnot: 2000/2237, workclass(accuracy score): 0.749998054226
pocet hodnot: 2000/2237, tumor(accuracy score): 0.975001128132
transform  ['occupation', 'sex', 'workclass', 'tumor']
pocet predikovanych pre  occupation :  151
pocet predikovanych pre  sex :  88
pocet predikovanych pre  workclass :  152
pocet predikovanych pre  tumor :  1
transform  ['TT4', 'T4U', 'TSH', 'T3', 'FTI', 'age']
pocet predikovanych pre  TT4 :  360
pocet predikovanych pre  T4U :  235
pocet predikovanych pre  TSH :  232
pocet predikovanych pre  T3 :  468
pocet predikovanych pre  FTI :  235
pocet predikovanych pre  age :  1
transform  ['occupation', 'sex', 'workclass', 'tumor']
pocet predikovanych pre  occupation :  151
pocet predikovanych pre  sex :  88
pocet predikovanych pre  workclass :  152
pocet predikovanych pre  tumor :  1


Index(['Unnamed: 0_x', 'name', 'address', 'age', 'sex', 'date_of_birth', 'FTI',
       'FTI measured', 'T3', 'T4U', 'TBG', 'TBG measured', 'TSH', 'TT4',
       'TT4 measured', 'Unnamed: 0_y', 'capital-gain', 'capital-loss', 'class',
       'education', 'education-num', 'fnlwgt', 'goitre', 'hours-per-week',
       'hypopituitary', 'lithium', 'marital-status', 'medical_info',
       'native-country', 'occupation', 'on antithyroid medication',
       'query hyperthyroid', 'race', 'referral source', 'relationship', 'sick',
       'tumor', 'workclass', 'query hypothyroid', 'T4U measured', 'pregnant',
       'thyroid surgery', 'TSH measured', 'query on thyroxine',
       'I131 treatment', 'on thyroxine', 'T3 measured', 'psych'],
      dtype='object')

-- Replacing: ** f ** to:  f  , for:  ['TBG measured']


Unnamed: 0.1,Unnamed: 0,name,address,query hyperthyroid,FTI measured,education,lithium,TT4,T4U,capital-loss,...,hypopituitary,medical_info,on antithyroid medication,referral source,education-num,occupation,TBG measured,TBG,race,FTI
0,0,Christine Tanner,"340 Moon Freeway\nTamarafort, MO 35449",f,t,HS-grad,f,84.0,0.87,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,SVI,9.0,Adm-clerical,f,?,White,97
1,1,Cynthia Raio,"446 Lynch Prairie Apt. 742\nKristineshire, CO ...",f,t,HS-grad,f,128.0,1.14,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,9.0,Adm-clerical,f,?,Black,112
2,2,Jason Muller,"198 Garcia Stravenue Apt. 769\nEast Shawnview,...",f,t,HS-grad,f,114.0,1.02,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,SVHC,9.0,Handlers-cleaners,f,?,White,112
3,3,Sharon Tomassi,"1695 Judy Burg\nEast Kurtland, MI 54744",f,t,Some-college,f,91.0,1.16,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,10.0,Sales,f,?,Black,79
4,4,Christopher Sells,Unit 5029 Box 6752\nDPO AE 90819,f,t,HS-grad,f,83.0,0.82,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,SVI,9.0,?,f,?,White,101
5,5,Michelle Zayas,"3984 Cardenas Ridges\nSouth David, HI 89301",f,f,HS-grad,f,,,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'f','p...",f,other,9.0,Other-service,f,?,Black,?
6,6,Serena Jones,Unit 5665 Box 2315\nDPO AP 64211,f,t,Bachelors,f,81.0,1.16,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,13.0,Exec-managerial,f,?,White,70
7,7,Kimberly Seacat,"346 Baker Forks\nEast Williamtown, SD 11575",f,t,HS-grad,f,201.0,1.08,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,9.0,Exec-managerial,f,?,White,186
8,8,Frank Gerace,Unit 4662 Box 7515\nDPO AP 48074,f,t,1st-4th,f,107.0,1.06,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,2.0,Farming-fishing,f,?,White,101
9,9,Taryn Medina,"46320 Cassie Trail Suite 868\nMatthewville, WV...",f,t,Bachelors,f,116.0,1.06,0.0,...,f,"{'query hypothyroid':'f','T4U measured':'t','p...",f,other,13.0,Prof-specialty,f,?,White,109


In [None]:
def manual_features():
    for 
    