# IMPORT MODULES

In [66]:
import logging
import math

import pandas as pd

# CATEGORIES CLASS

In [67]:
class Categories:
    """One category (level) of a feature.

    Attributes:
        name: Category label as passed in by the caller.
        index: Integer position of the category as passed in by the caller.
        prob: Starts at 0; filled in later by the probability step.
        entropy: Starts at 0; filled in later by the entropy step.
    """

    def __init__(self, _name, _index):
        # Identity of the category.
        self.name, self.index = _name, _index
        # Statistics are unknown at construction time.
        self.prob = self.entropy = 0

# FEATURES CLASS

In [72]:
class Features:
    """A single feature together with its categories and derived statistics.

    Attributes:
        name: Feature name as passed in by the caller.
        feat_type: Feature type tag as passed in by the caller
            (the readers in this notebook pass 'ordinal' or 'nominal').
        num_cat: Number of categories; starts at 0.
        categories: Dict of category objects keyed by category name; starts empty.
        entropy, std_entropy, reliability, weight: Numeric statistics; all start at 0.
        distances: Dict reserved for distance values; starts empty.
    """

    def __init__(self, _name, _feat_type):
        self.name, self.feat_type = _name, _feat_type
        self.num_cat = 0
        self.categories = {}
        # All derived statistics start at zero until they are computed.
        self.entropy = self.std_entropy = self.reliability = self.weight = 0
        self.distances = {}

# ORDINAL DATA

In [69]:
def read_ord(data):
    """Build ``Features`` objects for ordinal features from a category CSV.

    The file is read with no header row and with its first column as the
    index, so every row describes one feature: the index value is the feature
    name and the remaining non-null cells are its category names. A category's
    ``index`` is its left-to-right position among those non-null cells.

    Args:
        data: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        dict mapping feature name -> ``Features`` with ``feat_type='ordinal'``
        and ``num_cat`` / ``categories`` filled in.

    Raises:
        Any exception raised by ``pandas.read_csv`` (e.g. ``FileNotFoundError``)
        is re-raised unchanged after a short message is printed.
    """
    try:
        ordinal_categories = pd.read_csv(data, header=None, index_col=0)
    except Exception:
        # read_csv never raises ImportError, so the previous handler was dead
        # code; report the failure and let the real error propagate instead of
        # continuing with an unbound variable.
        print('Error Importing Ordinal Data')
        raise

    features = dict()

    for feat, row in ordinal_categories.iterrows():
        feat_instance = Features(feat, 'ordinal')

        # Rows are padded with NaN up to the widest row; keep real names only.
        cat_names = row.dropna().tolist()
        feat_instance.num_cat = len(cat_names)

        for _index, _name in enumerate(cat_names):
            feat_instance.categories[_name] = Categories(_name, _index)

        features[feat] = feat_instance

    return features

# NOMINAL DATA

In [70]:
def read_norm(data):
    """Build ``Features`` objects for nominal features from a category CSV.

    The file is read with no header row and with its first column as the
    index, so every row describes one feature: the index value is the feature
    name and the remaining non-null cells are its category names. A category's
    ``index`` is its left-to-right position among those non-null cells.

    Args:
        data: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        dict mapping feature name -> ``Features`` with ``feat_type='nominal'``
        and ``num_cat`` / ``categories`` filled in.

    Raises:
        Any exception raised by ``pandas.read_csv`` (e.g. ``FileNotFoundError``)
        is re-raised unchanged after a short message is printed.
    """
    try:
        nominal_categories = pd.read_csv(data, header=None, index_col=0)
    except Exception:
        # read_csv never raises ImportError, so the previous handler was dead
        # code; report the failure and let the real error propagate instead of
        # continuing with an unbound variable.
        print('Error Importing Nominal Data')
        raise

    features = dict()

    for feat, row in nominal_categories.iterrows():
        feat_instance = Features(feat, 'nominal')

        # Rows are padded with NaN up to the widest row; keep real names only.
        cat_names = row.dropna().tolist()
        feat_instance.num_cat = len(cat_names)

        for _index, _name in enumerate(cat_names):
            feat_instance.categories[_name] = Categories(_name, _index)

        features[feat] = feat_instance

    return features

# TEST READ ORD AND NOM

In [73]:
# Parse the ordinal category definitions into a dict of Features keyed by feature name.
feat = read_ord('Preprocessed_Ordinal.csv')

In [74]:
# Print every parsed feature followed by its categories, to eyeball the reader's output.
for f, feature in feat.items():
    print('name:', f)
    print('type:', feature.feat_type)
    print('num_cat:', feature.num_cat)
    print()
    for cat, category in feature.categories.items():
        print('    index:', category.index)
        print('    name:', category.name)
        print()
    print()

name: achievingends_quality
type: ordinal
num_cat: 3

    index: 0
    name: poor

    index: 1
    name: medium

    index: 2
    name: good


name: anger_quality
type: ordinal
num_cat: 3

    index: 0
    name: poor

    index: 1
    name: medium

    index: 2
    name: good


name: anger_speed
type: ordinal
num_cat: 4

    index: 0
    name: slowly

    index: 1
    name: moderately

    index: 2
    name: quickly

    index: 3
    name: variably


name: appetite_amount
type: ordinal
num_cat: 4

    index: 0
    name: low

    index: 1
    name: medium

    index: 2
    name: high

    index: 3
    name: variable


name: bladder_amount
type: ordinal
num_cat: 4

    index: 0
    name: low

    index: 1
    name: medium

    index: 2
    name: high

    index: 3
    name: variable


name: body_odour
type: ordinal
num_cat: 3

    index: 0
    name: veryless

    index: 1
    name: mild

    index: 2
    name: strong


name: bodybuild_size
type: ordinal
num_cat: 3

    index: 0
    name

## Verdict: Working

You can also run the same check for the nominal features here (using `read_norm`) if you want.


# READ POPULATION

In [82]:
 # Load the preprocessed population table; the first CSV column becomes the row index.
 df = pd.read_csv('Preprocessed_Dataset.csv', index_col=0)

In [83]:
# Remove the 'prakriti' column from the population table.
# errors='ignore' makes this a no-op when the column is already gone.
df = df.drop(columns='prakriti', errors='ignore')

# this cell is safe to re-run: a second run no longer raises KeyError

In [84]:
# Preview the first rows of the population table.
df.head()

Unnamed: 0_level_0,achievingends_quality,anger_quality,anger_speed,appetite_amount,appetite_frequency,bladder_amount,bladder_frequency,body_odour,bodybuild_size,bodyframe_breadth,...,working_quality,working_speed,working_style,skin_clear,skin_cracked,skin_freckle,skin_mark,skin_mole,skin_pimple,skin_wrinkled
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V091206280,good,medium,slowly,medium,regular,medium,regular,mild,moderatelydeveloped,medium,...,sharp/accurate/spontaneous,medium,sharp/accurate,yes,no,no,no,no,no,no
V091201522,poor,good,quickly,medium,regular,medium,regular,veryless,welldeveloped,broad,...,wellthoughtof,slow,firm/steady,no,yes,no,no,no,yes,no
V091206079,poor,good,slowly,medium,regular,high,regular,strong,weaklydeveloped,thin/narrow,...,wavering/easilydeviated,quick/fast/brisk,unsteady,no,no,no,yes,yes,no,no
V091202231,medium,medium,variably,high,regular,medium,regular,strong,moderatelydeveloped,medium,...,sharp/accurate/spontaneous,quick/fast/brisk,sharp/accurate,no,no,no,yes,no,no,no
V091209882,poor,poor,quickly,variable,irregular,medium,regular,mild,moderatelydeveloped,medium,...,sharp/accurate/spontaneous,medium,sharp/accurate,no,no,yes,yes,no,no,no


In [85]:
def probability(df, features):
    """Store each category's relative frequency in ``df`` on its category object.

    For every feature name in ``features`` the same-named column of ``df`` is
    tallied with ``value_counts(normalize=True, sort=False)`` (pandas leaves
    missing values out of that tally by default) and each fraction is written
    to ``prob`` of the matching entry in the feature's ``categories``.

    Categories that never occur in the column are left untouched. A column
    value without an entry in ``categories`` raises ``KeyError``.

    Returns:
        The same ``features`` dict, mutated in place.
    """
    for feat_name, feature in features.items():
        frequencies = df[feat_name].value_counts(normalize=True, sort=False)
        for label in frequencies.index:
            feature.categories[label].prob = frequencies[label]

    return features

In [86]:
# Fill each category's prob with its relative frequency in the population table.
feat = probability(df,feat)

In [88]:
# Print one line per category: index, name and the probability just computed.
for f, feature in feat.items():
    print(f)
    print(feature.feat_type, feature.num_cat)
    for cat, category in feature.categories.items():
        print(category.index, category.name, category.prob)
    print()

achievingends_quality
ordinal 3
0 poor 0.1984126984126984
1 medium 0.45634920634920634
2 good 0.34523809523809523

anger_quality
ordinal 3
0 poor 0.192
1 medium 0.572
2 good 0.236

anger_speed
ordinal 4
0 slowly 0.18326693227091634
1 moderately 0.3187250996015936
2 quickly 0.47808764940239046
3 variably 0.0199203187250996

appetite_amount
ordinal 4
0 low 0.17391304347826086
1 medium 0.6521739130434783
2 high 0.11462450592885376
3 variable 0.05928853754940711

bladder_amount
ordinal 4
0 low 0.012195121951219513
1 medium 0.8780487804878049
2 high 0.07317073170731707
3 variable 0.036585365853658534

body_odour
ordinal 3
0 veryless 0.44621513944223107
1 mild 0.44621513944223107
2 strong 0.10756972111553785

bodybuild_size
ordinal 3
0 weaklydeveloped 0.2964426877470356
1 moderatelydeveloped 0.44664031620553357
2 welldeveloped 0.25691699604743085

bodyframe_breadth
ordinal 3
0 thin/narrow 0.3531746031746032
1 medium 0.39285714285714285
2 broad 0.25396825396825395

bodyframe_length
ordinal 3


In [89]:
def entropy(features):
    """Compute the entropy term ``-p * ln(p)`` for every category.

    Reads ``prob`` from each category of each feature and writes the result to
    that category's ``entropy`` attribute (natural logarithm).

    A category whose probability is zero gets an entropy of 0 (the usual
    ``0 * log 0 = 0`` convention). It is assigned explicitly so that re-running
    the function after probabilities change cannot leave a stale value behind.

    Returns:
        The same ``features`` dict, mutated in place.

    Raises:
        ValueError: from ``math.log`` if a probability is negative.
    """
    for feat in features.values():
        for cat in feat.categories.values():
            # math.log(0) would raise, so the zero case is handled separately.
            cat.entropy = -cat.prob * math.log(cat.prob) if cat.prob else 0

    return features

In [90]:
import math
# Compute the per-category entropy terms from the probabilities stored on each category.
feat = entropy(feat)

In [91]:
# Print one line per category: index, name, probability and entropy term.
for f, feature in feat.items():
    print(f)
    print(feature.feat_type, feature.num_cat)
    for cat, category in feature.categories.items():
        print(category.index, category.name, category.prob, category.entropy)
    print()

achievingends_quality
ordinal 3
0 poor 0.1984126984126984 0.3209139051752534
1 medium 0.45634920634920634 0.35800456469063463
2 good 0.34523809523809523 0.3671679535339089

anger_quality
ordinal 3
0 poor 0.192 0.3168499021352363
1 medium 0.572 0.31952851650853803
2 good 0.236 0.3407659398537404

anger_speed
ordinal 4
0 slowly 0.18326693227091634 0.3109694460620067
1 moderately 0.3187250996015936 0.364438662775427
2 quickly 0.47808764940239046 0.3528101337130221
3 variably 0.0199203187250996 0.07800826746409728

appetite_amount
ordinal 4
0 low 0.17391304347826086 0.30420867040161026
1 medium 0.6521739130434783 0.2787678357566998
2 high 0.11462450592885376 0.24828741542881558
3 variable 0.05928853754940711 0.16751023444418836

bladder_amount
ordinal 4
0 low 0.012195121951219513 0.053740478625173824
1 medium 0.8780487804878049 0.11419299065695415
2 high 0.07317073170731707 0.19133852034411206
3 variable 0.036585365853658534 0.12102830336327353

body_odour
ordinal 3
0 veryless 0.4462151394

In [92]:
def probability(df, features):
    """Write each category's observed share of ``df`` into its ``prob`` attribute.

    Each key of ``features`` selects the same-named column of ``df``; the
    column is counted with ``value_counts(normalize=True, sort=False)`` (pandas
    leaves missing values out of that count by default) and every fraction is
    stored on the matching entry of the feature's ``categories``.

    Categories absent from the column keep their current ``prob``. A column
    value with no entry in ``categories`` raises ``KeyError``.

    Returns:
        The same ``features`` dict, mutated in place.
    """
    for column, feature in features.items():
        shares = df[column].value_counts(normalize=True, sort=False)
        for value in shares.index:
            feature.categories[value].prob = shares[value]

    return features

def entropy(features):
    """Compute the entropy term ``-p * ln(p)`` for every category.

    Reads ``prob`` from each category of each feature and writes the result to
    that category's ``entropy`` attribute (natural logarithm).

    A category whose probability is zero gets an entropy of 0 (the usual
    ``0 * log 0 = 0`` convention). It is assigned explicitly so that re-running
    the function after probabilities change cannot leave a stale value behind.

    Returns:
        The same ``features`` dict, mutated in place.

    Raises:
        ValueError: from ``math.log`` if a probability is negative.
    """
    for feat in features.values():
        for cat in feat.categories.values():
            # math.log(0) would raise, so the zero case is handled separately.
            cat.entropy = -cat.prob * math.log(cat.prob) if cat.prob else 0

    return features


def std_entropy(features):
    """Set each feature's ``std_entropy`` to ``ln(num_cat)``.

    Features whose ``num_cat`` is falsy (e.g. 0) are skipped and keep their
    current ``std_entropy``.

    Returns:
        The same ``features`` dict, mutated in place.
    """
    for feature in features.values():
        if not feature.num_cat:
            # log(0) is undefined; leave the existing value alone.
            continue
        feature.std_entropy = math.log(feature.num_cat)

    return features

def feature_entropy(features):
    """Set each feature's ``entropy`` to the sum of its categories' entropy terms.

    The terms are added in the iteration order of ``categories``, starting
    from 0; a feature with no categories therefore gets 0.

    Returns:
        The same ``features`` dict, mutated in place.
    """
    for feature in features.values():
        total = 0
        for category in feature.categories.values():
            total += category.entropy
        feature.entropy = total

    return features

def reliability(features):
    """Set each feature's ``reliability`` to ``entropy / std_entropy``.

    A feature whose ``std_entropy`` is zero cannot be normalised. Instead of
    dividing, a warning is logged and its ``reliability`` is set to 0. The
    check is explicit because NumPy floats return nan/inf on division by zero
    rather than raising ``ZeroDivisionError``, and because the previous error
    handler called an undefined ``Logging`` name and so raised ``NameError``.

    Returns:
        The same ``features`` dict, mutated in place.
    """
    for name, feat in features.items():
        if not feat.std_entropy:
            logging.warning(
                "reliability undefined for feature %r: std_entropy is 0", name
            )
            feat.reliability = 0
            continue
        feat.reliability = feat.entropy / feat.std_entropy

    return features

In [93]:
# Set each feature's std_entropy to the log of its category count.
feat = std_entropy(feat)

In [94]:
# Sum the per-category entropy terms into each feature's entropy.
feat = feature_entropy(feat)

In [95]:
# Derive each feature's reliability from its entropy and std_entropy.
feat = reliability(feat)

In [96]:
# Full report: feature-level statistics first, then every category's details.
for f, feature in feat.items():
    print('name:', f)
    print('type:', feature.feat_type)
    print('num_cat:', feature.num_cat)
    print('entropy:', feature.entropy)
    print('std_entropy:', feature.std_entropy)
    print('reliability:', feature.reliability)
    print()
    for cat, category in feature.categories.items():
        print('    index:', category.index)
        print('    name:', category.name)
        print('    prob:', category.prob)
        print('    entropy:', category.entropy)
        print()
    print()

name: achievingends_quality
type: ordinal
num_cat: 3
entropy: 1.046086423399797
std_entropy: 1.0986122886681098
reliability: 0.9521888970202655

    index: 0
    name: poor
    prob: 0.1984126984126984
    entropy: 0.3209139051752534

    index: 1
    name: medium
    prob: 0.45634920634920634
    entropy: 0.35800456469063463

    index: 2
    name: good
    prob: 0.34523809523809523
    entropy: 0.3671679535339089


name: anger_quality
type: ordinal
num_cat: 3
entropy: 0.9771443584975147
std_entropy: 1.0986122886681098
reliability: 0.8894351251815549

    index: 0
    name: poor
    prob: 0.192
    entropy: 0.3168499021352363

    index: 1
    name: medium
    prob: 0.572
    entropy: 0.31952851650853803

    index: 2
    name: good
    prob: 0.236
    entropy: 0.3407659398537404


name: anger_speed
type: ordinal
num_cat: 4
entropy: 1.1062265100145532
std_entropy: 1.3862943611198906
reliability: 0.7979737500489505

    index: 0
    name: slowly
    prob: 0.18326693227091634
    entrop

    entropy: 0.13669393635652213

    index: 2
    name: high/excessive
    prob: 0.003968253968253968
    entropy: 0.021942178918696122


name: joint_size
type: ordinal
num_cat: 3
entropy: 0.9219287926832922
std_entropy: 1.0986122886681098
reliability: 0.8391757512570537

    index: 0
    name: weaklydeveloped
    prob: 0.2
    entropy: 0.3218875824868201

    index: 1
    name: moderatelydeveloped
    prob: 0.624
    entropy: 0.29428146422233065

    index: 2
    name: welldeveloped
    prob: 0.176
    entropy: 0.3057597459741414


name: leg_length
type: ordinal
num_cat: 3
entropy: 0.39419951849000334
std_entropy: 1.0986122886681098
reliability: 0.3588158648470123

    index: 0
    name: tooshort/toolong
    prob: 0.011857707509881422
    entropy: 0.05258629090979538

    index: 1
    name: medium
    prob: 0.8853754940711462
    entropy: 0.10778865557089173

    index: 2
    name: long
    prob: 0.10276679841897234
    entropy: 0.2338245720093162


name: legs_movements
type: ordinal

    prob: 0.11462450592885376
    entropy: 0.24828741542881558

    index: 1
    name: moderate
    prob: 0.6521739130434783
    entropy: 0.2787678357566998

    index: 2
    name: excessive
    prob: 0.233201581027668
    entropy: 0.33950699859480726


name: speaking_speed
type: ordinal
num_cat: 4
entropy: 0.8051470943624687
std_entropy: 1.3862943611198906
reliability: 0.580790860111446

    index: 0
    name: slow
    prob: 0.11904761904761904
    entropy: 0.2533609173630081

    index: 1
    name: medium
    prob: 0.7182539682539683
    entropy: 0.23769326262084586

    index: 2
    name: quick
    prob: 0.15873015873015872
    entropy: 0.2921507354599186

    index: 3
    name: variable
    prob: 0.003968253968253968
    entropy: 0.021942178918696122


name: stool_consistency
type: ordinal
num_cat: 3
entropy: 0.5628243008646902
std_entropy: 1.0986122886681098
reliability: 0.5123047563458659

    index: 0
    name: loose/soft/semisolid
    prob: 0.06374501992031872
    entropy: 0.17

## Verdict: Working

Next, work on the distance computation in the file `distance.py`. You may start from scratch.