## Handling Missing Attribute Values and Rule Induction on the Iris Dataset

In [1]:
import time
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
from pandas.api.types import is_numeric_dtype
from functools import reduce

In [2]:
# Iris variant with lost values; unknown entries are stored as the literal '?'.
IRIS_LOST_CSV = '../data/Iris/Iris-5-lost.csv'
df = pd.read_csv(IRIS_LOST_CSV)

In [3]:
# Preview the first ten cases; a '?' cell is a missing attribute value.
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,?,3,1.4,?,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5,?,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,?,Iris-setosa
9,4.9,3.1,?,0.1,Iris-setosa


### Filling in the Missing Attribute Values with the Lost Value Interpretation

#### Defining the Goal Sets

In [4]:
# The concept (decision) column is taken to be the last column of the frame.
df_headers = df.columns.tolist()
concept = df_headers[-1]
concept

'class'

In [5]:
# Distinct values found in the concept column (one goal set is built per value).
concept_list = df.loc[:, concept].unique()
concept_list

array(['Iris-setosa', 'Iris-versicolor', 'Iris-viginica'], dtype=object)

In [6]:
# Partition the case (row) indices by concept value to build the goal sets.
#
# U is the universe of all cases. It is built exactly once: the previous
# version appended to U inside the per-concept loop, so every case was added
# once per concept (e.g. 450 entries for 150 rows and 3 concepts).
# NOTE(review): U keeps the original 1-based numbering (index + 1) while
# goal_list holds the 0-based index labels -- confirm which convention the
# downstream cells expect.
U = [index + 1 for index in df.index]

# One list of index labels per concept, in the same order as concept_list.
# A boolean mask per concept replaces the nested iterrows() scan.
goal_list = [df.index[df[concept] == item].tolist() for item in concept_list]

# Kept (empty) only because the original cell left this scratch name defined.
temp_list = []

In [7]:
# Show the goal sets: one list of case indices per concept.
print(goal_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]]


#### Building the Case List

In [8]:
# Column labels in file order; the final entry is the concept column.
attributes = df.columns.tolist()
attributes

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [9]:
# Accumulator for the "attribute,low..high" interval strings that
# discretize() appends, two per cut point.
case_list = []

In [10]:
def discretize(numeric_col, data=None, cases=None, missing_markers=('?',), precision=1):
    """Append the candidate interval cases of one numeric attribute to the case list.

    The distinct known values of the column are sorted numerically and a cut
    point is placed midway between every pair of neighbouring values. For each
    cut point two strings are appended:
    ``"<col>,<min>..<cut>"`` and ``"<col>,<cut>..<max>"``.

    Unlike the earlier version, this function does not write a helper
    ``sort_col`` column into the frame, sorts by numeric value rather than by
    string order (so ``'10'`` comes after ``'2'``), and does not fail when the
    column contains no missing marker.

    Parameters
    ----------
    numeric_col : str
        Name of the column to discretize.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``.
    cases : list, optional
        List that receives the interval strings. Defaults to the
        notebook-level ``case_list``.
    missing_markers : iterable of str, optional
        Cell values (compared as strings) that denote a missing value and are
        skipped. NaN cells are skipped as well.
    precision : int, optional
        Number of decimals used when rounding cut points and interval bounds.
        NOTE(review): with the default of 1, midpoints such as 4.35 and 4.45
        collapse onto neighbouring values, which yields duplicate and
        degenerate intervals (e.g. ``4.3..4.3``); kept as the default to
        preserve the existing case strings -- consider ``precision=2``.

    Returns
    -------
    None
        The result is delivered by appending to ``cases`` / ``case_list``.
    """
    frame = df if data is None else data
    sink = case_list if cases is None else cases

    print(numeric_col)

    # Keep only known values: drop NaN cells and the textual missing markers.
    column = frame[numeric_col]
    is_known = column.notna() & ~column.astype(str).isin(list(missing_markers))

    # Sort by numeric value while keeping the original cell text for printing.
    point_list = sorted(column[is_known].unique(), key=float)
    print(point_list)

    # Cut points: the average of each pair of neighbouring distinct values.
    avg_list = [
        round((float(low) + float(high)) / 2, precision)
        for low, high in zip(point_list, point_list[1:])
    ]
    print(avg_list)

    # Fewer than two distinct known values -> no cut points, nothing to add.
    if not avg_list:
        return

    lowest = round(float(point_list[0]), precision)
    highest = round(float(point_list[-1]), precision)
    for cut in avg_list:
        sink.append(str(numeric_col) + "," + str(lowest) + ".." + str(cut))
        sink.append(str(numeric_col) + "," + str(cut) + ".." + str(highest))

In [11]:
# Discretize every attribute column, i.e. all columns except the trailing
# concept column. Cut points are rounded to one decimal place by discretize().
for item in attributes[:-1]:
    discretize(item)

sepal_length
['4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '5', '5.1', '5.2', '5.3', '5.4', '5.5', '5.6', '5.7', '5.8', '5.9', '6', '6.1', '6.2', '6.3', '6.4', '6.5', '6.6', '6.7', '6.8', '6.9', '7', '7.1', '7.2', '7.3', '7.4', '7.6', '7.7', '7.9']
[4.3, 4.5, 4.5, 4.7, 4.8, 4.8, 5.0, 5.0, 5.2, 5.2, 5.3, 5.5, 5.5, 5.7, 5.8, 5.8, 6.0, 6.0, 6.2, 6.2, 6.3, 6.5, 6.5, 6.7, 6.8, 6.8, 7.0, 7.0, 7.2, 7.2, 7.3, 7.5, 7.7, 7.8]
sepal_width
['2', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9', '3', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9', '4', '4.1', '4.2', '4.4']
[2.1, 2.2, 2.3, 2.5, 2.5, 2.7, 2.8, 2.8, 3.0, 3.0, 3.2, 3.2, 3.3, 3.5, 3.5, 3.7, 3.8, 3.8, 4.0, 4.0, 4.2, 4.3]
petal_length
['1', '1.1', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7', '1.9', '3', '3.3', '3.5', '3.6', '3.7', '3.8', '3.9', '4', '4.1', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '5', '5.1', '5.2', '5.3', '5.4', '5.5', '5.6', '5.7', '5.8', '5.9', '6', '6.1', '6.3', '6.4', '6.6', '6.7',

In [12]:
case_list

['sepal_length,4.3..4.3',
 'sepal_length,4.3..7.9',
 'sepal_length,4.3..4.5',
 'sepal_length,4.5..7.9',
 'sepal_length,4.3..4.5',
 'sepal_length,4.5..7.9',
 'sepal_length,4.3..4.7',
 'sepal_length,4.7..7.9',
 'sepal_length,4.3..4.8',
 'sepal_length,4.8..7.9',
 'sepal_length,4.3..4.8',
 'sepal_length,4.8..7.9',
 'sepal_length,4.3..5.0',
 'sepal_length,5.0..7.9',
 'sepal_length,4.3..5.0',
 'sepal_length,5.0..7.9',
 'sepal_length,4.3..5.2',
 'sepal_length,5.2..7.9',
 'sepal_length,4.3..5.2',
 'sepal_length,5.2..7.9',
 'sepal_length,4.3..5.3',
 'sepal_length,5.3..7.9',
 'sepal_length,4.3..5.5',
 'sepal_length,5.5..7.9',
 'sepal_length,4.3..5.5',
 'sepal_length,5.5..7.9',
 'sepal_length,4.3..5.7',
 'sepal_length,5.7..7.9',
 'sepal_length,4.3..5.8',
 'sepal_length,5.8..7.9',
 'sepal_length,4.3..5.8',
 'sepal_length,5.8..7.9',
 'sepal_length,4.3..6.0',
 'sepal_length,6.0..7.9',
 'sepal_length,4.3..6.0',
 'sepal_length,6.0..7.9',
 'sepal_length,4.3..6.2',
 'sepal_length,6.2..7.9',
 'sepal_leng

#### Handling Missing Attributes