In [2]:
import pandas as pd

## Breast Cancer Data

#  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

8. Missing attribute values: 16

   There are 16 instances in Groups 1 to 6 that contain a single missing 
   (i.e., unavailable) attribute value, now denoted by "?".  

9. Class distribution:
 
   Benign: 458 (65.5%)
   Malignant: 241 (34.5%)


In [75]:
bc = pd.read_csv('breast-cancer-wisconsin.data', names = ['id','Clump Thickness','Uniformity of Size', 'Uniformity of Shape',
                                                         'Adhesion','Size','Bare Nuclei','Bland Chromatin','Norm Nucleoli',
                                                          'Mitoses', 'Class'])

In [26]:
bc.dtypes

id                      int64
Clump Thickness         int64
Uniformity of Size      int64
Uniformity of Shape     int64
Adhesion                int64
Size                    int64
Bare Nuclei            object
Bland Chromatin         int64
Norm Nucleoli           int64
Mitoses                 int64
Class                   int64
dtype: object

In [19]:
bc.describe()

Unnamed: 0,id,Clump Thickness,Uniformity of Size,Uniformity of Shape,Adhesion,Size,Bland Chromatin,Norm Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [38]:
# Show the distinct values found in every column except 'id', to spot
# unexpected entries (for example non-numeric placeholders).
for col in bc:
    if col != 'id':
        print(bc[col].unique())

[ 5  3  6  4  8  1  2  7 10  9]
[ 1  4  8 10  2  3  7  5  6  9]
[ 1  4  8 10  2  3  5  6  7  9]
[ 1  5  3  8 10  4  6  2  9  7]
[ 2  7  3  1  6  4  5  8 10  9]
['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
[ 3  9  1  2  4  5  7  8  6 10]
[ 1  2  7  4  5  3 10  6  9  8]
[ 1  5  4  2  3  7 10  8  6]
[2 4]


set()

In [13]:
# Quick check for missing data.
# total_na counts real NA/NaN entries over the whole frame.
# missing counts cells holding the literal '?' string, which isna() does not
# treat as missing because it is an ordinary string value.
total_na = int(bc.isna().sum().sum())
missing = int(bc.isin(['?']).sum().sum())
total_na

0

In [58]:
bc[bc['Bare Nuclei']== '?'].count()

id                     16
Clump Thickness        16
Uniformity of Size     16
Uniformity of Shape    16
Adhesion               16
Size                   16
Bare Nuclei            16
Bland Chromatin        16
Norm Nucleoli          16
Mitoses                16
Class                  16
dtype: int64

Since the rows with a missing value make up only about 2.3% of the data (16 of 699), I will simply drop the rows that do not have a value for Bare Nuclei. To do this, I select the rows where Bare Nuclei is '?' and drop them by their index.

In [60]:
bc.count()

id                     699
Clump Thickness        699
Uniformity of Size     699
Uniformity of Shape    699
Adhesion               699
Size                   699
Bare Nuclei            699
Bland Chromatin        699
Norm Nucleoli          699
Mitoses                699
Class                  699
dtype: int64

In [76]:
bc = bc.drop(bc[bc['Bare Nuclei']== '?'].index)

In [77]:
bc.count()

id                     683
Clump Thickness        683
Uniformity of Size     683
Uniformity of Shape    683
Adhesion               683
Size                   683
Bare Nuclei            683
Bland Chromatin        683
Norm Nucleoli          683
Mitoses                683
Class                  683
dtype: int64

In [78]:
bc['Bare Nuclei'] =  bc['Bare Nuclei'].astype(int)

In [79]:
bc.dtypes

id                     int64
Clump Thickness        int64
Uniformity of Size     int64
Uniformity of Shape    int64
Adhesion               int64
Size                   int64
Bare Nuclei            int32
Bland Chromatin        int64
Norm Nucleoli          int64
Mitoses                int64
Class                  int64
dtype: object

In [169]:
# Full preprocessing steps from above, gathered in one cell
import pandas as pd

# Mapping applied to the 'Class' column: 2 -> 0 and 4 -> 1
rep = {2:0,4:1}

# Column names are passed explicitly to read_csv
bc = pd.read_csv(
    'breast-cancer-wisconsin.data',
    names=[
        'id',
        'Clump Thickness',
        'Uniformity of Size',
        'Uniformity of Shape',
        'Adhesion',
        'Size',
        'Bare Nuclei',
        'Bland Chromatin',
        'Norm Nucleoli',
        'Mitoses',
        'Class',
    ],
)

# Remove rows whose 'Bare Nuclei' entry is '?', cast that column to int, then
# renumber the rows. reset_index() keeps the previous index as a new first
# column named 'index'.
bc = (
    bc.drop(bc.index[bc['Bare Nuclei'] == '?'])
      .astype({'Bare Nuclei': int})
      .reset_index()
)

bc['Class'] = bc['Class'].replace(rep)

In [10]:
rep = {2:0,4:1}
bc['Class'] = bc['Class'].replace(rep)


In [263]:
def build_dummies(df):
    '''One-hot encode every column of df except the first one.

    Each column from position 1 onward is expanded with pd.get_dummies using
    drop_first=True, so the first category of each column is omitted. The new
    columns are prefixed with the name of the column they come from, taken
    from df itself.

    Note that the last column is encoded as well. Callers that keep a label
    in the last column should remove it first if they do not want it among
    the returned features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its first column is skipped.

    Returns
    -------
    pandas.DataFrame
        The dummy columns side by side, or an empty DataFrame when df has
        fewer than two columns.
    '''
    # Build all pieces first and concatenate once, rather than growing a
    # frame inside the loop.
    encoded = [
        pd.get_dummies(df.iloc[:, i], drop_first=True, prefix=df.columns[i])
        for i in range(1, len(df.columns))
    ]
    if not encoded:
        return pd.DataFrame()
    return pd.concat(encoded, axis=1)
        

In [324]:
#assume first column is index and last column is label

def winnow(df, labels, theta=5, alpha=2, max_epochs=None):
    '''Fit a Winnow-style linear threshold classifier.

    Every weight starts at 1. A row is predicted positive when the weighted
    sum of its feature values is >= theta. For each misclassified row, the
    weights of the features equal to 1 in that row are divided by alpha
    (predicted positive, label negative) or multiplied by alpha (otherwise).
    Passes over the data are repeated until one full pass makes no mistake.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one row per example. Only entries equal to 1 select
        a weight for promotion/demotion.
    labels : sequence
        One label per row of df, matched by position (not by index label).
        Labels are compared directly with the boolean prediction, so they
        are expected to be 0/1 or bool.
    theta : number, default 5
        Decision threshold.
    alpha : number, default 2
        Multiplicative update factor.
    max_epochs : int or None, default None
        Optional cap on the number of full passes. With None the loop only
        stops after a mistake-free pass, which may never happen.

    Returns
    -------
    (weights, errors) : (list, int)
        Final weights (one per column of df) and the total number of
        misclassifications seen over all passes.

    Raises
    ------
    ValueError
        If labels does not have one entry per row of df.
    '''
    rows = df.values.tolist()
    # Positional copy of the labels: indexing a Series with labels[i] is a
    # label lookup and fails when the index has gaps (e.g. after dropping rows).
    targets = list(labels)
    if len(targets) != len(rows):
        raise ValueError('labels must have one entry per row of df')

    weights = [1] * len(df.columns)
    errors = 0
    epochs = 0
    done = False

    while not done:
        if max_epochs is not None and epochs >= max_epochs:
            import warnings
            warnings.warn('winnow stopped after max_epochs passes without a mistake-free pass',
                          RuntimeWarning)
            break
        epochs += 1
        done = True
        for row, target in zip(rows, targets):
            classified = sum(w * x for w, x in zip(weights, row)) >= theta
            if classified != target:
                errors += 1
                done = False
                if classified and not target:  # demote
                    weights = [w / alpha if x == 1 else w for w, x in zip(weights, row)]
                else:  # promote
                    weights = [w * alpha if x == 1 else w for w, x in zip(weights, row)]

    return weights, errors

In [326]:
a = build_dummies(bc)

In [327]:
w,error = winnow(a,bc['Class'])

here
here


In [333]:
def check_weights(df,w,theta):
    '''Verify winnow weights against a dataframe whose last column is the label.

    The frame is expanded with build_dummies, each expanded row is scored as
    the weighted sum of its values with w, and the row is predicted positive
    when that score is >= theta. Returns True only if every prediction equals
    the corresponding entry of the last column of df.
    '''
    features = build_dummies(df)

    predictions = []
    for row_pos in range(len(features)):
        row = features.iloc[row_pos,:]
        score = sum(value*weight for value,weight in zip(row,w))
        predictions.append(score >= theta)

    expected = df.iloc[:,-1]
    for predicted,actual in zip(predictions, expected):
        if not (predicted == actual):
            return False
    return True
     


In [334]:
check_weights(bc,w,5)

True

In [243]:
wx = [1,1,1,.5,1,1,2,2,1]
for i in range(2):
    print(sum([a*b for a,b in zip(bc.iloc[i,2:11],wx)]) )

19.5
43.5


In [248]:
[sum(a*b for a,b in zip(bc.iloc[i,2:11],wx)) for i in range(10)]

[19.5, 43.5, 18.5, 50.5, 19.5, 82.0, 24.5, 17.5, 16.5, 17.5]

In [227]:
[a*b for a,b in zip([1,1,2,3,1,1,1,1,1],bc.iloc[0,2:11])]

[5, 1, 2, 3, 2, 1, 3, 1, 1]

In [67]:
#Dummy variable trap
a = bc.copy()
temp = pd.get_dummies(bc['Clump Thickness'], prefix = 'Clump',drop_first = True)

In [68]:
temp2 = pd.get_dummies(bc['Bare Nuclei'], drop_first = True, prefix = 'Bare')

In [97]:
a = build_dummies(bc)
a.iloc[:,0]

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      1
8      1
9      0
10     0
11     1
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
24     0
25     0
26     0
27     0
28     1
29     0
30     0
      ..
669    0
670    0
671    0
672    1
673    0
674    0
675    0
676    0
677    0
678    0
679    1
680    0
681    0
682    0
683    0
684    0
685    0
686    0
687    0
688    0
689    0
690    0
691    0
692    0
693    0
694    0
695    1
696    0
697    0
698    0
Name: Clump Thickness_2, Length: 683, dtype: uint8

In [103]:
[a*b for a,b in zip([1,1,0],[5,4,3])]

[5, 4, 0]

In [105]:
c = (True==False)
c

False

In [106]:
x = [1,2,3]

In [110]:
x = bc.Class

In [129]:
bc.reset_index()

Unnamed: 0,index,id,Clump Thickness,Uniformity of Size,Uniformity of Shape,Adhesion,Size,Bare Nuclei,Bland Chromatin,Norm Nucleoli,Mitoses,Class
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0
5,5,1017122,8,10,10,8,7,10,9,7,1,1
6,6,1018099,1,1,1,1,2,10,3,1,1,0
7,7,1018561,2,1,2,1,2,1,3,1,1,0
8,8,1033078,2,1,1,1,2,1,1,1,5,0
9,9,1033078,4,2,1,1,2,1,2,1,1,0


In [115]:
w = [1,2,1,1,5]
# Divide the entries equal to 1 by 5, leave the others as they are, and print the result
print(list(map(lambda v: v/5 if v == 1 else v, w)))

[0.2, 2, 0.2, 0.2, 5]


names = ['id', 'RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type']
Attribute Information:
   1. Id number: 1 to 214
   2. RI: refractive index
   3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as 
                  are attributes 4-10)
   4. Mg: Magnesium
   5. Al: Aluminum
   6. Si: Silicon
   7. K: Potassium
   8. Ca: Calcium
   9. Ba: Barium
  10. Fe: Iron
  11. Type of glass: (class attribute)
      -- 1 building_windows_float_processed
      -- 2 building_windows_non_float_processed
      -- 3 vehicle_windows_float_processed
      -- 4 vehicle_windows_non_float_processed (none in this database)
      -- 5 containers
      -- 6 tableware
      -- 7 headlamps

8. Missing Attribute Values: None

Summary Statistics:
Attribute:   Min     Max      Mean     SD      Correlation with class
 2. RI:       1.5112  1.5339   1.5184  0.0030  -0.1642
 3. Na:      10.73   17.38    13.4079  0.8166   0.5030
 4. Mg:       0       4.49     2.6845  1.4424  -0.7447
 5. Al:       0.29    3.5      1.4449  0.4993   0.5988
 6. Si:      69.81   75.41    72.6509  0.7745   0.1515
 7. K:        0       6.21     0.4971  0.6522  -0.0100
 8. Ca:       5.43   16.19     8.9570  1.4232   0.0007
 9. Ba:       0       3.15     0.1750  0.4972   0.5751
10. Fe:       0       0.51     0.0570  0.0974  -0.1879

9. Class Distribution: (out of 214 total instances)
    -- 163 Window glass (building windows and vehicle windows)
       -- 87 float processed  
          -- 70 building windows
          -- 17 vehicle windows
       -- 76 non-float processed
          -- 76 building windows
          -- 0 vehicle windows
    -- 51 Non-window glass
       -- 13 containers
       -- 9 tableware
       -- 29 headlamps






In [335]:
glass_df = pd.read_csv('glass.data',names = ['id', 'RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type'])

In [336]:
glass_df.describe()

Unnamed: 0,id,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,107.5,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,61.920648,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.0,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,54.25,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,107.5,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,160.75,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,214.0,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [341]:
glass_df['RI'].max() - glass_df['RI'].min()

0.022780000000000022

In [None]:
def discretize_values(df):
    