In [1]:
import random
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm

In [2]:
def test_train_split(df,fraction, discretize = False, n_bins = 10):
    
    num = math.ceil(len(df)*fraction)
        
    if discretize:
        y_data = df.iloc[:,len(df.columns)-1]
        X_data = df.iloc[:,:-1] #remove class column
        
        scaler = StandardScaler(copy=True)
        Xt = scaler.fit_transform(X_data)
        est = KBinsDiscretizer(n_bins, encode ='onehot-dense',strategy='quantile')
        Xt1 = est.fit_transform(Xt)
        Xt2 = pd.DataFrame(Xt1)
        
        Xt2.insert(len(Xt2.columns),'Class',y_data)
        train = Xt2.sample(n = num, replace = False)
        test = Xt2[~Xt2.index.isin(train.index)]
        train = train.reset_index(drop=True,inplace=False)
        test = test.reset_index(drop=True,inplace = False)
        
        X_train = train.iloc[:,0:-1]
        y_train = train.iloc[:,-1]
        X_test = test.iloc[:,0:-1]
        y_test = test.iloc[:,-1]

        return X_train,y_train,X_test,y_test
    
    else:
        
        train = df.sample(n = num, replace = False)
        test = df[~df.index.isin(train.index)]
        train = train.reset_index(drop=True,inplace=False)
        test = test.reset_index(drop=True,inplace = False)
        
        X_train = train.iloc[:,0:-1]
        y_train = train.iloc[:,-1]
        X_test = test.iloc[:,0:-1]
        y_test = test.iloc[:,-1]
   
    return X_train,y_train,X_test,y_test

In [3]:
def build_dummies(df):
    if df.columns[0] == 'id':
        df = df.iloc[:,1:]
    else:
        df = df.iloc[:,0:]
    df_dum = pd.DataFrame()
    for i in range(len(df.columns)):
        temp = pd.get_dummies(df.iloc[:,i],drop_first=False,prefix = df.columns[i])
        df_dum = pd.concat([df_dum,temp], axis = 1)
    
    return df_dum

In [4]:
def confusion(y_act,y_pred, margins = False):
    #y_act = pd.Categorical(y_test.astype(bool))
    y_pred = pd.Categorical(y_pred)
    df_conf = pd.crosstab(y_act, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=margins)

    return df_conf

In [5]:
def class_prob(classdata):
    class_totals = defaultdict(int)
    class_prob = defaultdict(int)
    #listdf = df.values.tolist()
    total = len(classdata)
    for i in classdata:
        class_totals[i] += 1
    for key in class_totals:
        class_prob[key] = class_totals[key]/total
    return class_totals, class_prob

In [6]:
def splitByClass(X,y):
    classData = defaultdict(list)    
    listdf = X.values.tolist()
    for i in range(len(listdf)):
        row = listdf[i]
        classData[y[i]].append(row)
    return classData

In [7]:
def feature_stats(X):
    '''results verified with excel'''
    feature_data = defaultdict(list)
    features = X.values.tolist()
    stats = [(np.mean(feature), np.var(feature)) for feature in zip(*features)]
    for i in range(len(X.columns)):
        feature_data[X.columns[i]] = stats[i]
    
    return feature_data

In [8]:
def feature_stats_byClass(classdict):
    '''results verified with excel'''
    features_byClass = defaultdict(dict)
    for key in classdict:
        classmean = [np.mean(feature) for feature in zip(*classdict[key])]
        classvar = [np.var(feature,dtype=np.float64) for feature in zip(*classdict[key])]
        features_byClass[key] = {"mean":classmean,"var":classvar}
    
    return features_byClass

In [9]:
def prob_calc(xval,xmean,xvariance):
    '''assume iid, normally distributed sampling distribution'''
    return norm.pdf(xval,xmean,xvariance) + 1/100

In [59]:
def NB_predict(test, fstats,by_class,classprob):
    '''given data, looking for P(class | data) using the trained data from from feature values'''
    #which class has highest probability given the data?
    classes = classprob.keys()
    post = {x:-1 for x in range(len(test))}
    preds = [-1]*len(test)
    for i in range(len(test)):
        for j in classes:
            prob_class = classprob[j]
            maxprob = 0
            likelihood = 1
            for k in range(len(test.columns)):
                likelihood = likelihood * (prob_calc(test.iloc[i,k]+.1,by_class[j]['mean'][k]+.1,by_class[j]['var'][k]+.1))

            posterior = likelihood * prob_class

            if posterior > post[i]:                
                post[i] = posterior
                preds[i] = j
                
    return preds

In [60]:
def NB_train(X,y):
    fstats = feature_stats(X)
    by_class = feature_stats_byClass(splitByClass(X,y))
    classtot,classprob = class_prob(y)
    #glass_nb = NB_predict(X,fstats,by_class,classprob)
    return fstats,by_class,classprob

In [78]:
X_train.shape

(144, 9)

In [69]:
def NB_error(preds, labels):
    error = [1 if preds[i] != labels[i] else 0 for i in range(len(preds))]
    return sum(error)

In [None]:
def forwardSelection(X, y):
    """
    Forward selection is a feature selection algorithm that uses my custom NB error function
    
    Parameters
    ----------
    X : Independent variables (Pandas Dataframe)\n
    y : Dependent variable (Pandas Series, Pandas Dataframe)\n
    
    Returns
    -------
    array of best features
    """

    return __forwardSelectionRaw__(X, y)

In [178]:
def BF_forwardSelection(df):
    
    min_colsize = 30
    min_err = len(df)
    X_train,y_train,X_test,y_test = test_train_split(df,.67)
    cols = X_train.columns.tolist()
    chosen_cols = ''
    
    #the if below allows us to discern between larger feature sets and take some other action
    #besides simply taking all combinations of columns    
 
    for i in range(1,len(df.columns)):
        i_sizecols = combinations(cols,i)
        for j in i_sizecols:
            df_train = X_train[list(j)]
            f,byc,cprob = NB_train(df_train,y_train)
            preds = NB_predict(X_test[list(j)],f,byc,cprob)
            err = NB_error(preds,y_test)
            if err <= min_err:
                print(err,min_err,list(j))
                chosen_cols = j
                min_err = err
    return chosen_cols


In [179]:
BF_forwardSelection(glass_df)

45 214 ['RI']
41 45 ['Na']
41 41 ['Mg']
30 41 ['Al']
30 30 ['RI', 'Al']
30 30 ['Al', 'Ca']
28 30 ['Al', 'Ba']
28 28 ['RI', 'Al', 'Ba']


('RI', 'Al', 'Ba')

In [204]:
def forwardSelection(df):
    
    min_colsize = 30
    min_err = len(df)
    X_train,y_train,X_test,y_test = test_train_split(df,.67)
    cols = X_train.columns.tolist()
    chosen_cols = ''
    
    #the if below allows us to discern between larger feature sets and take some other action
    #besides simply taking all combinations of columns    
 
    for i in range(1,len(df.columns)):

        for j in range(i,len(df.columns)-1):
            df_train = pd.concat([X_train.ix[:,0:i],X_train.ix[:,j]], axis=1)
            f,byc,cprob = NB_train(df_train,y_train)
            df_test = pd.concat([X_test.ix[:,0:i],X_test.ix[:,j]], axis=1)
            preds = NB_predict(df_test,f,byc,cprob)
            err = NB_error(preds,y_test)
            if err <= min_err:
                print(err,min_err,df_train.columns.tolist())
                chosen_cols = df_train.columns.tolist()
                min_err = err
    return chosen_cols

In [205]:
forwardSelection(iris_df)

18 150 ['slength', 'swidth']
9 18 ['slength', 'plength']
4 9 ['slength', 'pwidth']
3 4 ['slength', 'swidth', 'pwidth']
2 3 ['slength', 'swidth', 'plength', 'pwidth']


['slength', 'swidth', 'plength', 'pwidth']

In [189]:
a = X_train.iloc[:,2]
a.head()

0    3.48
1    0.00
2    3.73
3    2.81
4    3.58
Name: Mg, dtype: float64

In [188]:
a = X_train.iloc[:,0:3]
a.head()

Unnamed: 0,RI,Na,Mg
0,1.51824,12.87,3.48
1,1.51651,14.38,0.0
2,1.51911,13.9,3.73
3,1.51778,13.21,2.81
4,1.5159,13.02,3.58


In [186]:
X_train.ix[:,0:1]

Unnamed: 0,RI
0,1.51824
1,1.51651
2,1.51911
3,1.51778
4,1.51590
5,1.51841
6,1.51567
7,1.51844
8,1.51743
9,1.51627


In [104]:
def SFS():
    return

In [12]:
def GAS():
    return

In [13]:
def kMeans():
    return

In [14]:
def HAC():
    return

In [15]:
def calc_class_from_cluster():
    return

In [16]:
def sCoef():
    return

In [17]:
def inter_cluster_distance():
    return

In [18]:
def intra_cluster_distance():
    return

In [19]:
def data_sCoef():
    return

In [49]:
#Glass dataset
glass_df = pd.read_csv('Datasets\glass.data',names = ['id', 'RI','Na','Mg','Al',
                                             'Si','K','Ca','Ba','Fe',
                                             'Class'])
glass_df = glass_df.iloc[:,1:]
glass_df['Class'] = glass_df['Class'].astype("category").cat.codes
X_train,y_train,X_test,y_test = test_train_split(glass_df,.67)

In [74]:
f,bc,cp = NB_train(X_train,y_train)
x = NB_predict(X_test,f,bc,cp)
err = NB_error(x,y_test)

In [22]:
iris_df = pd.read_csv('Datasets\iris.data', names = ['slength','swidth','plength','pwidth','Class'])

#change cateogires as follows:
#Iris-setosa: 0
#Iris-versicolor: 1
#Iris-virginica: 2
iris_df['Class'] = iris_df['Class'].astype("category").cat.codes

In [37]:
t = """word_freq_make
    word_freq_address
    word_freq_all
    word_freq_3d
    word_freq_our
    word_freq_over
    word_freq_remove
    word_freq_internet
    word_freq_order
    word_freq_mail
    word_freq_receive
    word_freq_will
    word_freq_people
    word_freq_report
    word_freq_addresses
    word_freq_free
    word_freq_business
    word_freq_email
    word_freq_you
    word_freq_credit
    word_freq_your
    word_freq_font
    word_freq_000
    word_freq_money
    word_freq_hp
    word_freq_hpl
    word_freq_george
    word_freq_650
    word_freq_lab
    word_freq_labs
    word_freq_telnet
    word_freq_857
    word_freq_data
    word_freq_415
    word_freq_85
    word_freq_technology
    word_freq_1999
    word_freq_parts
    word_freq_pm
    word_freq_direct
    word_freq_cs
    word_freq_meeting
    word_freq_original
    word_freq_project
    word_freq_re
    word_freq_edu
    word_freq_table
    word_freq_conference
    char_freq_;
    char_freq_:
    char_freq_[
    char_freq_!
    char_freq_$
    char_freq_#
    capital_run_length_average
    capital_run_length_longest
    capital_run_length_total"""

In [43]:
names = ''.join(t.splitlines()).split()

In [44]:
spam = pd.read_csv('Datasets\spambase.data',names = names)