In [87]:
import pandas as pd
import numpy as np
import random

# Maybe: Print all of this code into Python 
# code to be used or not used by others
# Probably should keep the function names.
class Neat:
    """Clean a pandas DataFrame in one pass.

    Constructing the object runs the whole pipeline on a private copy of
    the given frame; the cleaned result is available as ``self.df``.

    Steps, in order:
      1. Column names are stripped, lower-cased and spaces become ``_``.
      2. A text target column is encoded as integers (``targetMappings``)
         and rows without a target value are dropped.
      3. Remaining columns are sorted into number / category / other
         (``numberColumns``, ``categoryColumns``, ``datetimeColumns``).
      4. If index columns are given, duplicate and missing index rows are
         dropped and those columns become the index; otherwise the rows
         are numbered from 1.
      5. Number columns: missing values become the column median and all
         values are limited to ``[lowerBounds, upperBounds]``.
      6. Category columns: missing values become one of the column's
         modes (picked at random when several are tied).

    Parameters:
        df: the pandas DataFrame to clean. It is not modified.
        targetY: name of the target column (cleaned like the columns).
        indexColumns: a column name, a list of column names, or
            None / empty for "no index columns".
    """

    def __init__(self, df, targetY, indexColumns=None):
        # Work on a copy so the caller's frame keeps its names and values.
        self.df = df.copy()
        self.targetY = self._cleanColumnName(targetY)
        self.indexColumns = self._cleanColumnNamesArray(indexColumns)
        self.targetMappings = {}
        self.numberColumns = []
        self.categoryColumns = []
        self.datetimeColumns = []
        self.medians = pd.Series(dtype='float64')
        self.lowerBounds = {}
        self.upperBounds = {}
        self.modes = pd.DataFrame()
        self.uniqueCategoryValues = {}
        # Column names must be cleaned first: targetY and indexColumns are
        # already stored in cleaned form and are used for lookups below.
        self._cleanColumnNamesDF()
        # TargetY
        self._setTargetMappings()
        self._convertTargetToNumeric()
        self._dropNATargetRows()
        # Column Metadata
        self._setColumnDataTypes()
        # Index
        self._dropDuplicatesAndMissingRowsIfIndexIsSpecified()
        self._addIndex()
        # Numbers
        self._saveMediansAndBounds()
        self._fixMissingNumValuesAndInfinity()
        self._setAllValuesWithinBounds()
        # Categories
        self._saveModes()
        self._fixMissingCatValues()
        self._saveUniqueCategoryValues()

    def _cleanColumnNamesArray(self, indexColumns):
        """Return the cleaned index column names as a list.

        Accepts None (no index columns), a single name or any iterable
        of names.
        """
        if indexColumns is None:
            return []
        if isinstance(indexColumns, str):
            indexColumns = [indexColumns]
        return [self._cleanColumnName(column) for column in indexColumns]

    def _cleanColumnName(self, string):
        """Strip, lower-case and replace spaces by underscores."""
        return string.strip().lower().replace(' ', '_')

    def _isNumber(self, series):
        """True for integer / float columns (booleans are excluded)."""
        dtype = series.dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def _isText(self, series):
        """True for object or string typed columns."""
        dtype = series.dtype
        return (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype))

    def _setTargetMappings(self):
        """Assign 0, 1, 2, ... to the distinct target labels.

        Only applies to a text target. Missing values and blank strings
        get no code, so they turn into NaN when the target is converted.
        """
        if not self._isText(self.df[self.targetY]):
            return
        for value in self.df[self.targetY].unique():
            if pd.isna(value):
                continue
            if isinstance(value, str) and value.strip() == "":
                continue
            self.targetMappings[value] = len(self.targetMappings)

    def _convertTargetToNumeric(self):
        """Replace a text target by its integer codes."""
        if self._isText(self.df[self.targetY]):
            self.df[self.targetY] = self.df[self.targetY].map(self.targetMappings)

    def _dropNATargetRows(self):
        """Drop every row whose target value is missing."""
        # A positional boolean mask is used so the result does not depend
        # on what the row labels look like.
        hasTarget = self.df[self.targetY].notna().to_numpy()
        self.df = self.df[hasTarget]

    def _cleanColumnNamesDF(self):
        """Apply the column-name cleaning to every column of the frame."""
        self.df = self.df.rename(columns=self._cleanColumnName)

    def _setColumnDataTypes(self):
        """Sort the non-target, non-index columns into the three lists."""
        for column in self.df.columns.values.tolist():
            if column == self.targetY or column in self.indexColumns:
                continue
            if self._isNumber(self.df[column]):
                self.numberColumns.append(column)
            elif self._isText(self.df[column]):
                self.categoryColumns.append(column)
            else:
                # Everything else is filed under datetime columns.
                self.datetimeColumns.append(column)

    def _dropDuplicatesAndMissingRowsIfIndexIsSpecified(self):
        """Make the index columns usable as an index.

        Rows repeating an earlier index value are dropped (first one
        wins), then rows whose index value is missing or infinite.
        """
        if not self.indexColumns:
            return
        self.df = self.df.drop_duplicates(subset=self.indexColumns)
        keep = np.ones(len(self.df), dtype=bool)
        for column in self.indexColumns:
            values = self.df[column]
            keep &= ~values.isna().to_numpy()
            if pd.api.types.is_numeric_dtype(values.dtype):
                asFloat = values.to_numpy(dtype='float64', na_value=np.nan)
                keep &= ~np.isinf(asFloat)
        self.df = self.df[keep]

    def _addIndex(self):
        """Index by the index columns, or by 1..n when none were given."""
        if not self.indexColumns:
            self.df = self.df.set_index(np.arange(1, len(self.df.index) + 1))
        else:
            self.df = self.df.set_index(self.indexColumns)

    def _saveMediansAndBounds(self):
        """Store median and lower / upper bound of every number column.

        lower = median - 2 * (median - first quartile)
        upper = median + 2 * (third quartile - median)
        """
        self.lowerBounds = {}
        self.upperBounds = {}
        if not self.numberColumns:
            self.medians = pd.Series(dtype='float64')
            return
        # Only number columns are passed to quantile (text columns make it
        # fail on current pandas), and infinities are ignored so that the
        # statistics stay finite.
        finite = self.df[self.numberColumns].replace([np.inf, -np.inf], np.nan)
        firstQuantiles = finite.quantile(.25)
        thirdQuantiles = finite.quantile(.75)
        self.medians = finite.quantile(.50)
        for column in self.numberColumns:
            median = self.medians[column]
            self.lowerBounds[column] = median - 2 * (median - firstQuantiles[column])
            self.upperBounds[column] = median + 2 * (thirdQuantiles[column] - median)

    def _fixMissingNumValuesAndInfinity(self):
        """Fill missing numbers with the median and infinities with a bound.

        +inf becomes the column's upper bound, -inf its lower bound.
        """
        for column in self.numberColumns:
            values = self.df[column]
            median = self.medians[column]
            if not pd.isna(median):
                values = values.fillna(median)
            values = values.replace(np.inf, self.upperBounds[column])
            values = values.replace(-np.inf, self.lowerBounds[column])
            self.df[column] = values

    def _setAllValuesWithinBounds(self):
        """Limit every number column to its [lower, upper] bound."""
        for column in self.numberColumns:
            lower = self.lowerBounds[column]
            upper = self.upperBounds[column]
            # A bound is NaN when the column had no values at all; such a
            # bound is simply not applied.
            self.df[column] = self.df[column].clip(
                lower=None if pd.isna(lower) else lower,
                upper=None if pd.isna(upper) else upper,
            )

    def _saveModes(self):
        """Store the modes of all columns (one row per tied mode)."""
        self.modes = self.df.mode()

    def _fixMissingCatValues(self):
        """Fill missing category values with a mode of the column.

        When several values are tied, each missing cell gets one of them
        at random.
        """
        for column in self.categoryColumns:
            # mode() pads shorter columns with NaN; those are not candidates.
            candidates = self.modes[column].dropna().tolist()
            if not candidates:
                continue
            missing = self.df[column].isna().to_numpy()
            count = int(missing.sum())
            if count:
                self.df.loc[missing, column] = [
                    random.choice(candidates) for _ in range(count)
                ]

    def _saveUniqueCategoryValues(self):
        """Store the distinct values of every category column."""
        for column in self.categoryColumns:
            self.uniqueCategoryValues[column] = self.df[column].unique()
                            
                    
# Smoke test for Neat on a small frame: col1 holds text labels and is used as
# the target, col2 and col4 are numeric with missing entries, col3 and col5
# are text columns with missing entries.
df = pd.DataFrame({'col1': ['a','b','c','d','e','g','z','z','i','t'], 'col2': [None,None,None,9,5,10,11,12,13,14]
                  , 'col3': ['test1','test1','test1','test99',None,None,'test98','test2','test2','test3']
                  , 'col4': [None, 5, 3 ,6 ,8, 10, 14, 87, 999 ,9999]
                  , 'col5': ['test1','test1','test1','test99',None,None,'test98','test2','test2','test3']})                
targetY = 'col1'
# Alternative input form: a single column name given as a plain string.
#indexColumns = 'col2'
indexColumns = ['col4']

neat = Neat(df, targetY, indexColumns)

# Last expression of the cell: the notebook displays the cleaned frame.
neat.df    
    

    
    


['test1' 'test99' 'test2' 'test98' 'test3']


Unnamed: 0_level_0,col1,col2,col3,col5
col4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.0,1,11.0,test1,test1
3.0,2,11.0,test1,test1
6.0,3,9.0,test99,test99
8.0,4,8.0,test2,test1
10.0,5,10.0,test2,test2
14.0,6,11.0,test98,test98
87.0,6,12.0,test2,test2
999.0,7,13.0,test2,test2
9999.0,8,14.0,test3,test3


In [104]:
# Scratch cell: build a two-column frame where col2 is mostly missing and
# look at its column names and dtypes. Only the last bare expression of a
# notebook cell is displayed, so `df` and the column list are not shown.
df = pd.DataFrame({'col1': [1,1, 2,3,4,5], 'col2': [3, 3,None, None, None,None]})
df
df.columns.values.tolist()
df.dtypes

    
       
        
# NOTE(review): numberColumns, categoryColumns and datetimeColumns are not
# assigned anywhere in this cell; presumably they were left over from
# earlier notebook state - confirm before re-running.
print(numberColumns)        
print(categoryColumns)        
print(datetimeColumns)        


['col1', 'col2']
['col1', 'col2']
[]
[]


In [181]:
# Scratch check of median imputation: the last row of this frame is entirely
# missing and should end up holding each column's median.
df = pd.DataFrame({'col1': [1, 2, None], 'col2': [2, 4, None]})

# Show the 0.5 quantile of every column before anything is filled in.
halfQuantile = df.quantile(.5)
print(halfQuantile)

# Replace the missing entries by the per-column median.
columnMedians = df.median()
df = df.fillna(columnMedians)

print("A")
print(df)

col1    1.5
col2    3.0
Name: 0.5, dtype: float64
A
   col1  col2
0   1.0   2.0
1   2.0   4.0
2   1.5   3.0


In [144]:
# Python 3's round() sends an exact half to the nearest even integer, so this
# evaluates to 2 rather than 3.
round(2.50)

2

In [190]:
# Column-wise medians of whatever frame `df` currently refers to in the
# notebook session (it is not assigned in this cell).
df.median()

col1     6.0
col2    11.0
dtype: float64

In [206]:
# First and third quartile and the median of every column of the session's
# current `df` (assigned outside this cell). Each result is a Series indexed
# by column name.
quantile1 = df.quantile(.25)
quantile3 = df.quantile(.75)  # computed but not printed below


medians = df.quantile(.50)
print(quantile1)
print(medians)

# for col in medians:
#     print(col)
    





col1    4.5
col2    9.5
Name: 0.25, dtype: float64
col1     6.0
col2    11.0
Name: 0.5, dtype: float64
10.5
20.5


In [26]:
# Sanity check of the `== []` emptiness test: the list is not empty, so the
# comparison is False and nothing is printed.
a = ["a"]
if a == []:
    print("HI")

In [37]:
# Check how to tell a single string from a list of strings. Only the value
# of the last expression (True) is displayed by the notebook.
type(['a'])
type('a')==str

True

In [65]:
# Prototype of the target-label encoding: every distinct, non-blank string in
# col2 gets an integer code in order of first appearance, and the column is
# then replaced by those codes. Values without a code become NaN.
df = pd.DataFrame({'col1': [1, 1, 2, 3, 4, 5], 'col2': ['a', 'b', None, None, None, None]})

# for cat in ['col2','col1']:
#     print("Levels for category '{0}': {1}".format(cat, df[cat].unique()))

# df['col2']=df['col2'].map({'a':0,'b':1})

# df_class = df['col2'].values
# df_class

# for cat in ['col2']:
#     print(cat)

targetMappings = {}
for value in df['col2'].unique():
    # A missing entry can come back as None or as NaN depending on the
    # column's dtype; checking for a real string covers both and avoids
    # calling .strip() on a float.
    if isinstance(value, str) and value.strip() != "":
        # The next free code is simply the number of labels seen so far.
        targetMappings[value] = len(targetMappings)
print(targetMappings)


df['col2'] = df['col2'].map(targetMappings)
df


{'a': 0, 'b': 1}


Unnamed: 0,col1,col2
0,1,0.0
1,1,1.0
2,2,
3,3,
4,4,
5,5,


In [67]:
# Keep only the rows whose col2 has a value. dropna looks at the column
# itself, so the result is right for any row index - collecting row labels
# and then using them as positions in df.index only works while the labels
# happen to be 0..n-1.
df = df.dropna(subset=['col2'])
df

Unnamed: 0,col1,col2
0,1,0.0
1,1,1.0
