In [41]:
import pandas as pd
import numpy as np

# Maybe: Print all of this code into Python 
# code to be used or not used by others
# Probably should keep the function names.
class Neat:
    
    def __init__(self, df, targetY, indexColumns=[]):        
        self.df = df        
        self.targetY = self._cleanColumnName(targetY)        
        self.indexColumns = self._cleanColumnNames(indexColumns)            
        self.targetMappings = {}
        self.numberColumns = []
        self.categoryColumns = []
        self.datetimeColumns = []
        
        self._setTargetMappings()
        self._convertTargetToNumeric()
        
        self._cleanDFColumnNames()        
        self.convertTargetColumnToNumeric()
        self._dropNATargets()
        self._dropDuplicatesIfIndexIsSpecified()    
        self._setIndex()
        
        self._setColumnDataTypes()
        
        self._saveMediansAndBounds()
        self._fixMissingValuesAndInfinity()
        self._setValuesToWithinBounds()
    
    def _cleanColumnNames(self, indexColumns):
        if type(indexColumns) == str:
            indexColumns = [indexColumns]
        arr = []
        for column in indexColumns:
            arr.append(self._cleanColumnName(column))
        return arr
    
    def _cleanColumnName(self, string):
        return string.strip().lower().replace(' ', '_')
    
    def _cleanDFColumnNames(self):
        self.df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')    
    
    def _setTargetMappings(self):
        i = 0        
        for value in self.df[self.targetY].unique():
            if value != None and value.strip() != "":
                self.targetMappings[value] = i
                i = i + 1
        
    def _convertTargetToNumeric(self):
        self.df[self.targetY] = self.df[self.targetY].map(targetMappings)
    
    def _dropDuplicatesIfIndexIsSpecified(self):
        if self.indexColumns != []:
            self.df = self.df.drop_duplicates(subset=self.indexColumns)
        
    def _setIndex(self):
        if self.indexColumns == []:
            self.df = self.df.set_index(np.arange(1,len(self.df.index)+1))
        else:
            self.df = self.df.set_index(self.indexColumns)                    
            
    def _setColumnDataTypes(self):
        columns = self.df.columns.values.tolist()
        for column in columns:    
            if self.df[column].dtype == 'int64' or self.df[column].dtype == 'float64':
                self.numberColumns.append(column)
            elif df[column].dtype == 'object':
                self.categoryColumns.append(column)
            else:
                self.datetimeColumns.append(column) 

    def _dropNATargets(self):    
        rowsToDrop = []
        for i, row in self.df.iterrows():    
            rowsToDrop.append(i) if np.isnan(row[self.targetY]) else None            
        self.df = self.df.drop(self.df.index[rowsToDrop])


        
    def _saveMediansAndBounds(self):        
        firstQuantiles = self.df.quantile(.25)
        thirdQuantiles = self.df.quantile(.75)
        
        self.medians = self.df.quantile(.50)
        self.lowerBounds = {}
        self.upperBounds = {}
        for column in self.numberColumns:            
            self.lowerBounds[column] = self.medians[column] - 2*(self.medians[column] - firstQuantiles[column])
            self.upperBounds[column] = self.medians[column] + 2*(thirdQuantiles[column] - self.medians[column])        
        
    def _fixMissingValuesAndInfinity(self):
        self.df = self.df.fillna(self.medians)
        self._fixInfinityValues()
    
    def _fixInfinityValues(self):
        df.replace([np.inf, -np.inf], np.nan)
        self.df = self.df.fillna(self.upperBounds)
        
    def _setValuesToWithinBounds(self):
        for i, row in self.df.iterrows(): 
            for column in self.numberColumns:
                if row[column] > self.upperBounds[column]:
                    row[column] = self.upperBounds[column]
                if row[column] < self.lowerBounds[column]:
                    print("HI")
                    row[column] = self.lowerBounds[column]
                    

df = pd.DataFrame({'col1': ['a','b','c','d','e','g','z','h','i','t'], 'col2': [None,None,None,-10000,9,10,11,12,13,14]})                
targetY = 'col1'
indexColumns = 'col2'


neat = Neat(df, targetY, indexColumns)




neat.df    
    

    
    


['col2']
         col1
col2         
NaN         a
-10000.0    d
 9.0        e
 10.0       g
 11.0       z
 12.0       h
 13.0       i
 14.0       t


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [104]:
df = pd.DataFrame({'col1': [1,1, 2,3,4,5], 'col2': [3, 3,None, None, None,None]})
df
df.columns.values.tolist()
df.dtypes

    
       
        
print(numberColumns)        
print(categoryColumns)        
print(datetimeColumns)        


['col1', 'col2']
['col1', 'col2']
[]
[]


In [181]:
df = pd.DataFrame({'col1': [1,2,None],'col2': [2,4,None]})
#print(df)


#print(df.quantile(.25))
#print(df.quantile(.5))
#print(df.quantile(.75))
print(df.quantile(.5))

df = df.fillna(df.median())




print("A")
print(df)

col1    1.5
col2    3.0
Name: 0.5, dtype: float64
A
   col1  col2
0   1.0   2.0
1   2.0   4.0
2   1.5   3.0


In [144]:
round(2.50)

2

In [190]:
df.median()

col1     6.0
col2    11.0
dtype: float64

In [206]:
quantile1 = df.quantile(.25)
quantile3 = df.quantile(.75)


medians = df.quantile(.50)
print(quantile1)
print(medians)

# for col in medians:
#     print(col)
    





col1    4.5
col2    9.5
Name: 0.25, dtype: float64
col1     6.0
col2    11.0
Name: 0.5, dtype: float64
10.5
20.5


In [26]:
a = ["a"]
if a == []:
    print("HI")

In [37]:
type(['a'])
type('a')==str

True

In [59]:
df = pd.DataFrame({'col1': [1,1, 2,3,4,5], 'col2': ['a', 'b',None, None, None,None]})

# for cat in ['col2','col1']:
#     print("Levels for catgeory '{0}': {1}".format(cat, df[cat].unique()))

# df['col2']=df['col2'].map({'a':0,'b':1})

# df_class = df['col2'].values
# df_class

# for cat in ['col2']:
#     print(cat)
    




{'a': 0, 'b': 1}
