## Data type detection

In [6]:
from datetime import datetime

import numpy as np

# Candidate converters, tried in order; the first one that accepts the value
# wins. The date pattern comes first, and int is tried before float so that
# a string such as '16' becomes an int rather than a float.
heuristics = [lambda value: datetime.strptime(value, "%Y-%m-%d"),
              int, float]

def convert(value):
    """Convert a string to the first type in ``heuristics`` that accepts it.

    Parameters
    ----------
    value : str
        Raw text to interpret. Anything that is not a ``str`` is returned
        unchanged, because the heuristics only make sense for text (and
        ``int`` would otherwise silently truncate a float).

    Returns
    -------
    object
        The result of the first converter that does not raise ``ValueError``,
        or ``value`` itself when every converter rejects it.
    """
    if not isinstance(value, str):
        return value
    for converter in heuristics:
        try:
            return converter(value)
        except ValueError:
            # This converter does not understand the text; try the next one.
            continue
    # Every heuristic failed, so the value stays a plain string.
    return value

# One sample per outcome: float, date, int, and text that stays a string.
values = ['3.14159265', '2010-01-20', '16', 'some words']

# Show each converted value next to the type that was detected for it.
for converted_value in map(convert, values):
    print(converted_value, type(converted_value))

3.14159265 <class 'float'>
2010-01-20 00:00:00 <class 'datetime.datetime'>
16 <class 'int'>
some words <class 'str'>


### type() and isinstance() functions

In [140]:
# type() reports the class of a value; the literal 5 is an int.
print(type(5))

<class 'int'>


In [141]:
# isinstance() checks whether a value is an instance of the given class.
isinstance( 5, int)

True

In [142]:
# float() accepts surrounding whitespace (here leading spaces and a newline).
float("   -12345\n")

-12345.0

In [143]:
 int(2.0)  # converts the float 2.0 to the integer 2

2

In [144]:
from datetime import datetime
text = '2012-09-20'
# '%Y-%m-%d' describes the layout: four-digit year, month, day, joined by
# dashes. The resulting datetime has its time part set to midnight.
datetime.strptime(text, '%Y-%m-%d')

datetime.datetime(2012, 9, 20, 0, 0)

In [145]:
from dateutil.parser import parse
# With fuzzy_with_tokens=True the result is a tuple: the datetime found in
# the sentence, followed by the pieces of text that were not part of it.
parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)

(datetime.datetime(2047, 1, 1, 8, 21), ('Today is ', ' ', ' ', 'at '))

### String manipulation

In [146]:
word="Data Processing"
# lower() returns a new string and leaves word untouched; its result is
# discarded here because a cell only displays its last expression.
word.lower()
word.upper()

'DATA PROCESSING'

In [147]:
# Number of times the character 'a' occurs in word.
print(word.count('a'))

2


In [148]:
# Position of the first "D" in word (positions start at 0).
print(word.find("D") ) 

0


In [149]:
# Position at which the substring "Pro" starts in word.
print( word.index("Pro") )

5


In [150]:
word=" test "
print(word)
# strip() removes the leading and trailing spaces.
print(word.strip())

 test 
test


In [151]:
word="Data Processing"
# The slice [0:4] takes the first four characters.
print(word[0:4])

Data


In [152]:
word.split(' ')  # Split on each single space (split() with no argument would split on any run of whitespace)

['Data', 'Processing']

In [153]:
# capitalize() upper-cases the first character and lower-cases the rest,
# so "Processing" comes out as "processing".
print(word.capitalize())

Data processing


In [154]:
# Intentionally failing example: float() does not accept a comma as the
# decimal separator and raises ValueError.
float( "100,50")

ValueError: could not convert string to float: '100,50'

In [155]:
# String equality is case-sensitive, so these two spellings differ.
"Wien" == "wien"

False

In [156]:
word="DE1001V,AT"
# Keep as many leading characters as the prefix "DE1001V" is long.
print(word[0:len("DE1001V")])

DE1001V


In [157]:
## Outlier detection

In [68]:
# Sample data: mostly small values plus a few much larger ones (12, 52, 90).
l=[1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90]

In [132]:
def doubleMADsfromMedian(y,thresh=3.5):
    """Flag outliers with a modified z-score that uses one MAD per side of the median.

    The median absolute deviation (MAD) is computed separately for the
    points at or below the median and for the points at or above it, so the
    two tails of a skewed sample are each judged on their own spread.

    Caveats carried over from the original implementation: missing values
    are not checked for, and the case where more than half of the data share
    the same value (a MAD of zero) is not handled.

    Parameters
    ----------
    y : sequence of numbers
        The observations.
    thresh : float, optional
        Score above which a point counts as an outlier (default 3.5).

    Returns
    -------
    Boolean array, True where the score exceeds ``thresh``.
    """
    center = np.median(y)
    deviation = np.abs(y - center)
    mad_below = np.median(deviation[y <= center])
    mad_above = np.median(deviation[y >= center])
    # Points above the median are scaled by the upper MAD, all others by the
    # lower MAD.
    scale = np.where(y > center, mad_above, mad_below)
    # NOTE(review): 0.6745 is presumably the usual modified z-score scaling
    # constant - confirm against the method's reference.
    score = 0.6745 * deviation / scale
    # Points exactly on the median have zero deviation; force their score to
    # 0 so a 0/0 division cannot leave a NaN behind.
    score[y == center] = 0
    return score > thresh

In [133]:
# On the sample data only the two largest values (52 and 90) are flagged.
print(doubleMADsfromMedian(l))

[False False False False False False False False False False False False
 False False False False False False  True  True]


In [86]:
def percentile_based_outlier(data, threshold=95):
    """Flag values that fall outside the central ``threshold`` percent of the data.

    The remaining ``100 - threshold`` percent is split evenly between both
    tails; with the default of 95 that marks everything below the 2.5th
    percentile or above the 97.5th percentile.

    Parameters
    ----------
    data : sequence of numbers
        The observations.
    threshold : float, optional
        Width of the central band, in percent (default 95).

    Returns
    -------
    Boolean array, True for values outside the band.
    """
    tail = (100 - threshold) / 2.0
    lower, upper = np.percentile(data, [tail, 100 - tail])
    return (data > upper) | (data < lower)

In [87]:
# On the sample data the smallest and the largest value (1 and 90) are flagged.
print(percentile_based_outlier(l))

[ True False False False False False False False False False False False
 False False False False False False False  True]


In [104]:
def xAwayFromStddev(points, x=2):
    """Flag values lying more than ``x`` standard deviations from the mean.

    Parameters
    ----------
    points : sequence of numbers
        The observations.
    x : float, optional
        Allowed distance from the mean, in standard deviations (default 2).

    Returns
    -------
    Boolean array, True where the distance from the mean exceeds the limit.
    """
    center = np.mean(points)
    limit = x * np.std(points)
    return np.abs(points - center) > limit

In [106]:
# On the sample data only the largest value (90) is flagged.
print(xAwayFromStddev(l))

[False False False False False False False False False False False False
 False False False False False False False  True]


In [136]:
def interQuantil(points, distance=3):
    """Flag values lying far outside the interquartile range.

    A value is an outlier when it is more than ``distance`` times the
    interquartile range below the 25th percentile or above the 75th
    percentile.

    Parameters
    ----------
    points : sequence of numbers
        The observations.
    distance : float, optional
        Multiple of the interquartile range used for both fences (default 3).

    Returns
    -------
    Boolean array, True for values beyond either fence.
    """
    lower_quartile, upper_quartile = np.percentile(points, [25, 75])
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - distance * spread
    high_fence = upper_quartile + distance * spread
    return (points > high_fence) | (points < low_fence)

In [137]:
# With the default distance of 3, the values 52 and 90 are flagged.
print(interQuantil(l))

[False False False False False False False False False False False False
 False False False False False False  True  True]


In [138]:
# A narrower fence of 1.5 times the interquartile range flags the same two
# values on this sample.
print(interQuantil(l, distance=1.5))

[False False False False False False False False False False False False
 False False False False False False  True  True]


## Transposing lists of lists and CSV to JSON

In [139]:
a = [['city','state','population'],['Wien','Wien','1852997'],['Salzburg','Salzburg','150887'],['Innsbruck','Tyrol','130,894']]
# zip(*a) groups the i-th entry of every row together, i.e. it yields the
# columns; each column tuple is turned back into a list.
transposed = [list(column) for column in zip(*a)]

print(transposed)


[['city', 'Wien', 'Salzburg', 'Innsbruck'], ['state', 'Wien', 'Salzburg', 'Tyrol'], ['population', '1852997', '150887', '130,894']]


In [128]:
atcities = [['city','state','population'],['Wien','Wien','1852997'],['Salzburg','Salzburg','150887'],['Innsbruck','Tyrol','130,894']]

# The first row holds the column names; every following row becomes one
# dict that maps column name to that row's value.
header = atcities[0]
atCitiesDict = [dict(zip(header, city)) for city in atcities[1:]]
print(atCitiesDict)

[{'city': 'Wien', 'population': '1852997', 'state': 'Wien'}, {'city': 'Salzburg', 'population': '150887', 'state': 'Salzburg'}, {'city': 'Innsbruck', 'population': '130,894', 'state': 'Tyrol'}]


In [130]:
# Column-oriented view of the table: one list of values per column name.
header = atcities[0]
atCitiesDict = {h: [] for h in header}

# Append each row's value to the list of the column it belongs to.
for city in atcities[1:]:
    for i, h in enumerate(header):
        atCitiesDict[h].append(city[i])
print(atCitiesDict)

{'city': ['Wien', 'Salzburg', 'Innsbruck'], 'population': ['1852997', '150887', '130,894'], 'state': ['Wien', 'Salzburg', 'Tyrol']}
