# 2 - Working with Data
As an experimental physicist, you mostly work with data sets. Data preparation and inspection are among the most important and most time-consuming tasks. An efficient way to deal with them is therefore essential!

This notebook covers the usage of the following packages:
* NumPy
* Pandas

## NumPy
The NumPy package provides very useful array objects. Moreover, it provides tools for linear algebra, Fourier transforms, and random number generation.
It is widely used in the scientific community.

In [1]:
# First we have to import the package
# At the same time, we introduce an abbreviation for it
# Typically, np is used for numpy
import numpy as np

#### Creation of numpy arrays

In [2]:
# Create a numpy array
a = np.array([1,2,3,4,5])
a

array([1, 2, 3, 4, 5])

In [3]:
# Perform some mathematical transformations
a * 2

array([ 2,  4,  6,  8, 10])

In [4]:
# Remember what happened for lists
2 * [1,2,3,4,5]

[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]

In [5]:
# More transformations
a**2

array([ 1,  4,  9, 16, 25])

In [6]:
# numpy also provides more high-level functions
np.sin(a)

array([ 0.84147098,  0.90929743,  0.14112001, -0.7568025 , -0.95892427])

In [7]:
# N-dimensional arrays are also possible. Here 2-dimensional array with 2 rows, 5 columns:
a_2d = np.array([[1,2,3,4,5], [6,7,8,9,10]])
a_2d

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [8]:
# Get the shape (number of rows and columns) of the array
np.shape(a_2d)

(2, 5)

In [9]:
# Get the dimension of the array
np.ndim(a_2d)

2

In [10]:
# Get the number of entries of the array
np.size(a_2d)

10

In [11]:
# Array of zeros of size n
n = 3
np.zeros(n)

array([0., 0., 0.])

In [12]:
# Array of ones of size n
n = 3
np.ones(n)

array([1., 1., 1.])

In [13]:
# 2-dim array of zeros with n rows and m columns
n = 3
m = 2
np.zeros((n,m))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [14]:
# Create an array with n entries, increasing by 1
n = 5
np.arange(n)

array([0, 1, 2, 3, 4])

In [15]:
# Create an array in the range from start to stop, increasing by 1
start = 4.5
stop = 7
np.arange(start, stop)

array([4.5, 5.5, 6.5])

In [16]:
# Create an array in the range from start to stop, increasing by increment
start = 5
stop = 6
increment = 0.1
np.arange(start, stop, increment)

array([5. , 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9])

In [17]:
# Create an array from start to stop with n entries
start = 0
stop = 2
n = 11
np.linspace(start, stop, n)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ])

In [18]:
# Create an array of n entries, logarithmically spaced from 10^start to 10^stop
# (both end points are included)
start = 2
stop = 4
n = 10
np.logspace(start, stop, n)

array([  100.        ,   166.81005372,   278.25594022,   464.15888336,
         774.26368268,  1291.54966501,  2154.43469003,  3593.8136638 ,
        5994.84250319, 10000.        ])

#### Accessing (indexing) numpy arrays

In [19]:
# Let's have another look at our array
a_2d

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [20]:
# Access entries of the array

# Access a row
print('row 1:', a_2d[1])

# Access a column
print('column 1:', a_2d[:,1])

row 1: [ 6  7  8  9 10]
column 1: [2 7]


In [21]:
# Remember the indexing from the previous chapter, same for numpy arrays
a_2d[0,2:]

array([3, 4, 5])

In [22]:
# Alternative possibility
a_2d[0][2:]

array([3, 4, 5])

In [23]:
# Last row, last column
a_2d[-1][-1]

10

In [24]:
# Second last row, second last column
a_2d[-2][-2]

4

In [25]:
# Change value of a specific entry
a_2d[0,0] = 0
a_2d

array([[ 0,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [26]:
# Change value of a specific row
a_2d[0] = 42
a_2d

array([[42, 42, 42, 42, 42],
       [ 6,  7,  8,  9, 10]])

In [27]:
# Transpose the array
print('Shape of array:', a_2d.shape)
print('Shape of transposed array:', a_2d.T.shape)
a_2d.T

Shape of array: (2, 5)
Shape of transposed array: (5, 2)


array([[42,  6],
       [42,  7],
       [42,  8],
       [42,  9],
       [42, 10]])

#### Selection of Sub-Arrays

In [28]:
# Create an array with the integers from 0 to 19 to select from
a = np.arange(0,20)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [29]:
# Selection of sub-arrays is done with so-called masks, containing booleans
mask = a < 2
print('Mask:', mask)
a[mask]

Mask: [ True  True False False False False False False False False False False
 False False False False False False False False]


array([0, 1])

In [30]:
# Mask with multiple conditions - both must be fulfilled (logical AND: &)
mask = (a > 2) & (a < 10)
a[mask]

array([3, 4, 5, 6, 7, 8, 9])

In [31]:
# Mask with multiple conditions - at least one must be fulfilled (logical OR: |)
mask = (a > 17) | (a < 2)
a[mask]

array([ 0,  1, 18, 19])

In [32]:
# Create a second array with the integers from 100 to 119
b = np.arange(100,120)
b

array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119])

#### Useful functions

In [33]:
# Random generator of one random number - uniform distribution between 0 and 1
np.random.random()

0.5197234989521826

In [34]:
# Random generator of a 2dim array - uniform distribution between 0 and 1
np.random.random((2,3))

array([[0.24276049, 0.55566278, 0.03601156],
       [0.64291455, 0.42962356, 0.04593152]])

In [35]:
# Random generator of 20 numbers - normal distribution with mean 0, std 1
r = np.random.normal(0,1,20)
r

array([ 1.22384995,  1.94548694, -0.07398076, -1.01636887,  0.14566567,
       -0.86687913, -0.36590772, -0.95800163,  0.35429152, -0.10758847,
       -1.00751121,  0.84663729, -1.48084824,  1.56266073, -0.22637452,
       -1.51497904,  0.15002476,  0.67561   ,  1.08899415, -2.06173567])

In [36]:
# Sum of array
r.sum()

-1.6869542469650791

In [37]:
# Minimum and Maximum of array
print('Min: ', r.min())
print('Max: ', r.max())

Min:  -2.0617356738750603
Max:  1.9454869376281472


In [38]:
# Cumulative sum
r.cumsum()

array([ 1.22384995,  3.16933689,  3.09535614,  2.07898726,  2.22465294,
        1.35777381,  0.99186609,  0.03386446,  0.38815598,  0.28056751,
       -0.7269437 ,  0.11969359, -1.36115465,  0.20150607, -0.02486845,
       -1.53984749, -1.38982273, -0.71421273,  0.37478143, -1.68695425])

In [39]:
# Random generator of a 2dim array - uniform distribution between 0 and 1
r = np.random.random((2,3))
r

array([[0.74968576, 0.68880807, 0.80395255],
       [0.26763378, 0.27534977, 0.55964197]])

In [40]:
# Sum of Ndim array along different axes (Same for min, max)
print('axis 0', r.sum(axis=0))
print('axis 1', r.sum(axis=1))

axis 0 [1.01731954 0.96415784 1.36359453]
axis 1 [2.24244638 1.10262553]


In [41]:
# Difference between neighbouring entries
x = np.linspace(0,10,11)
print('x:', x)
np.diff(x**2)

x: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]


array([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19.])

In [42]:
# Pi
np.pi

3.141592653589793

In [43]:
# Create some data
a = np.array([[1,2],[3,4]])
b = np.array([['a','b'], ['c','d']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# Shape of vstacked (vertically stacked) arrays is different
print('shape vstack(a,b):', np.vstack((a,b)).shape)

# This is what a vstacked array looks like
np.vstack((a,b))

shape a: (2, 2)
shape b: (2, 2)
shape vstack(a,b): (4, 2)


array([['1', '2'],
       ['3', '4'],
       ['a', 'b'],
       ['c', 'd']], dtype='<U21')

In [44]:
# Change the array a bit
b = np.array([['a','b'], ['c','d'],['e','f']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# And try again
np.vstack((a,b))

shape a: (2, 2)
shape b: (3, 2)


array([['1', '2'],
       ['3', '4'],
       ['a', 'b'],
       ['c', 'd'],
       ['e', 'f']], dtype='<U21')

In [45]:
# Change the array a bit
b = np.array([['a','b','c'], ['d','e','f']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# And try again
np.vstack((a,b))

shape a: (2, 2)
shape b: (2, 3)


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [46]:
# Let's do the same with hstack (horizontally stacked)

# Change the array a bit
b = np.array([['a','b'], ['c','d'],['e','f']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# And try again
np.hstack((a,b))

shape a: (2, 2)
shape b: (3, 2)


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [47]:
# Change the array a bit
b = np.array([['a','b','c'], ['d','e','f']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# And try again
np.hstack((a,b))

shape a: (2, 2)
shape b: (2, 3)


array([['1', '2', 'a', 'b', 'c'],
       ['3', '4', 'd', 'e', 'f']], dtype='<U21')

In [48]:
# Have a look at column_stack

# Change the array a bit
b = np.array([['a','b','c'], ['d','e','f']])

# The shapes of both arrays look like this
print('shape a:', a.shape)
print('shape b:', b.shape)

# And try again
np.column_stack((a,b))

shape a: (2, 2)
shape b: (2, 3)


array([['1', '2', 'a', 'b', 'c'],
       ['3', '4', 'd', 'e', 'f']], dtype='<U21')

In [49]:
c = np.array([1,2,3,4,5,6,7,8])

# Split the array (horizontally) in two arrays
np.hsplit(c,2)

[array([1, 2, 3, 4]), array([5, 6, 7, 8])]

In [50]:
# Split the array (vertically) in two arrays
np.vsplit(a,2)

[array([[1, 2]]), array([[3, 4]])]

#### Reading and writing files

In [51]:
# Let's first create some data
x = np.linspace(0,100,20)
y = np.arange(20)

# Now we save it
np.savetxt('test.txt', [x,y])

# And now we open it again
with open('test.txt', 'r') as f:
    print(f.read())

0.000000000000000000e+00 5.263157894736842479e+00 1.052631578947368496e+01 1.578947368421052744e+01 2.105263157894736992e+01 2.631578947368421240e+01 3.157894736842105488e+01 3.684210526315789735e+01 4.210526315789473983e+01 4.736842105263158231e+01 5.263157894736842479e+01 5.789473684210526727e+01 6.315789473684210975e+01 6.842105263157895934e+01 7.368421052631579471e+01 7.894736842105263008e+01 8.421052631578947967e+01 8.947368421052632925e+01 9.473684210526316463e+01 1.000000000000000000e+02
0.000000000000000000e+00 1.000000000000000000e+00 2.000000000000000000e+00 3.000000000000000000e+00 4.000000000000000000e+00 5.000000000000000000e+00 6.000000000000000000e+00 7.000000000000000000e+00 8.000000000000000000e+00 9.000000000000000000e+00 1.000000000000000000e+01 1.100000000000000000e+01 1.200000000000000000e+01 1.300000000000000000e+01 1.400000000000000000e+01 1.500000000000000000e+01 1.600000000000000000e+01 1.700000000000000000e+01 1.800000000000000000e+01 1.900000000000000000e+01


In [52]:
# Now we save the data differently
np.savetxt('test2.txt', np.column_stack([x,y]))

with open('test2.txt', 'r') as f:
    print(f.read())

0.000000000000000000e+00 0.000000000000000000e+00
5.263157894736842479e+00 1.000000000000000000e+00
1.052631578947368496e+01 2.000000000000000000e+00
1.578947368421052744e+01 3.000000000000000000e+00
2.105263157894736992e+01 4.000000000000000000e+00
2.631578947368421240e+01 5.000000000000000000e+00
3.157894736842105488e+01 6.000000000000000000e+00
3.684210526315789735e+01 7.000000000000000000e+00
4.210526315789473983e+01 8.000000000000000000e+00
4.736842105263158231e+01 9.000000000000000000e+00
5.263157894736842479e+01 1.000000000000000000e+01
5.789473684210526727e+01 1.100000000000000000e+01
6.315789473684210975e+01 1.200000000000000000e+01
6.842105263157895934e+01 1.300000000000000000e+01
7.368421052631579471e+01 1.400000000000000000e+01
7.894736842105263008e+01 1.500000000000000000e+01
8.421052631578947967e+01 1.600000000000000000e+01
8.947368421052632925e+01 1.700000000000000000e+01
9.473684210526316463e+01 1.800000000000000000e+01
1.000000000000000000e+02 1.900000000000000000e+01


In [53]:
# Now we also add a header (column names)
np.savetxt('test3.txt', np.column_stack([x,y]), header='x y')

with open('test3.txt', 'r') as f:
    print(f.read())

# x y
0.000000000000000000e+00 0.000000000000000000e+00
5.263157894736842479e+00 1.000000000000000000e+00
1.052631578947368496e+01 2.000000000000000000e+00
1.578947368421052744e+01 3.000000000000000000e+00
2.105263157894736992e+01 4.000000000000000000e+00
2.631578947368421240e+01 5.000000000000000000e+00
3.157894736842105488e+01 6.000000000000000000e+00
3.684210526315789735e+01 7.000000000000000000e+00
4.210526315789473983e+01 8.000000000000000000e+00
4.736842105263158231e+01 9.000000000000000000e+00
5.263157894736842479e+01 1.000000000000000000e+01
5.789473684210526727e+01 1.100000000000000000e+01
6.315789473684210975e+01 1.200000000000000000e+01
6.842105263157895934e+01 1.300000000000000000e+01
7.368421052631579471e+01 1.400000000000000000e+01
7.894736842105263008e+01 1.500000000000000000e+01
8.421052631578947967e+01 1.600000000000000000e+01
8.947368421052632925e+01 1.700000000000000000e+01
9.473684210526316463e+01 1.800000000000000000e+01
1.000000000000000000e+02 1.900000000000000000e+01

In [54]:
# Read the file in again
a = np.loadtxt('test3.txt')
print('a: \n', a, '\n')
print('a[0]: \n', a[0], '\n')
print('a.T[0]: \n', a.T[0])

a: 
 [[  0.           0.        ]
 [  5.26315789   1.        ]
 [ 10.52631579   2.        ]
 [ 15.78947368   3.        ]
 [ 21.05263158   4.        ]
 [ 26.31578947   5.        ]
 [ 31.57894737   6.        ]
 [ 36.84210526   7.        ]
 [ 42.10526316   8.        ]
 [ 47.36842105   9.        ]
 [ 52.63157895  10.        ]
 [ 57.89473684  11.        ]
 [ 63.15789474  12.        ]
 [ 68.42105263  13.        ]
 [ 73.68421053  14.        ]
 [ 78.94736842  15.        ]
 [ 84.21052632  16.        ]
 [ 89.47368421  17.        ]
 [ 94.73684211  18.        ]
 [100.          19.        ]] 

a[0]: 
 [0. 0.] 

a.T[0]: 
 [  0.           5.26315789  10.52631579  15.78947368  21.05263158
  26.31578947  31.57894737  36.84210526  42.10526316  47.36842105
  52.63157895  57.89473684  63.15789474  68.42105263  73.68421053
  78.94736842  84.21052632  89.47368421  94.73684211 100.        ]


In [55]:
# Alternative reader: with unpack=True the result is transposed,
# so each column of the file can be assigned to its own variable
a, b = np.genfromtxt('test3.txt', unpack=True)
a, b

(array([  0.        ,   5.26315789,  10.52631579,  15.78947368,
         21.05263158,  26.31578947,  31.57894737,  36.84210526,
         42.10526316,  47.36842105,  52.63157895,  57.89473684,
         63.15789474,  68.42105263,  73.68421053,  78.94736842,
         84.21052632,  89.47368421,  94.73684211, 100.        ]),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19.]))

## Pandas
Pandas is based on NumPy. Here, the objects are called *DataFrames*.
It offers nice visual inspections. Moreover, the code is easily readable.

Pandas can also handle the HDF5 format and convert tables to LaTeX.

In [56]:
# Again, we have to import the package first
# At the same time, we introduce an abbreviation for it
# Typically, pd is used for pandas
import pandas as pd

#### Reading

In [57]:
# Let's have a look at the files we created in the numpy section
pd.read_csv('test2.txt')

Unnamed: 0,0.000000000000000000e+00 0.000000000000000000e+00
0,5.263157894736842479e+00 1.000000000000000000e+00
1,1.052631578947368496e+01 2.000000000000000000e+00
2,1.578947368421052744e+01 3.000000000000000000e+00
3,2.105263157894736992e+01 4.000000000000000000e+00
4,2.631578947368421240e+01 5.000000000000000000e+00
5,3.157894736842105488e+01 6.000000000000000000e+00
6,3.684210526315789735e+01 7.000000000000000000e+00
7,4.210526315789473983e+01 8.000000000000000000e+00
8,4.736842105263158231e+01 9.000000000000000000e+00
9,5.263157894736842479e+01 1.000000000000000000e+01


In [58]:
# Something is not OK... it reads each complete line as a single entry instead of splitting it into separate entries
# Let's tell pandas a bit more about the file - e.g. that the separator between the entries is a space
pd.read_csv('test2.txt', sep=' ')

Unnamed: 0,0.000000000000000000e+00,0.000000000000000000e+00.1
0,5.263158,1.0
1,10.526316,2.0
2,15.789474,3.0
3,21.052632,4.0
4,26.315789,5.0
5,31.578947,6.0
6,36.842105,7.0
7,42.105263,8.0
8,47.368421,9.0
9,52.631579,10.0


In [59]:
# OK, almost... it is expecting a header which the file does not have, so we pass the column names ourselves
pd.read_csv('test2.txt', sep=' ', names=['x','y'])

Unnamed: 0,x,y
0,0.0,0.0
1,5.263158,1.0
2,10.526316,2.0
3,15.789474,3.0
4,21.052632,4.0
5,26.315789,5.0
6,31.578947,6.0
7,36.842105,7.0
8,42.105263,8.0
9,47.368421,9.0


In [60]:
# Now read a more complex file
df = pd.read_csv('Data_small.csv')
df

Unnamed: 0,# This is a file containing data from 2 Cherenkov Telescopes
0,# The number specifies the telescope
1,#
2,# Description of the features
3,# Length: Length of Hillas ellipse
4,# Width: Width of Hillas ellipse
5,# Size: Sum of all charges
6,# Hadronness: Indicating particle type
7,#
8,Length1 Width1 Size1 Length2 Width2 Size2 Ener...
9,75.4124 29.4279 1672.8 59.6897 26.7615 2118.77...


In [61]:
# Tell pandas that the entries are separated by spaces and skip the 9 comment lines at the top of the file
df = pd.read_csv('Data_small.csv', sep=' ', skiprows=9)
df[:20]

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
0,75.4124,29.4279,1672.8,59.6897,26.7615,2118.77,891.741,0.009167
1,57.7336,20.8956,760.578,56.0803,31.6496,1160.58,618.704,0.163583
2,18.7674,18.0485,56.1875,17.5254,11.8229,56.8125,78.3127,0.245667
3,62.7512,27.3413,1555.16,60.9768,32.2978,1519.82,699.244,0.055498
4,50.9237,22.7845,622.352,67.5829,28.0278,2022.25,2046.44,0.005833
5,55.0872,26.0323,1554.05,78.1673,34.5042,7360.0,6758.78,0.0
6,63.5209,25.0671,808.055,50.8012,23.7339,502.906,1851.45,0.039286
7,73.2049,33.0173,12674.7,51.4093,32.8175,3265.68,27952.1,0.016333
8,83.9538,29.1,3379.82,66.9842,29.0803,1957.11,5800.27,0.0
9,85.886,31.2477,10361.3,63.8771,31.4983,1891.22,7402.94,0.001667


#### Accessing (indexing) pandas dataframes

In [62]:
# First, let's shorten the table a bit and copy it to another variable
d = df[0:5].copy()
d

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
0,75.4124,29.4279,1672.8,59.6897,26.7615,2118.77,891.741,0.009167
1,57.7336,20.8956,760.578,56.0803,31.6496,1160.58,618.704,0.163583
2,18.7674,18.0485,56.1875,17.5254,11.8229,56.8125,78.3127,0.245667
3,62.7512,27.3413,1555.16,60.9768,32.2978,1519.82,699.244,0.055498
4,50.9237,22.7845,622.352,67.5829,28.0278,2022.25,2046.44,0.005833


In [63]:
# Access of columns
d.Length1

0    75.4124
1    57.7336
2    18.7674
3    62.7512
4    50.9237
Name: Length1, dtype: float64

In [64]:
# Alternative access
d['Length1']

0    75.4124
1    57.7336
2    18.7674
3    62.7512
4    50.9237
Name: Length1, dtype: float64

In [65]:
# Access of a row
d[0:1]

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
0,75.4124,29.4279,1672.8,59.6897,26.7615,2118.77,891.741,0.009167


In [66]:
# Access of a subset - same as before
d[2:4]

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
2,18.7674,18.0485,56.1875,17.5254,11.8229,56.8125,78.3127,0.245667
3,62.7512,27.3413,1555.16,60.9768,32.2978,1519.82,699.244,0.055498


In [67]:
# Mask with one condition
d[d.Length1 < 60]

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
1,57.7336,20.8956,760.578,56.0803,31.6496,1160.58,618.704,0.163583
2,18.7674,18.0485,56.1875,17.5254,11.8229,56.8125,78.3127,0.245667
4,50.9237,22.7845,622.352,67.5829,28.0278,2022.25,2046.44,0.005833


In [68]:
# Mask with multiple conditions
d[(d.Length1 < 60) & (d.Width1 > 20)]

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
1,57.7336,20.8956,760.578,56.0803,31.6496,1160.58,618.704,0.163583
4,50.9237,22.7845,622.352,67.5829,28.0278,2022.25,2046.44,0.005833


#### Useful functions

In [69]:
# Have a look at the first 5 entries
df.head(5)

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
0,75.4124,29.4279,1672.8,59.6897,26.7615,2118.77,891.741,0.009167
1,57.7336,20.8956,760.578,56.0803,31.6496,1160.58,618.704,0.163583
2,18.7674,18.0485,56.1875,17.5254,11.8229,56.8125,78.3127,0.245667
3,62.7512,27.3413,1555.16,60.9768,32.2978,1519.82,699.244,0.055498
4,50.9237,22.7845,622.352,67.5829,28.0278,2022.25,2046.44,0.005833


In [70]:
# Get some statistics from the data
df.describe()

Unnamed: 0,Length1,Width1,Size1,Length2,Width2,Size2,Energy,Hadronness
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,54.176255,22.81213,1646.708297,54.224245,24.654722,1868.007092,1671.549362,0.142014
std,20.908341,6.793258,3985.511581,19.996187,8.701513,5345.582758,3823.248354,0.185032
min,11.9245,10.9139,32.0703,17.5254,11.0144,38.7422,38.4349,0.0
25%,39.066,18.0657,158.777,37.10195,18.6734,149.844,153.7445,0.038809
50%,54.3526,21.6903,458.938,56.6345,23.5644,485.367,500.48,0.085083
75%,67.8478,26.06565,1397.465,66.6324,28.8085,1515.375,1516.105,0.168047
max,101.04,47.4503,30939.3,105.204,68.4395,47242.8,27952.1,0.946246


In [71]:
# Sum of each column, restricted to rows 3 and 4
df[3:5].sum()

Length1        113.674900
Width1          50.125800
Size1         2177.512000
Length2        128.559700
Width2          60.325600
Size2         3542.070000
Energy        2745.684000
Hadronness       0.061331
dtype: float64

In [72]:
# Mean of each column, restricted to rows 3 and 4
df[3:5].mean()

Length1         56.837450
Width1          25.062900
Size1         1088.756000
Length2         64.279850
Width2          30.162800
Size2         1771.035000
Energy        1372.842000
Hadronness       0.030666
dtype: float64

In [73]:
# Number of entries (excluding NaNs) in each column
df.count()

Length1       91
Width1        91
Size1         91
Length2       91
Width2        91
Size2         91
Energy        91
Hadronness    91
dtype: int64

In [74]:
# Minimum for each column - pandas function
d.min()

Length1       18.767400
Width1        18.048500
Size1         56.187500
Length2       17.525400
Width2        11.822900
Size2         56.812500
Energy        78.312700
Hadronness     0.005833
dtype: float64

In [75]:
# Minimum for each column - numpy function
np.min(d)

Length1       18.767400
Width1        18.048500
Size1         56.187500
Length2       17.525400
Width2        11.822900
Size2         56.812500
Energy        78.312700
Hadronness     0.005833
dtype: float64

In [76]:
# Minimum for each _column_ (axis=0)
np.min(d, axis=0)

Length1       18.767400
Width1        18.048500
Size1         56.187500
Length2       17.525400
Width2        11.822900
Size2         56.812500
Energy        78.312700
Hadronness     0.005833
dtype: float64

In [77]:
# Minimum for each _row_ (axis=1)
np.min(d, axis=1)

0    0.009167
1    0.163583
2    0.245667
3    0.055498
4    0.005833
dtype: float64

In [78]:
# Also the pandas function has this functionality
d.min(axis=1)

0    0.009167
1    0.163583
2    0.245667
3    0.055498
4    0.005833
dtype: float64

In [79]:
# In case we have NaN (Not a Number) values in our data, this function is very handy
np.nanmin(df, axis=0)

array([11.9245, 10.9139, 32.0703, 17.5254, 11.0144, 38.7422, 38.4349,
        0.    ])

In [80]:
# The same along axis=1: minimum of each row, ignoring NaNs
np.nanmin(df, axis=1)

array([0.00916667, 0.163583  , 0.245667  , 0.0554978 , 0.00583333,
       0.        , 0.0392857 , 0.0163333 , 0.        , 0.00166667,
       0.946246  , 0.0180909 , 0.373571  , 0.384298  , 0.106333  ,
       0.937333  , 0.0186071 , 0.0183333 , 0.0472222 , 0.0872619 ,
       0.224726  , 0.116889  , 0.0673333 , 0.17225   , 0.044     ,
       0.151798  , 0.0696111 , 0.0692778 , 0.0219167 , 0.293894  ,
       0.0363571 , 0.134667  , 0.492183  , 0.0498333 , 0.0770833 ,
       0.105215  , 0.0430794 , 0.127016  , 0.109992  , 0.0874286 ,
       0.0677381 , 0.106813  , 0.188274  , 0.189373  , 0.114044  ,
       0.297012  , 0.0443333 , 0.0448333 , 0.133982  , 0.0383333 ,
       0.293726  , 0.        , 0.924464  , 0.07025   , 0.102123  ,
       0.164095  , 0.182988  , 0.        , 0.10419   , 0.0780043 ,
       0.00392857, 0.110643  , 0.549095  , 0.138444  , 0.262869  ,
       0.002     , 0.149722  , 0.267023  , 0.176969  , 0.0303333 ,
       0.        , 0.0558413 , 0.150857  , 0.066     , 0.05255

In [81]:
# You can also calculate percentiles
np.percentile(df, 68)

97.937228

In [82]:
# There's also a percentile function that can ignore NaNs
np.nanpercentile(df, 68)

97.937228

#### Writing

In [83]:
# Dump to file
d.to_csv('test_pandas.csv', index=False)

In [84]:
# Convert table to latex
print(d.to_latex())

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  Length1 &   Width1 &      Size1 &  Length2 &   Width2 &      Size2 &     Energy &  Hadronness \\
\midrule
0 &  75.4124 &  29.4279 &  1672.8000 &  59.6897 &  26.7615 &  2118.7700 &   891.7410 &    0.009167 \\
1 &  57.7336 &  20.8956 &   760.5780 &  56.0803 &  31.6496 &  1160.5800 &   618.7040 &    0.163583 \\
2 &  18.7674 &  18.0485 &    56.1875 &  17.5254 &  11.8229 &    56.8125 &    78.3127 &    0.245667 \\
3 &  62.7512 &  27.3413 &  1555.1600 &  60.9768 &  32.2978 &  1519.8200 &   699.2440 &    0.055498 \\
4 &  50.9237 &  22.7845 &   622.3520 &  67.5829 &  28.0278 &  2022.2500 &  2046.4400 &    0.005833 \\
\bottomrule
\end{tabular}

