# Outline of data analysis with Pandas:
***


1. Importing data (from FIJI results table or any other source)
*** 
2. Previewing Data
    * Head and tail
    * Data validation
    * Statistics of data set
        * *Min, max, standard deviation, etc.*
***    
3. Working with Data
    * loc vs iloc
    * Selecting columns
        * *Statistics on column*
    * Adding data to data frame
    * Performing operations on a column
***
4. Quick plotting for data preview
***

### Importing data
Python has built in functions for importing csv files, which we can use here to read the data in as lists
***

In [1]:
# Read the csv file the using Python built in functions
import csv
areas_list = []  #initialize an empty list to put the row numbers in
with open('Results.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        #print(row)
        #print(row[0])
        #print(row[0],row[1],row[2],)
        areas_list.append(row[1])

In [2]:
#look at just the areas
areas_list[1:-1]


['0.067',
 '0.075',
 '0.062',
 '0.064',
 '0.079',
 '0.072',
 '0.067',
 '0.075',
 '0.025',
 '0.084',
 '0.129',
 '0.037',
 '0.046',
 '0.067',
 '0.079',
 '0.097',
 '0.121',
 '0.061',
 '0.100',
 '0.067',
 '0.067',
 '0.075',
 '0.062',
 '0.064',
 '0.079',
 '0.072',
 '0.067',
 '0.075',
 '0.025',
 '0.084',
 '0.129',
 '0.037',
 '0.046',
 '0.067',
 '0.079',
 '0.097',
 '0.121',
 '0.061',
 '0.100',
 '0.067',
 '0.067',
 '0.075',
 '0.062',
 '0.064',
 '0.079',
 '0.072',
 '0.067',
 '0.075',
 '0.025',
 '0.084',
 '0.129',
 '0.037',
 '0.046',
 '0.067',
 '0.079',
 '0.097',
 '0.121',
 '0.061',
 '0.100',
 '0.067',
 '0.067',
 '0.075',
 '0.062',
 '0.064',
 '0.079',
 '0.072',
 '0.067',
 '0.075',
 '0.025',
 '0.084',
 '0.129',
 '0.037',
 '0.046',
 '0.067',
 '0.079',
 '0.097',
 '0.121',
 '0.061',
 '0.100',
 '0.067',
 '0.067',
 '0.075',
 '0.062',
 '0.064',
 '0.079',
 '0.072',
 '0.067',
 '0.075',
 '0.025',
 '0.084',
 '0.129',
 '0.037',
 '0.046',
 '0.067',
 '0.079',
 '0.097',
 '0.121',
 '0.061',
 '0.100']

In [3]:
#Calculate the max of just the areas 
areas = list(map(float, areas_list[1:-1]))
max(areas)


0.129

In [4]:
#Calculate the min and standard deviation of the areas

In [5]:
min(areas)

0.025

In [6]:
from statistics import stdev
stdev(areas)

0.024481491669084105

### We can easily create a dataframe to hold the imported csv data from a file using Pandas and numpy
***

In [7]:
#Input csv file as dataframe
import pandas as pd
import numpy as np

#Read in data as data frame
df_old= pd.read_csv('Results.csv',skiprows=range(21,101))  #range 
df_old.head()

Unnamed: 0,Unnamed: 1,Area,Min,Max
0,1,0.067,255,255
1,2,0.075,255,255
2,3,0.062,255,255
3,4,0.064,255,255
4,5,0.079,255,255


### When we import the dataframe, it generates an index to the rows, so we can select them. 
 What does setting the *index_col=0* flag do?
 ***

In [8]:
df = pd.read_csv('Results.csv', skiprows=range(21,101), index_col = 0)
#Whats the difference between the two indicies?
df.head()

Unnamed: 0,Area,Min,Max
,,,
1.0,0.067,255.0,255.0
2.0,0.075,255.0,255.0
3.0,0.062,255.0,255.0
4.0,0.064,255.0,255.0
5.0,0.079,255.0,255.0


## We can find out information about the dataframe as a whole using pandas functions:
***

In [9]:
df.info()  #find out information about the whole dataset, (i.e. how many objects there are in each column)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 1 to 20
Data columns (total 3 columns):
Area    20 non-null float64
Min     20 non-null int64
Max     20 non-null int64
dtypes: float64(1), int64(2)
memory usage: 640.0 bytes


In [10]:
df.shape  #find the number of rows and columns

(20, 3)

In [11]:
#Find the min, max, mean area of the dataset
df.min()


Area      0.025
Min     255.000
Max     255.000
dtype: float64

In [12]:
df.max()


Area      0.129
Min     255.000
Max     255.000
dtype: float64

In [13]:
df.mean()

Area      0.0737
Min     255.0000
Max     255.0000
dtype: float64

In [14]:
#Find the standard deviation of the dataset
df.std()

Area    0.024875
Min     0.000000
Max     0.000000
dtype: float64

In [15]:
df.describe()  #find out all the stats in one go 

Unnamed: 0,Area,Min,Max
count,20.0,20.0,20.0
mean,0.0737,255.0,255.0
std,0.024875,0.0,0.0
min,0.025,255.0,255.0
25%,0.0635,255.0,255.0
50%,0.0695,255.0,255.0
75%,0.08025,255.0,255.0
max,0.129,255.0,255.0


### Showing a single row or single value: loc vs iloc
***


In [16]:
df.head()

Unnamed: 0,Area,Min,Max
,,,
1.0,0.067,255.0,255.0
2.0,0.075,255.0,255.0
3.0,0.062,255.0,255.0
4.0,0.064,255.0,255.0
5.0,0.079,255.0,255.0


In [17]:
df.loc[1,:]  #loc looks for a particular label on an index

Area      0.067
Min     255.000
Max     255.000
Name: 1, dtype: float64

In [18]:
df.iloc[1,:] #iloc is integer loc, looks at the specific POSITION of the index.

Area      0.075
Min     255.000
Max     255.000
Name: 2, dtype: float64

## Selecting a single column using indexing
* Single and double square bracket indexing
***

In [19]:
df['Area'].head()  #view the first few rows of area

 
1    0.067
2    0.075
3    0.062
4    0.064
5    0.079
Name: Area, dtype: float64

In [20]:
df[['Area']].head()

Unnamed: 0,Area
,
1.0,0.067
2.0,0.075
3.0,0.062
4.0,0.064
5.0,0.079


## Working with the data, statistics of a column

In [21]:
df[['Area']].describe()

Unnamed: 0,Area
count,20.0
mean,0.0737
std,0.024875
min,0.025
25%,0.0635
50%,0.0695
75%,0.08025
max,0.129


In [22]:
df.Area.describe()

count    20.000000
mean      0.073700
std       0.024875
min       0.025000
25%       0.063500
50%       0.069500
75%       0.080250
max       0.129000
Name: Area, dtype: float64

In [23]:
df[['Area']].min()  #Finding the column min using a dataframe approach

Area    0.025
dtype: float64

In [24]:
df['Area'].min()  #Finding the column min using a series approach

0.025

In [25]:
#Find the min, max and mean of the area column, and set them equal to area_max, area_min and area_mean and make an 
#output print statement saying what they are

area_min = df['Area'].min()
area_max = df['Area'].max()
area_mean = df['Area'].mean()

print('The minimum area of a found circle is:', area_min, 'square pixels')
print('The maximum area of a found circle is:', area_max, 'square pixels')
print('The mean area of a found circle is:', area_mean, 'square pixels')


The minimum area of a found circle is: 0.025 square pixels
The maximum area of a found circle is: 0.129 square pixels
The mean area of a found circle is: 0.0737 square pixels


### Formatting numbers

You can round numbers to the a decimal point using round(number, num_of_decimals)
you can format a number using format spec, https://pyformat.info/  
"{:4.1f}" gives us at least four characters, with one after the decimal point.  


In [26]:
#Rounding example
test_number = 1234.56789  #generate a test number
round(test_number,3)   #round test number

1234.568

In [27]:
"{:1.1f}".format(test_number)  #if we change the number before the decimal point, we end up with more white space

'1234.6'

In [28]:
# Format the numbers to significant figures
print('The minimum area of a found circle is:', round(area_min,2), 'square pixels')
print('The maximum area of a found circle is:', "{:1.2f}".format(area_max), 'square pixels')
print('The mean area of a found circle is:', round(area_mean,4), 'square pixels')

The minimum area of a found circle is: 0.02 square pixels
The maximum area of a found circle is: 0.13 square pixels
The mean area of a found circle is: 0.0737 square pixels


## Calculations on a column
### Can we find the radii of the shapes?
***

In [30]:
#Can we find the radii of the shapes?
#recall area=pi*r^2
#r=sqrt(area/pi)

radius = np.sqrt(df['Area']/np.pi)
radius.head()

 
1    0.146037
2    0.154510
3    0.140482
4    0.142730
5    0.158576
Name: Area, dtype: float64

In [32]:
df['radius'] = radius*100
df.head()

Unnamed: 0,Area,Min,Max,radius
,,,,
1.0,0.067,255.0,255.0,14.603685
2.0,0.075,255.0,255.0,15.450968
3.0,0.062,255.0,255.0,14.048207
4.0,0.064,255.0,255.0,14.272993
5.0,0.079,255.0,255.0,15.857642


### Removing columns from dataframe
***

In [33]:
#Delete the column for radius?
df = df.drop(columns="radius")
df.head()

Unnamed: 0,Area,Min,Max
,,,
1.0,0.067,255.0,255.0
2.0,0.075,255.0,255.0
3.0,0.062,255.0,255.0
4.0,0.064,255.0,255.0
5.0,0.079,255.0,255.0


In [34]:
#Can we add the radius to the data frame directly?
df[['radii']] = np.sqrt(df[['Area']]/np.pi)*100
df.head()

Unnamed: 0,Area,Min,Max,radii
,,,,
1.0,0.067,255.0,255.0,14.603685
2.0,0.075,255.0,255.0,15.450968
3.0,0.062,255.0,255.0,14.048207
4.0,0.064,255.0,255.0,14.272993
5.0,0.079,255.0,255.0,15.857642


In [35]:
#Excercise, compute the diameter and add a diameter column to the data frame
df['diameter']  = df['radii']*2
df.head()

Unnamed: 0,Area,Min,Max,radii,diameter
,,,,,
1.0,0.067,255.0,255.0,14.603685,29.207371
2.0,0.075,255.0,255.0,15.450968,30.901936
3.0,0.062,255.0,255.0,14.048207,28.096415
4.0,0.064,255.0,255.0,14.272993,28.545986
5.0,0.079,255.0,255.0,15.857642,31.715284


In [36]:
#Find the mean and standard deviation of the radii, compare to the inputs
mean_radius = df['radii'].mean()
print('The mean radius is:', round(mean_radius,3), 'pixels')


The mean radius is: 15.098 pixels


### Use the same syntax to find just the standard deviation of the radii
***

In [44]:
radii_std = df['radii'].std()
print('The standard deviation of the radii is:', "{:.4}".format(radii_std), 'pixels' )
radii_std

The standard deviation of the radii is: 2.647 pixels


2.6466151313324913

In [45]:
print('The standard deviation of the radii is:', round(radii_std, 3), 'pixels')
radii_std

The standard deviation of the radii is: 2.647 pixels


2.6466151313324913

# Quick plot for seeing what we have
***

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


In [None]:
df.plot()

In [None]:
df.plot(y='radii')

In [None]:
df.radii.plot(kind='hist', rwidth=0.8)