# Outline of data analysis with Pandas:
***


1. Importing data (from FIJI results table or any other source)
*** 
2. Previewing Data
    * Head and tail
    * Data validation
    * Statistics of data set
        * *Min, max, standard deviation, etc.*
***    
3. Working with Data
    * loc vs iloc
    * Selecting columns
        * *Statistics on column*
    * Adding data to data frame
    * Performing operations on a column
***
4. Quick plotting for data preview
***

### Importing data
Python has a built-in `csv` module for importing csv files, which we can use here to read the data in as lists
***

In [None]:
# Read the csv file using Python's built-in csv module
import csv
areas_list = []  # collects the second field (index 1) of every row the reader yields
# newline='' hands line-ending handling to the csv module, as its documentation asks for
with open('Results_random_circles_CoronaTime.csv', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        # Uncomment one of these to inspect what the reader returns for each row
        #print(row)
        #print(row[0])
        #print(row[0],row[1],row[2],)
        areas_list.append(row[1])

In [None]:
# Look at just the areas. How do we slice to get rid of the first entry?
# Starting the slice at position 1 returns everything after element 0.
areas_list[1:]


In [None]:
# Every value in areas_list is still a string, so convert each one to a float.
# The slice [1:] leaves out the first entry (presumably the column header) before converting.
areas = [float(entry) for entry in areas_list[1:]]

# Largest of the converted areas
max(areas)


#### Exercise

Calculate the min of the areas

---

In [None]:
# Exercise solution: smallest value in the list of float areas
min(areas)

#### Use the statistics package to determine the standard deviation of the areas

In [None]:
# statistics.stdev returns the sample standard deviation of the values it is given
from statistics import stdev
stdev(areas)

### Using Pandas, we can load the csv file straight into a dataframe instead of reading it row by row with the csv module; this is just a shortcut to get a dataframe
***

In [None]:
# Input csv file as dataframe
import pandas as pd
import numpy as np

# Read in data as data frame.
# skiprows=range(21,101) tells read_csv to skip file lines 21 through 100
# (line numbers are counted from 0), so those lines never reach the dataframe.
df_old= pd.read_csv('Results_random_circles_CoronaTime.csv',skiprows=range(21,101))  #range to skip
df_old.head()  #head shows just the first five entries of the dataframe

### When we import the dataframe, it generates an index to the rows, so we can select them. 
 What does setting the *index_col=0* flag do?  What if we change it to other columns?
 ***

In [None]:
# Same file and skipped lines as before, but index_col = 0 uses the first column of
# the file as the row labels instead of letting pandas generate an index.
df = pd.read_csv('Results_random_circles_CoronaTime.csv', skiprows=range(21,101), index_col = 0)
# What's the difference between the two indices?
df.head()

## We can find out information about the dataframe as a whole using pandas functions:
***

In [None]:
# Print a summary of the whole dataframe: one line per column with its
# non-null count and dtype
df.info()  #find out information about the whole dataset, (i.e. how many entries there are in each column)

In [None]:
# shape is an attribute (no parentheses) holding a (rows, columns) tuple
df.shape  #find the number of rows and columns

### Exercises: find the min, max, mean and standard deviation of the dataframe columns

In [None]:
# Find the min: called on the whole dataframe, min() reports one value per column
# (the area is just one of them)
df.min()


In [None]:
# Find the max: one value per column of the dataframe
df.max()


In [None]:
# Find the mean: one value per column of the dataframe
df.mean()

In [None]:
# Find the standard deviation: one value per column of the dataframe
df.std()

In [None]:
# describe() gathers count, mean, std, min, quartiles and max for each numeric column
df.describe()  #find out all the stats in one go

### Showing a single row or single value: loc vs iloc
***


In [None]:
# Show the first rows again as a reference for the loc / iloc lookups that follow
df.head()

In [None]:
# Select the row whose index LABEL equals 1, with all of its columns (the ':' part)
df.loc[1,:]  #loc looks for a particular label on an index

In [None]:
# Select the row at POSITION 1 (positions start at 0, so this is the second row),
# regardless of what its index label is
df.iloc[1,:] #iloc is integer loc, looks at the specific POSITION of the index.

## Selecting a single column using indexing
* Single and double square bracket indexing
***

In [None]:
# Single square brackets with one column label return a pandas Series
df['Area'].head()  #view the first few rows of area, comes out as a series

In [None]:
# Double square brackets pass a list of labels, so the result stays a
# (one-column) dataframe instead of becoming a Series
df[['Area']].head()

In [None]:
# Attribute-style access: same Series as df['Area'], available because the
# column name is a valid Python identifier
df.Area.head()

## Working with the data, statistics of a column

In [None]:
# Summary statistics of the Area column, returned as a dataframe
df[['Area']].describe()

In [None]:
# Display the Area column on its own as a one-column dataframe
df[['Area']]

In [None]:
# Same summary statistics, this time computed on the Series and returned as a Series
df.Area.describe()


In [None]:
# Number of non-missing values in the Area column
df.Area.count()

In [None]:
# min() on a dataframe returns a Series with one entry per column (here only 'Area')
df[['Area']].min()  #Finding the column min using a dataframe approach

In [None]:
# min() on a Series returns the single smallest value directly
df['Area'].min()  #Finding the column min using a series approach

### Exercise: Find the min, max and mean of the area column, store them in area_min, area_max and area_mean, and print a statement saying what they are

In [None]:
# Exercise solution: take the smallest, largest and average value of the 'Area'
# column, keep them in area_min, area_max and area_mean, and report each one.

area_column = df['Area']
area_min, area_max, area_mean = (
    area_column.min(),
    area_column.max(),
    area_column.mean(),
)

print('The minimum area of a found circle is:', area_min, 'square pixels')
print('The maximum area of a found circle is:', area_max, 'square pixels')
print('The mean area of a found circle is:', area_mean, 'square pixels')


### Formatting numbers

You can format a number using format spec, https://pyformat.info/  
"{:4.1f}" gives us at least four characters, with one after the decimal point.  



In [None]:
# Rounding example
test_number = 1234.56789  # a test number with several digits on both sides of the decimal point


In [None]:
# In "{:W.Pf}", W is the MINIMUM total width and P the number of digits after the
# decimal point. The number is never cut short to fit W; a larger W pads with spaces.
"{:1.1f}".format(test_number)  #if we change the number before the decimal point, we end up with more white space

In [None]:
# Format the numbers to a fixed number of decimal places:
# two for the min and max, four for the mean
print('The minimum area of a found circle is:',"{:1.2f}".format(area_min), 'square pixels')
print('The maximum area of a found circle is:', "{:1.2f}".format(area_max), 'square pixels')
print('The mean area of a found circle is:', "{:1.4f}".format(area_mean), 'square pixels')

## Calculations on a column
### Can we find the radii of the shapes?
***

In [None]:
#Can we find the radii of the shapes?
#recall area=pi*r^2
#r=sqrt(area/pi)

# The operation is applied to the whole 'Area' column at once, giving one
# radius per row; the result is kept in its own variable, not in df

radius = np.sqrt(df['Area']/np.pi)
radius.head()

#### Perform a mathematical operation and add a column to a dataframe

In [None]:
# Assigning to a column label that does not exist yet adds a new column;
# here it holds every radius multiplied by 100
df['radius_100'] = radius*100
df.head()

### Removing columns from dataframe
***

In [None]:
# Delete the radius_100 column. drop() returns a new dataframe, so the result
# is assigned back to df to make the removal stick.
df = df.drop(columns="radius_100")
df.head()

In [None]:
# drop() returns a new dataframe and leaves df unchanged unless the result is
# assigned back. No 'radii' column has been created at this point, so
# errors='ignore' lets the cell run instead of raising a KeyError.
df.drop(columns='radii', errors='ignore')

In [None]:
# Can we add the radius to the data frame directly?
# Yes: assign the computed Series (r = sqrt(area / pi)) to a single new column
# label. This avoids dataframe-to-dataframe assignment into a column that does
# not exist yet.
df['Radii'] = np.sqrt(df['Area']/np.pi)
df.head()

### Exercise: compute the diameter and add a diameter column to the data frame
***

In [None]:
# Exercise: compute the diameter and add a diameter column to the data frame.
# Column labels are case-sensitive: the radius column is 'Radii' (capital R).
df['diameter']  = df['Radii']*2
df.head()

### Exercise: find the mean and standard deviation of the radii.  How do they compare to what we put into our sample image?
***

In [None]:
# Find the mean of the radii, to compare with the input used for the sample image.
# Column labels are case-sensitive: the radius column is 'Radii' (capital R).
mean_radius = df['Radii'].mean()
# "{:1.3}" has no type letter, so the 3 is a count of significant digits
print('The mean radius is:', "{:1.3}".format(mean_radius), 'pixels')


### Use the same syntax to find just the standard deviation of the radii
***

In [None]:
# Standard deviation of the radius column; the label is 'Radii' (capital R,
# column labels are case-sensitive).
radii_std = df['Radii'].std()
# "{:.4}" has no type letter, so the 4 is a count of significant digits
print('The standard deviation of the radii is:', "{:.4}".format(radii_std), 'pixels' )
# The bare name on the last line displays the unformatted value for comparison
radii_std

In [None]:
# Alternative to a format string: round(x, 3) keeps three digits after the
# decimal point. The bare name on the last line shows the unrounded value.
print('The standard deviation of the radii is:', round(radii_std, 3), 'pixels')
radii_std

# Quick plot for seeing what we have
***

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


In [None]:
# With no arguments, plot() draws one line per numeric column against the index
df.plot()

In [None]:
# y='Radii' restricts the line plot to the Radii column, still against the index
df.plot(y='Radii')

In [None]:
# Histogram of the radii; rwidth=0.8 draws each bar at 80% of its bin width
# so neighbouring bars are visually separated
df.Radii.plot(kind='hist', rwidth=0.8)

In [None]:
# Same histogram as with df.Radii, using bracket access to select the column
df['Radii'].plot(kind='hist', rwidth=0.8)

In [None]:
# The first positional argument of plot() is x, so here Radii goes on the
# horizontal axis and the other numeric columns are drawn against it
df.plot('Radii')