# Data Visualization Advanced Version

In [1]:
# Import Libraries, packages, and modules
import os
import math
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import numpy as np
# from cum_relative_freq import cum_relative_freq
from frequency_operations import *
from diameter_operations import add_eff_diameter
from sphericity_operations import add_sphericity

In [2]:
# plotly commands for jupyter notebook usage ( via Pip)
# pip install "jupyterlab>=3"
#  pip install "ipywidgets>=7.6"

# Extract target files from Working Directory

In [3]:
# Assign a working directory path
working_directory = "/Users/Daniel/Desktop/particle_analysis/pyimagej_results/Ozan_results_csv"

if os.path.isdir(working_directory): 
    print("Working path is a Directory")
else:
    print("Error please provide a valid Directory path.")

# Check the content of the working directory
content = os.listdir(working_directory)
print("\nObjects in the directory: \n",content)

# Isolate .csv files in directory
target_files = []
for item in content:
    if ".csv" in item: 
        target_files += [item]

# create constant with value of  target files 
COUNT = len(target_files)
print("\nThere are", COUNT, "csv files in the directory that include: \n",target_files)


Working path is a Directory

Objects in the directory: 
 ['Image10_results.csv']

There are 1 csv files in the directory that include: 
 ['Image10_results.csv']


# Create a Concatenated Pandas Datafame

In [4]:
# Create pandas DataFrame for each CSV file
file_key_dict = {}
for i in range( COUNT ):
    file_key_dict[target_files[i]] = i
print("\nFile key dictionary: ", file_key_dict)

frames = []
for i in range( COUNT ) :
    # Make an instance of a data frame from each file
    cur_df = pd.read_csv(working_directory + "/" + target_files[i])
    cur_df["File_key"] = np.ones( shape=(cur_df.shape[0], 1) ) 
    # Add column to current dataframe
    cur_df["File_key"] = cur_df["File_key"] * file_key_dict[target_files[i]]
    # Add this current dataframe to this list
    frames.append(cur_df)

print("check that frame elements have content:\n",
      frames[0].head(5))

concat_particle_data = pd.concat(frames)



File key dictionary:  {'Image10_results.csv': 0}
check that frame elements have content:
        Area  Mean  StdDev  Min  Max         X        Y   Perim.    BX  ...  \
0  1    390   255       0  255  255   654.790   25.526   73.841   644  ...   
1  2    546   255       0  255  255    99.661   52.577   88.912    86  ...   
2  3   1084   255       0  255  255  1948.518  139.506  139.439  1931  ...   
3  4   3952   255       0  255  255  2074.011  206.630  245.522  2040  ...   
4  5  22959   255       0  255  255   615.006  323.702  575.973   532  ...   

   RawIntDen  Slice  FeretX  FeretY  FeretAngle  MinFeret     AR  Round  \
0      99450      1     649      36      56.976    22.000  1.059  0.945   
1     139230      1      89      43     139.185    25.811  1.091  0.917   
2     276420      1    1938     118     101.310    38.253  1.052  0.950   
3    1007760      1    2070     169      98.973    67.000  1.125  0.889   
4    5854545      1     558     391      50.667   166.000  1.046 

In [5]:
# concat_particle_data.head()
concat_particle_data.dtypes

                int64
Area            int64
Mean            int64
StdDev          int64
Min             int64
Max             int64
X             float64
Y             float64
Perim.        float64
BX              int64
BY              int64
Width           int64
Height          int64
Major         float64
Minor         float64
Angle         float64
Circ.         float64
Feret         float64
IntDen          int64
Median          int64
Skew          float64
Kurt          float64
%Area           int64
RawIntDen       int64
Slice           int64
FeretX          int64
FeretY          int64
FeretAngle    float64
MinFeret      float64
AR            float64
Round         float64
Solidity      float64
File_key      float64
dtype: object

## Create a subset dataframe for plotting

### Effective Diameter

In [7]:
# Add effective diameter column
diameter_data = add_eff_diameter( 
    df=concat_particle_data,
    area_column_name="Area",
    perimeter_column_name= "Perim.",
    unit="pixels")
print("\nAdded Effective Diameter column\n")

# Determine cumulative frequency for Eff. Diameter  
dia_data = add_cumulative_frequency(
    df= diameter_data,
    target_column_name="eff_diameter_pixels",
    suffix="dia"
)
print("\nAdded Diamter-cumulative frequency column\n")

# Determine cumulative percentage for effective diameter
diameter_data = add_cumulative_percentage(
    df= diameter_data,
    target_column_name="eff_diameter_pixels",
    column_suffix="dia"
)
print("\nAdded Diamter-cumulative percentage column\n")

diameter_data.head()


Multiple files are in this input dataframe,
File_key column ADDED.

Added Effective Diameter column

Target column exist.

5      16.312788
24     17.589690
18     17.769733
21     18.575465
6      18.982276
26     19.931150
0      22.283703
12     24.669992
1      26.366433
19     27.174969
16     27.221782
10     27.960134
25     33.167438
14     34.888637
2      37.150931
15     55.428576
8      65.708107
20     65.727481
3      70.935483
9      93.246731
22    100.343302
11    139.891715
7     168.404684
4     170.974579
17    190.000423
23    202.822820
13    235.761049
Name: eff_diameter_pixels, dtype: float64

total_particles:  27

Added Diamter-cumulative frequency column

Target column exist.

5      16.312788
24     17.589690
18     17.769733
21     18.575465
6      18.982276
26     19.931150
0      22.283703
12     24.669992
1      26.366433
19     27.174969
16     27.221782
10     27.960134
25     33.167438
14     34.888637
2      37.150931
15     55.428576
8      65.708107

Unnamed: 0,Area_pixels^2,Perim_pixels,eff_diameter_pixels,eff_radius_pixels,File_key,Particle_number,cumulative_frequency_dia,cumulative_%_particle_dia
5,209,53.113,16.312788,8.156394,0.0,1,1,3.703704
24,243,57.355,17.58969,8.794845,0.0,2,2,7.407407
18,248,57.113,17.769733,8.884866,0.0,3,3,11.111111
21,271,59.941,18.575465,9.287733,0.0,4,4,14.814815
6,283,60.77,18.982276,9.491138,0.0,5,5,18.518519


### Sphericity

In [8]:
# Add effective diameter column
sph_data = add_sphericity(
    df=concat_particle_data,
    area_column_name="Area",
    perim_column_name="Perim.")
print("\nAdded Sphericity column\n")

# Determine cumulative frequency for Eff. Diameter  
sph_data = add_cumulative_frequency(
    df= sph_data,
    target_column_name="Sphericity",
    suffix="sph"
)
print("\nAdded sphericity-cumulative frequency column\n")

# Determine cumulative percentage for effective diameter
sph_data = add_cumulative_percentage(
    df= sph_data,
    target_column_name="Sphericity",
    column_suffix="sph"
)
print("\nAdded sphericity-cumulative percentage column\n")

sph_data.head()


Added Sphericity column

Target column exist.

8     0.808188
2     0.837019
9     0.842990
13    0.847439
26    0.893449
3     0.907660
12    0.915365
15    0.928814
1     0.931624
4     0.932565
11    0.936091
14    0.941082
22    0.941236
25    0.943247
7     0.947457
0     0.948068
20    0.948402
23    0.949026
17    0.950184
10    0.957481
19    0.960193
24    0.963467
5     0.964889
16    0.968228
21    0.973566
18    0.977453
6     0.981316
Name: Sphericity, dtype: float64

total_particles:  27

Added sphericity-cumulative frequency column

Target column exist.

8     0.808188
2     0.837019
9     0.842990
13    0.847439
26    0.893449
3     0.907660
12    0.915365
15    0.928814
1     0.931624
4     0.932565
11    0.936091
14    0.941082
22    0.941236
25    0.943247
7     0.947457
0     0.948068
20    0.948402
23    0.949026
17    0.950184
10    0.957481
19    0.960193
24    0.963467
5     0.964889
16    0.968228
21    0.973566
18    0.977453
6     0.981316
Name: Sphericity, 

Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,X,Y,Perim.,BX,...,FeretAngle,MinFeret,AR,Round,Solidity,File_key,Sphericity,Particle_number,cumulative_frequency_sph,cumulative_%_particle_sph
8,9,3391,255,0,255,255,177.1,545.629,255.421,136,...,150.751,58.45,1.478,0.677,0.924,0.0,0.808188,1,1,3.703704
2,3,1084,255,0,255,255,1948.518,139.506,139.439,1931,...,101.31,38.253,1.052,0.95,0.914,0.0,0.837019,2,2,7.407407
9,10,6829,255,0,255,255,157.787,636.848,347.505,111,...,37.659,87.279,1.343,0.745,0.921,0.0,0.84299,3,3,11.111111
13,14,43655,255,0,255,255,197.942,1083.482,874.004,85,...,107.659,228.211,1.143,0.875,0.946,0.0,0.847439,4,4,14.814815
26,27,312,255,0,255,255,553.654,1876.676,70.083,542,...,165.379,20.5,1.059,0.945,0.886,0.0,0.893449,5,5,18.518519


### Create a dataframe that contains outlier particles 

In [10]:

# Declare size thresholds
max_size = 14   # pixels
min_size = 6   # pixels

# parse filered dataframe for rows that match criteria,
#   Create new dataframe.
outliers_df = pd.concat(
    [ 
    dia_data[ dia_data["eff_diameter_pixels"] > max_size ],
    dia_data[ dia_data["eff_diameter_pixels"] < min_size ]
    ])

# Display the outliers dataframe 
outliers_df

Unnamed: 0,Area_pixels^2,Perim_pixels,eff_diameter_pixels,eff_radius_pixels,File_key,Particle_number,cumulative_frequency_dia,cumulative_%_particle_dia
5,209,53.113,16.312788,8.156394,0.0,1,1,3.703704
24,243,57.355,17.58969,8.794845,0.0,2,2,7.407407
18,248,57.113,17.769733,8.884866,0.0,3,3,11.111111
21,271,59.941,18.575465,9.287733,0.0,4,4,14.814815
6,283,60.77,18.982276,9.491138,0.0,5,5,18.518519
26,312,70.083,19.93115,9.965575,0.0,6,6,22.222222
0,390,73.841,22.283703,11.141852,0.0,7,7,25.925926
12,478,84.669,24.669992,12.334996,0.0,8,8,29.62963
1,546,88.912,26.366433,13.183217,0.0,9,9,33.333333
19,580,88.912,27.174969,13.587484,0.0,10,10,37.037037


**Now using the "File_key" as a grouping parameter to make plots**

# Plots

### Distribution Plots

In [22]:
# Strip Plot
fig1 = px.strip(dia_data,
                 x="eff_diameter_pixels", 
                 color="File_key",
                 title="Strip plot of Effective Diameter (pixels)", )
fig1.show()

In [26]:
# Histogram & Marginal "rug" plot: Area
fig2 = px.histogram(dia_data, x="Area_pixels^2",color="File_key",
                    text_auto=True, marginal="rug",
                    title="Histogram of paticle Area (pixels^2)", 
                    )
fig2.show()

## Box (and whisker) Plots 

https://en.wikipedia.org/wiki/Box_plot#Variations

In [24]:
fig3 = px.box(dia_data, 
              y="eff_diameter_pixels", x="File_key",
              # emphasize median differences
              notched=False,
              # style
              color ="File_key",hover_name="Particle_number",
              points="all",
              title="Boxplot of Effective Diameter (pixels)", 
              )
fig3.show()


In [25]:
# Histogram & Marginal "rug" plot : effective diameter (microns)
fig4 = px.histogram(dia_data, x="eff_diameter_pixels",color="File_key",
                    text_auto=True, marginal="rug",
                    facet_col="File_key", # make subplots for each target_file
                    title="Histogtam & Marginal Rug: Effective Diameter (pixels)", 
                    )
fig4.show()

# Relational Plots 

### Concatenated file plotes

In [27]:
fig_sph = px.scatter(sph_data, 
                     x="Sphericity", 
                     y="cumulative_%_particle_sph",
                     title="Cumulative Percentage (particle) vs Sphericity" )
fig_sph.show()

In [28]:
fig_dia = px.scatter(dia_data,
                     x="eff_diameter_pixels", 
                     y="cumulative_%_particle_dia",
                     title="Cumulative Percentage (partice) vs Effective Diameter (µm)", )
fig_dia.show()

In [40]:
concat_particle_data["File_key"].dtypes

dtype('float64')

#### Create a Cumulative Percentage plot with different files symboled and colored 

In [31]:
# Treat the contious column as categorical,
#   for different symbols
dia_data["File_key"] = dia_data["File_key"].astype(str)

# Plot data
fig5 = px.scatter(dia_data,
    x = "eff_diameter_pixels",
    y = "cumulative_%_particle_dia",
    # style
    hover_name= "Particle_number",
    symbol="File_key", # Different symbols for diff. File_key values
    color="File_key", # Color by Fike_key
    ) 

fig5.show()

## Individual file plots

In [30]:
# View target files in folder
print("The available target files are: \n", target_files)

# Try creating subset dataframe for a specifc target file
try:
    key = 1
    df_target_file = concat_particle_data[ concat_particle_data["File_key"] == key ]
    print("The selected target file is: \n", target_files[int(key)]) 
except IndexError:
    print(f"An IndexError has occured: {IndexError}.",
          "\nThere are only {COUNT} csv files in the working directory.")
else:
    # Display selected dataframe
    df_target_file.head()

The available target files are: 
 ['Image10_results.csv']
An IndexError has occured: <class 'IndexError'>. 
There are only {COUNT} csv files in the working directory.


### Show plotting for specific file

# Next Steps

Friday August 9th, 2024
- it looks like I can call all the files from a folder and parse for csv files and then cast each one as a pandas dataframe. 
I can also give each dataframe a unique index and then put each dataframe into a larger one.
- Next step is to incorporate the cum_relative_freq function into this by ensuring I either (try and except ) or use conditional statements
to keep the column "File_key" if it does exist. 
- Then I can continue plotting the data

Monday August 13th, 2024

**Accomplished**
*  it looks like I have succesfully implemented try and except, to ensure that if a "File_key" column exist, it is kept in the returned dataframe in cum_relative_frequency
    * This allowed me to make cumulative percentage vs diameter plots
* I have been able to make plots based upon multiple Analyze particle csv files\

**Next Steps**
1. Look at making plots that show equivalent diameter, sphericity, size, and other parameters.
2. Try to copy graphs that are seen in flowcam - VisualSpreadSheet
3. Look at exporting data columns that Analyze particle spits out in VisualSpreadSheet!!

Tuesday August 14th, 2024

**Accomplished**
1. Made modular scripts that contain functions related to each quantity (diamter, sphericity, cumulative frequency, cumulative percentage)  

**Next Steps**
1. Fix the sphericity function, it appears that it can find  Area and %Area, but the order appears to be always Area first. Maybe force a first column extraction to elimnate error.
2. Make plots for wednesday's meeting