# Data Visualization Advanced Version

In [1]:
# Import Libraries, packages, and modules
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

from cum_relative_freq import cum_relative_freq

In [2]:
# Plotly setup for Jupyter notebook usage (install via pip if needed):
#   pip install "jupyterlab>=3"
#   pip install "ipywidgets>=7.6"

# Extract target files from Working Directory

In [3]:
# Assign a working directory path
# NOTE(review): this is an absolute, machine-specific path; edit it to point
# at the folder that holds the CSV files to analyse.
working_directory = "/Users/Daniel/Desktop/particle_analysis"

# Stop with a clear message when the path is unusable, instead of printing a
# warning and then failing inside os.listdir below.
if not os.path.isdir(working_directory):
    raise NotADirectoryError(
        "Error please provide a valid Directory path. Got: " + working_directory
    )
print("Working path is a Directory")

# Check the content of the working directory.
# os.listdir returns names in arbitrary order, and the position of each CSV in
# target_files is later used as its File_key, so sort (case-insensitively) to
# keep the keys stable from run to run.
content = sorted(os.listdir(working_directory), key=str.lower)
print("\nObjects in the directory: \n", content)

# Isolate .csv files in the directory: the name must END in ".csv" (a plain
# substring test would also accept e.g. "data.csv.bak") and be a regular file.
target_files = [
    item
    for item in content
    if item.lower().endswith(".csv")
    and os.path.isfile(os.path.join(working_directory, item))
]

# Number of target files found
COUNT = len(target_files)
print("\nThere are", COUNT, "csv files in the directory that include: \n", target_files)


Working path is a Directory

Objects in the directory: 
 ['.DS_Store', 'Image10_processedimage.png', 'Image10_results.csv', 'particles_resultsx14.csv', 'Ti64_lot232_results1.csv', 'Ti64_lot232_results2.csv']

There are 4 csv filese in the directory that include: 
 ['Image10_results.csv', 'particles_resultsx14.csv', 'Ti64_lot232_results1.csv', 'Ti64_lot232_results2.csv']


# Create a Concatenated Pandas DataFrame

In [8]:
# Map each CSV file name to an integer key (its position in target_files)
file_key_dict = {
    file_name: file_key for file_key, file_name in enumerate(target_files)
}
print("\nFile key dictionary: ", file_key_dict)

# Fail early with a clear message: with no CSV files, frames[0] below would
# otherwise raise an unhelpful IndexError.
if not target_files:
    raise ValueError("No csv files were found in " + working_directory)

# Create one pandas DataFrame per CSV file, tagged with its file key
frames = []
for file_name in target_files:
    # Make an instance of a data frame from each file
    cur_df = pd.read_csv(os.path.join(working_directory, file_name))
    # Add the key column to the current dataframe; it is stored as a float
    # (0.0, 1.0, ...) so the column dtype stays float64 as before
    cur_df["File_key"] = float(file_key_dict[file_name])
    # Add this current dataframe to the list
    frames.append(cur_df)

print("check that frame elements have content:\n",
      frames[0].head(5))

# Stack the per-file frames into one DataFrame; each frame keeps its own
# row labels, so index values repeat across files
concat_particle_data = pd.concat(frames)



File key dictionary:  {'Image10_results.csv': 0, 'particles_resultsx14.csv': 1, 'Ti64_lot232_results1.csv': 2, 'Ti64_lot232_results2.csv': 3}
check that frame elements have content:
        Area  Mean  StdDev  Min  Max         X        Y   Perim.    BX  ...  \
0  1    390   255       0  255  255   654.790   25.526   73.841   644  ...   
1  2    546   255       0  255  255    99.661   52.577   88.912    86  ...   
2  3   1084   255       0  255  255  1948.518  139.506  139.439  1931  ...   
3  4   3952   255       0  255  255  2074.011  206.630  245.522  2040  ...   
4  5  22959   255       0  255  255   615.006  323.702  575.973   532  ...   

   RawIntDen  Slice  FeretX  FeretY  FeretAngle  MinFeret     AR  Round  \
0      99450      1     649      36      56.976    22.000  1.059  0.945   
1     139230      1      89      43     139.185    25.811  1.091  0.917   
2     276420      1    1938     118     101.310    38.253  1.052  0.950   
3    1007760      1    2070     169      98.973

In [9]:
# Preview the first five rows of the combined DataFrame
concat_particle_data.head(n=5)

Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,X,Y,Perim.,BX,...,FeretX,FeretY,FeretAngle,MinFeret,AR,Round,Solidity,File_key,XM,YM
0,1,390.0,255,0.0,255,255,654.79,25.526,73.841,644.0,...,649.0,36.0,56.976,22.0,1.059,0.945,0.94,0.0,,
1,2,546.0,255,0.0,255,255,99.661,52.577,88.912,86.0,...,89.0,43.0,139.185,25.811,1.091,0.917,0.938,0.0,,
2,3,1084.0,255,0.0,255,255,1948.518,139.506,139.439,1931.0,...,1938.0,118.0,101.31,38.253,1.052,0.95,0.914,0.0,,
3,4,3952.0,255,0.0,255,255,2074.011,206.63,245.522,2040.0,...,2070.0,169.0,98.973,67.0,1.125,0.889,0.971,0.0,,
4,5,22959.0,255,0.0,255,255,615.006,323.702,575.973,532.0,...,558.0,391.0,50.667,166.0,1.046,0.956,0.98,0.0,,


**Now use "File_key" as a grouping parameter to make plots.**

# Plots

### Distribution Plots

In [10]:
# Strip plot: one point per row along the Area axis, coloured by File_key
fig1 = px.strip(data_frame=concat_particle_data, x="Area", color="File_key")
fig1.show()

In [34]:
# Histogram of Area coloured by File_key, with the bar values written on the
# bars (text_auto) and a rug plot drawn in the margin
area_hist_options = dict(
    x="Area",
    color="File_key",
    marginal="rug",
    text_auto=True,
)
fig2 = px.histogram(concat_particle_data, **area_hist_options)
fig2.show()

# Box (and whisker) Plots 

### Create new dataframe for plotting

In [13]:
# Gather and compute the relevant columns from the input dataframe.
# cum_relative_freq is imported with the other modules at the top of the
# notebook; the frame it returns is what the plots below read
# (e.g. eff_diameter_microns, cumulative_percentage, Particle_number).
filtered_data = cum_relative_freq(input_dataframe=concat_particle_data)

Multiple files are in this input dataframe,
File_key column ADDED.

total_particles:  89
New cumulative percentage particle:
 [  2.24719101   3.37078652   5.61797753   6.74157303   8.98876404
  12.35955056  16.85393258  17.97752809  20.2247191   21.34831461
  25.84269663  26.96629213  28.08988764  31.46067416  33.70786517
  37.07865169  40.4494382   43.82022472  48.31460674  51.68539326
  56.17977528  56.17977528  57.30337079  57.30337079  58.42696629
  58.42696629  58.42696629  58.42696629  59.5505618   59.5505618
  60.6741573   60.6741573   61.79775281  61.79775281  61.79775281
  61.79775281  62.92134831  62.92134831  64.04494382  64.04494382
  64.04494382  64.04494382  65.16853933  65.16853933  65.16853933
  65.16853933  66.29213483  66.29213483  66.29213483  66.29213483
  67.41573034  67.41573034  67.41573034  67.41573034  68.53932584
  68.53932584  68.53932584  68.53932584  69.66292135  69.66292135
  69.66292135  69.66292135  70.78651685  71.91011236  73.03370787
  74.15730337  75

Reference for box plot variations (e.g. notched box plots): https://en.wikipedia.org/wiki/Box_plot#Variations

In [29]:
# Box plot of effective diameter, one box per File_key
fig3 = px.box(
    data_frame=filtered_data,
    x="File_key",
    y="eff_diameter_microns",
    color="File_key",
    hover_name="Particle_number",
    notched=True,  # notches emphasize median differences
    points="all",  # also draw every individual observation
)
fig3.show()


In [35]:
# Histogram of effective diameter (microns) with a marginal rug plot,
# split into one subplot per File_key
diameter_hist_options = dict(
    x="eff_diameter_microns",
    color="File_key",
    facet_col="File_key",  # one subplot column for each target file
    marginal="rug",
    text_auto=True,
)
fig4 = px.histogram(filtered_data, **diameter_hist_options)
fig4.show()

# Relational Plots 

In [15]:
# Display the filtered dataframe returned by cum_relative_freq.
# It is left as the bare last expression of the cell so the notebook
# renders it as a table.
filtered_data

Unnamed: 0,Area,eff_radius,eff_diameter_microns,File_key,cumulative_frequency,cumulative_percentage,Particle_number
5,20.452,2.551485,5.102969,2.0,2,2.247191,1
5,20.452,2.551485,5.102969,3.0,3,3.370787,2
1,21.403,2.610132,5.220263,2.0,5,5.617978,3
1,21.403,2.610132,5.220263,3.0,6,6.741573,4
12,29.013,3.038935,6.077870,3.0,8,8.988764,5
...,...,...,...,...,...,...,...
7,22274.000,84.202342,168.404684,0.0,85,95.505618,85
4,22959.000,85.487290,170.974579,0.0,86,96.629213,86
17,28353.000,95.000212,190.000423,0.0,87,97.752809,87
23,32309.000,101.411410,202.822820,0.0,88,98.876404,88


In [16]:
# Check how the File_key column is stored before using it as a category
filtered_data["File_key"].dtype

dtype('float64')

In [36]:
# Treat the continuous File_key column as categorical by casting it to text,
# so each File_key value gets its own symbol and colour.
# Note: this overwrites the File_key column of filtered_data in place.
filtered_data["File_key"] = filtered_data["File_key"].astype(str)

# Cumulative percentage against effective diameter, one series per File_key
fig5 = px.scatter(
    data_frame=filtered_data,
    x="eff_diameter_microns",
    y="cumulative_percentage",
    color="File_key",   # colour by File_key
    symbol="File_key",  # different symbols for different File_key values
    hover_name="Particle_number",
)

fig5.show()

In [37]:
# View target files in folder
print("The available target files are: \n", target_files)

# Choose which file to isolate, by its numeric File_key
key = 2

# Resolve the key through file_key_dict, the same mapping that tagged the
# rows, so the printed file name always matches the rows selected below.
# (Indexing target_files directly would accept a negative key and print a
# file name while selecting no rows.)
file_for_key = {
    file_key: file_name for file_name, file_key in file_key_dict.items()
}
if key not in file_for_key:
    raise KeyError(
        "key " + repr(key) + " is not a valid File_key; choose one of "
        + repr(sorted(file_for_key))
    )

# Create a subset dataframe for the chosen target file. Take an explicit
# copy so later processing works on independent data rather than on a
# selection of concat_particle_data.
df_target_file = concat_particle_data[
    concat_particle_data["File_key"] == key
].copy()

# Display selected dataframe
print("The selected target file is: \n",
      file_for_key[key])
df_target_file

The available target files are: 
 ['Image10_results.csv', 'particles_resultsx14.csv', 'Ti64_lot232_results1.csv', 'Ti64_lot232_results2.csv']
The selected target file is: 
 Ti64_lot232_results1.csv


Unnamed: 0,Unnamed: 1,Area,Mean,StdDev,Min,Max,X,Y,Perim.,BX,...,FeretX,FeretY,FeretAngle,MinFeret,AR,Round,Solidity,File_key,XM,YM
0,1,84.661,255,,255,255,,,,,...,,,,,,,,2.0,,
1,2,21.403,255,,255,255,,,,,...,,,,,,,,2.0,,
2,3,30.44,255,,255,255,,,,,...,,,,,,,,2.0,,
3,4,60.88,255,,255,255,,,,,...,,,,,,,,2.0,,
4,5,98.93,255,,255,255,,,,,...,,,,,,,,2.0,,
5,6,20.452,255,,255,255,,,,,...,,,,,,,,2.0,,
6,7,37.099,255,,255,255,,,,,...,,,,,,,,2.0,,
7,8,63.734,255,,255,255,,,,,...,,,,,,,,2.0,,
8,9,76.1,255,,255,255,,,,,...,,,,,,,,2.0,,
9,10,75.149,255,,255,255,,,,,...,,,,,,,,2.0,,


### Show plotting for specific file

In [23]:
# Compute the cumulative percentage of particles for the selected file only
filtered_df_target_file = cum_relative_freq(input_dataframe=df_target_file)

Multiple files are in this input dataframe,
File_key column ADDED.

total_particles:  13
New cumulative percentage particle:
 [  7.69230769  15.38461538  23.07692308  30.76923077  38.46153846
  46.15384615  53.84615385  61.53846154  69.23076923  76.92307692
  92.30769231  92.30769231 100.        ]


In [38]:
# Line plot of cumulative percentage against effective diameter
# for the selected file
fig6 = px.line(
    data_frame=filtered_df_target_file,
    x="eff_diameter_microns",
    y="cumulative_percentage",
    hover_name="Particle_number",
)

fig6.show()

# Next Steps

Friday August 9th, 2024
- It looks like I can list all the files in a folder, pick out the CSV files, and then read each one into a pandas DataFrame.
I can also give each DataFrame a unique key and then combine them all into one larger DataFrame.
- Next step is to incorporate the cum_relative_freq function into this, using either try/except or conditional statements
to keep the "File_key" column if it exists.
- Then I can continue plotting the data.

Monday August 13th, 2024

**Accomplished**
*  It looks like I have successfully implemented try and except, to ensure that if a "File_key" column exists, it is kept in the dataframe returned by cum_relative_freq
    * This allowed me to make cumulative percentage vs diameter plots
* I have been able to make plots based upon multiple Analyze particle csv files

**Next Steps**
1. Look at making plots that show equivalent diameter, sphericity, size, and other parameters.
2. Try to copy graphs that are seen in flowcam - VisualSpreadSheet
3. Look at exporting data columns that Analyze particle spits out in VisualSpreadSheet!!