
# Created by: Xuanzhi Li
## Contact Information: xli@seattleu.edu

### Instruction:
The code file provided contains visualizations for the project. In the event of any issues with displaying the plots, an HTML file has also been included. This allows you to view the code and plots without having to run the code yourself. 

### Import libraries and dataset

In [None]:
#Install the library if running it for the first time. Remove "#" to uncomment the below command.

#pip install altair-catplot

In [None]:
import pandas as pd
import numpy as np  # alias was mistyped as "npkik34"; restored to the conventional name

import matplotlib.pyplot as plt
import seaborn as sns 
sns.set_style("whitegrid")

import altair as alt
from altair import datum
import altair_catplot as altcat
import altair_transform

In [None]:
# Load the project dataset. The path is relative, so Model_Data.csv must be in
# the working directory.
df = pd.read_csv("Model_Data.csv")
df

Unnamed: 0,SpecimenAgeTested,SpecimenMeasuredStrength,StructureType,SiteTemperature,BatchSpecimenSize,BatchTemperature,BatchUnitWeight,BatchRequiredStrength,StructureNumberLevels,StructureHeight,SiteLattitude,SiteLongitude,BatchHourMolded,BatchMinutesMolded
0,7,9860,Residential,79.0,4X8,89.0,149.60,8000,34,418.375,36.161455,-86.784800,12,53
1,28,10360,Residential,79.0,4X8,89.0,149.60,8000,34,418.375,36.161455,-86.784800,12,53
2,28,10300,Residential,79.0,4X8,89.0,149.60,8000,34,418.375,36.161455,-86.784800,12,53
3,28,10140,Residential,79.0,4X8,89.0,149.60,8000,34,418.375,36.161455,-86.784800,12,53
4,1,4880,Residential,70.0,4X8,80.0,150.90,7200,34,418.375,36.161455,-86.784800,4,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843,28,4940,Office,44.0,4x8,76.0,152.22,4000,2,32.000,41.429539,-73.403626,9,37
2844,7,3710,Office,44.0,4x8,73.0,153.93,4000,2,32.000,41.429539,-73.403626,10,37
2845,28,5100,Office,44.0,4x8,73.0,153.93,4000,2,32.000,41.429539,-73.403626,10,37
2846,28,5320,Office,44.0,4x8,73.0,153.93,4000,2,32.000,41.429539,-73.403626,10,37


In [None]:
# Select the most influential variables.
# .copy() makes topVars an independent frame rather than a selection of df, so
# columns added to it later do not trigger pandas' SettingWithCopyWarning.
topVars = df[["SpecimenMeasuredStrength", "SpecimenAgeTested",
              "SiteTemperature", "BatchUnitWeight",
              "BatchRequiredStrength", "BatchTemperature"]].copy()
topVars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2848 entries, 0 to 2847
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SpecimenMeasuredStrength  2848 non-null   int64  
 1   SpecimenAgeTested         2848 non-null   int64  
 2   SiteTemperature           2848 non-null   float64
 3   BatchUnitWeight           2848 non-null   float64
 4   BatchRequiredStrength     2848 non-null   int64  
 5   BatchTemperature          2848 non-null   float64
dtypes: float64(3), int64(3)
memory usage: 133.6 KB


# Data Cleaning

### 1. Batch required strength

In [None]:
# Distinct required-strength values present in the data.
topVars['BatchRequiredStrength'].unique()

array([ 8000,  7200, 10000,  6500, 12000,  6000,  5000,  4000,  4500,
        1000,  3000,  4400])

In [None]:
# Re-display column dtypes and non-null counts.
topVars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2848 entries, 0 to 2847
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SpecimenMeasuredStrength  2848 non-null   int64  
 1   SpecimenAgeTested         2848 non-null   int64  
 2   SiteTemperature           2848 non-null   float64
 3   BatchUnitWeight           2848 non-null   float64
 4   BatchRequiredStrength     2848 non-null   int64  
 5   BatchTemperature          2848 non-null   float64
dtypes: float64(3), int64(3)
memory usage: 133.6 KB


In [None]:
# Number of records at each required-strength level.
topVars.groupby(['BatchRequiredStrength'])['BatchRequiredStrength'].count()

BatchRequiredStrength
1000        1
3000       79
4000      342
4400        4
4500        2
5000      258
6000       23
6500      103
7200     1006
8000       58
10000     833
12000     139
Name: BatchRequiredStrength, dtype: int64

In [None]:
# Drop the single 1000 PSI record: the minimum required strength is 2500 PSI.
# .copy() detaches the filtered rows so that later column assignments act on an
# independent frame rather than a slice (avoids SettingWithCopyWarning).
topVars = topVars[topVars.BatchRequiredStrength >= 2500].copy()

In [None]:
# Re-count records per required-strength level after the filter.
topVars.groupby(['BatchRequiredStrength'])['BatchRequiredStrength'].count()

BatchRequiredStrength
3000       79
4000      342
4400        4
4500        2
5000      258
6000       23
6500      103
7200     1006
8000       58
10000     833
12000     139
Name: BatchRequiredStrength, dtype: int64

In [None]:
# Required-strength bins (PSI). pd.cut intervals are right-closed, so every bin
# includes its upper boundary.
RS_bins = [2500, 5000, 7000, 10000, 12000]
RS_binslabels = ["2500-5000", "5001-7000", "7001-10000", "10001-12000"]
# include_lowest=True keeps a value of exactly 2500 in the first bin: the
# earlier filter admits 2500 (>= 2500), but a plain right-closed cut would turn
# it into NaN.
# assign() returns a new frame instead of writing into a filtered slice, so no
# SettingWithCopyWarning is raised.
topVars = topVars.assign(
    RS_binned=pd.cut(
        topVars['BatchRequiredStrength'],
        bins=RS_bins,
        labels=RS_binslabels,
        include_lowest=True,
    )
)

topVars

Unnamed: 0,SpecimenMeasuredStrength,SpecimenAgeTested,SiteTemperature,BatchUnitWeight,BatchRequiredStrength,BatchTemperature,RS_binned
0,9860,7,79.0,149.60,8000,89.0,7001-10000
1,10360,28,79.0,149.60,8000,89.0,7001-10000
2,10300,28,79.0,149.60,8000,89.0,7001-10000
3,10140,28,79.0,149.60,8000,89.0,7001-10000
4,4880,1,70.0,150.90,7200,80.0,7001-10000
...,...,...,...,...,...,...,...
2843,4940,28,44.0,152.22,4000,76.0,2500-5000
2844,3710,7,44.0,153.93,4000,73.0,2500-5000
2845,5100,28,44.0,153.93,4000,73.0,2500-5000
2846,5320,28,44.0,153.93,4000,73.0,2500-5000


### 2. New variables: Measured to Required Strength Ratio

In [None]:
# Measured-to-required strength ratio; a ratio of at least 1 means the specimen
# reached its required strength.
# assign() builds a new frame rather than writing a column onto a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
topVars = topVars.assign(
    MRratio=topVars["SpecimenMeasuredStrength"] / topVars["BatchRequiredStrength"]
)
topVars

Unnamed: 0,SpecimenMeasuredStrength,SpecimenAgeTested,SiteTemperature,BatchUnitWeight,BatchRequiredStrength,BatchTemperature,RS_binned,MRratio
0,9860,7,79.0,149.60,8000,89.0,7001-10000,1.232500
1,10360,28,79.0,149.60,8000,89.0,7001-10000,1.295000
2,10300,28,79.0,149.60,8000,89.0,7001-10000,1.287500
3,10140,28,79.0,149.60,8000,89.0,7001-10000,1.267500
4,4880,1,70.0,150.90,7200,80.0,7001-10000,0.677778
...,...,...,...,...,...,...,...,...
2843,4940,28,44.0,152.22,4000,76.0,2500-5000,1.235000
2844,3710,7,44.0,153.93,4000,73.0,2500-5000,0.927500
2845,5100,28,44.0,153.93,4000,73.0,2500-5000,1.275000
2846,5320,28,44.0,153.93,4000,73.0,2500-5000,1.330000


In [None]:
# Distinct ratio values, largest first.
sorted(topVars['MRratio'].unique(), reverse=True)

[2.906,
 2.892,
 2.654,
 2.566,
 2.564,
 2.522,
 2.478,
 2.47,
 2.358,
 2.256,
 2.2,
 2.1766666666666667,
 2.172,
 2.16,
 2.134,
 2.1,
 2.036,
 2.0,
 1.986,
 1.9125,
 1.91,
 1.8925,
 1.876,
 1.866,
 1.854,
 1.8,
 1.7775,
 1.768,
 1.7475,
 1.726,
 1.72,
 1.6933333333333334,
 1.678,
 1.676,
 1.67,
 1.664,
 1.6583333333333334,
 1.658,
 1.6444444444444444,
 1.638,
 1.614,
 1.6133333333333333,
 1.61,
 1.59,
 1.5866666666666667,
 1.5766666666666667,
 1.5763888888888888,
 1.558,
 1.55,
 1.546,
 1.544,
 1.5433333333333332,
 1.54,
 1.537,
 1.53,
 1.5233333333333334,
 1.523,
 1.521,
 1.516,
 1.5133333333333334,
 1.51,
 1.5066666666666666,
 1.504,
 1.5,
 1.497,
 1.4933333333333334,
 1.49,
 1.489,
 1.488,
 1.4875,
 1.4866666666666666,
 1.484,
 1.4833333333333334,
 1.483,
 1.4825,
 1.48,
 1.4791666666666667,
 1.476388888888889,
 1.476,
 1.475,
 1.474,
 1.473,
 1.4725,
 1.472,
 1.47,
 1.468,
 1.4652777777777777,
 1.4633333333333334,
 1.461,
 1.46,
 1.458,
 1.4566666666666668,
 1.454,
 1.453,
 1.4525

### 3. Site Temperature

In [None]:
# Distinct site temperatures in ascending order.
sorted(topVars['SiteTemperature'].unique())

[24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 888.0]

In [None]:
# Remove impossible readings (the 888.0 value in the listing above).
# .copy() detaches the filtered frame so that the column assignment below does
# not raise SettingWithCopyWarning.
topVars = topVars[topVars['SiteTemperature'] <= 100].copy()

# Site-temperature bins; pd.cut intervals are right-closed:
# (0, 40], (40, 60], (60, 80], (80, 100].
ST_bins = [0, 40, 60, 80, 100]
ST_binslabels = ["0-40", "40-60", "60-80", "80-100"]
topVars['ST_binned'] = pd.cut(topVars['SiteTemperature'], 
                              bins=ST_bins, 
                              labels=ST_binslabels)

topVars

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topVars['ST_binned'] = pd.cut(topVars['SiteTemperature'],


Unnamed: 0,SpecimenMeasuredStrength,SpecimenAgeTested,SiteTemperature,BatchUnitWeight,BatchRequiredStrength,BatchTemperature,RS_binned,MRratio,ST_binned
0,9860,7,79.0,149.60,8000,89.0,7001-10000,1.232500,60-80
1,10360,28,79.0,149.60,8000,89.0,7001-10000,1.295000,60-80
2,10300,28,79.0,149.60,8000,89.0,7001-10000,1.287500,60-80
3,10140,28,79.0,149.60,8000,89.0,7001-10000,1.267500,60-80
4,4880,1,70.0,150.90,7200,80.0,7001-10000,0.677778,60-80
...,...,...,...,...,...,...,...,...,...
2843,4940,28,44.0,152.22,4000,76.0,2500-5000,1.235000,40-60
2844,3710,7,44.0,153.93,4000,73.0,2500-5000,0.927500,40-60
2845,5100,28,44.0,153.93,4000,73.0,2500-5000,1.275000,40-60
2846,5320,28,44.0,153.93,4000,73.0,2500-5000,1.330000,40-60


### 4. Batch Temperature

In [None]:
# Distinct batch temperatures in ascending order.
sorted(topVars['BatchTemperature'].unique())

[7.4,
 41.0,
 45.0,
 50.0,
 52.0,
 55.0,
 56.0,
 57.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0]

In [None]:
# Batch-temperature bins, right-closed: (40, 60], (60, 80], (80, 100].
# Values outside that range (e.g. the 7.4 reading in the listing above) are
# left as NaN by pd.cut.
BT_bins = [40, 60, 80, 100]
BT_binslabels = ["40-60", "60-80", "80-100"]
# assign() returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
topVars = topVars.assign(
    BT_binned=pd.cut(
        topVars['BatchTemperature'],
        bins=BT_bins,
        labels=BT_binslabels,
    )
)

topVars

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topVars['BT_binned'] = pd.cut(topVars['BatchTemperature'],


Unnamed: 0,SpecimenMeasuredStrength,SpecimenAgeTested,SiteTemperature,BatchUnitWeight,BatchRequiredStrength,BatchTemperature,RS_binned,MRratio,ST_binned,BT_binned
0,9860,7,79.0,149.60,8000,89.0,7001-10000,1.232500,60-80,80-100
1,10360,28,79.0,149.60,8000,89.0,7001-10000,1.295000,60-80,80-100
2,10300,28,79.0,149.60,8000,89.0,7001-10000,1.287500,60-80,80-100
3,10140,28,79.0,149.60,8000,89.0,7001-10000,1.267500,60-80,80-100
4,4880,1,70.0,150.90,7200,80.0,7001-10000,0.677778,60-80,60-80
...,...,...,...,...,...,...,...,...,...,...
2843,4940,28,44.0,152.22,4000,76.0,2500-5000,1.235000,40-60,60-80
2844,3710,7,44.0,153.93,4000,73.0,2500-5000,0.927500,40-60,60-80
2845,5100,28,44.0,153.93,4000,73.0,2500-5000,1.275000,40-60,60-80
2846,5320,28,44.0,153.93,4000,73.0,2500-5000,1.330000,40-60,60-80


### 5. Age Days

In [None]:
# Number of records at each specimen age.
topVars.groupby(['SpecimenAgeTested'])['SpecimenAgeTested'].count()

SpecimenAgeTested
1      167
2       97
3       57
4       21
5       19
7      678
8        1
14       3
25       1
28    1201
33       3
56     589
90       7
Name: SpecimenAgeTested, dtype: int64

In [None]:
# Fine age bins, right-closed: (0, 7], (7, 14], (14, 28], (28, 56], (56, 90].
AD_bins1 = [0, 7, 14, 28, 56, 90]
AD_binslabels1 = ["0-7", "7-14", "14-28", "28-56", "56-90"]

# Coarse age bins, right-closed: (0, 14], (14, 28], (28, 90].
AD_bins2 = [0, 14, 28, 90]
AD_binslabels2 = ["0-14", "15-28", "29-90"]

# assign() returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning. Keyword order fixes the column order.
topVars = topVars.assign(
    AD_bins1=pd.cut(
        topVars['SpecimenAgeTested'],
        bins=AD_bins1,
        labels=AD_binslabels1,
    ),
    AD_bins2=pd.cut(
        topVars['SpecimenAgeTested'],
        bins=AD_bins2,
        labels=AD_binslabels2,
    ),
)

topVars

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topVars['AD_bins1'] = pd.cut(topVars['SpecimenAgeTested'],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topVars['AD_bins2'] = pd.cut(topVars['SpecimenAgeTested'],


Unnamed: 0,SpecimenMeasuredStrength,SpecimenAgeTested,SiteTemperature,BatchUnitWeight,BatchRequiredStrength,BatchTemperature,RS_binned,MRratio,ST_binned,BT_binned,AD_bins1,AD_bins2
0,9860,7,79.0,149.60,8000,89.0,7001-10000,1.232500,60-80,80-100,0-7,0-14
1,10360,28,79.0,149.60,8000,89.0,7001-10000,1.295000,60-80,80-100,14-28,15-28
2,10300,28,79.0,149.60,8000,89.0,7001-10000,1.287500,60-80,80-100,14-28,15-28
3,10140,28,79.0,149.60,8000,89.0,7001-10000,1.267500,60-80,80-100,14-28,15-28
4,4880,1,70.0,150.90,7200,80.0,7001-10000,0.677778,60-80,60-80,0-7,0-14
...,...,...,...,...,...,...,...,...,...,...,...,...
2843,4940,28,44.0,152.22,4000,76.0,2500-5000,1.235000,40-60,60-80,14-28,15-28
2844,3710,7,44.0,153.93,4000,73.0,2500-5000,0.927500,40-60,60-80,0-7,0-14
2845,5100,28,44.0,153.93,4000,73.0,2500-5000,1.275000,40-60,60-80,14-28,15-28
2846,5320,28,44.0,153.93,4000,73.0,2500-5000,1.330000,40-60,60-80,14-28,15-28


# Visualizations

### Plot1: Scatter plot of ratio vs age day with error bars

In [None]:
# Dashed horizontal reference rule at y = 1, layered onto the ratio plots below.
line = (
    alt.Chart(pd.DataFrame({'y': [1]}))
    .mark_rule(strokeDash=[5, 3], opacity=0.3)
    .encode(y='y')
)

In [None]:
# Mean +/- one standard deviation of MRratio per specimen age, 2500-5000 bin.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_rs5 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '2500-5000')
    .transform_aggregate(
        mean='mean(MRratio)',
        stdev='stdev(MRratio)',
        groupby=['SpecimenAgeTested'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)

In [None]:
# Tick at the mean MRratio for each specimen age, 2500-5000 bin.
means_rs5 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '2500-5000')
    .mark_tick(size=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('mean(MRratio):Q'),
    )
)

In [None]:
# Base chart for the 2500-5000 PSI panel: carries the panel title and the axis
# titles, and is the source of the regression line built from it.
# Uses mark_circle like the other three base_ra_* charts (it previously used
# mark_tick, which was inconsistent with its siblings).
base_ra_rs5 = alt.Chart(topVars,
          title="2500-5000 PSI"
).transform_filter(
    datum.RS_binned == '2500-5000'
).mark_circle(
).encode(
    x=alt.X('SpecimenAgeTested:Q', title = 'Specimen Age (Days)'),
    y=alt.Y('MRratio:Q', title = 'Measured to Required Strength Ratio')
)

In [None]:
# Layer order: deviation bars, mean ticks, log-regression line, y = 1 rule.
# mark_line replaces the base chart's mark, so only the fitted curve is drawn.
ra_rs5_plot = (
    error_bars_rs5
    + means_rs5
    + base_ra_rs5.transform_regression(
        on='SpecimenAgeTested',
        regression='MRratio',
        method="log",
    ).mark_line(size=1.7)
    + line
)

In [None]:
# Mean +/- one standard deviation of MRratio per specimen age, 5001-7000 bin.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_rs7 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '5001-7000')
    .transform_aggregate(
        mean='mean(MRratio)',
        stdev='stdev(MRratio)',
        groupby=['SpecimenAgeTested'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)


In [None]:
# Tick at the mean MRratio for each specimen age, 5001-7000 bin.
means_rs7 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '5001-7000')
    .mark_tick(size=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('mean(MRratio)'),
    )
)

In [None]:
# Base chart for the 5001-7000 PSI panel; the y title is suppressed because the
# first panel already carries it.
base_ra_rs7 = (
    alt.Chart(topVars, title="5001-7000 PSI")
    .transform_filter(datum.RS_binned == '5001-7000')
    .mark_circle()
    .encode(
        alt.X('SpecimenAgeTested:Q', title='Specimen Age (Days)'),
        alt.Y('MRratio:Q', title=None),
    )
)

In [None]:
# Layer order: deviation bars, mean ticks, log-regression line, y = 1 rule.
ra_rs7_plot = (
    error_bars_rs7
    + means_rs7
    + base_ra_rs7.transform_regression(
        on='SpecimenAgeTested',
        regression='MRratio',
        method="log",
    ).mark_line(size=1.7)
    + line
)

In [None]:
# Mean +/- one standard deviation of MRratio per specimen age, 7001-10000 bin.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_rs10 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '7001-10000')
    .transform_aggregate(
        mean='mean(MRratio)',
        stdev='stdev(MRratio)',
        groupby=['SpecimenAgeTested'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)

In [None]:
# Tick at the mean MRratio for each specimen age, 7001-10000 bin.
means_rs10 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '7001-10000')
    .mark_tick(size=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('mean(MRratio)'),
    )
)

In [None]:
# Base chart for the 7001-10000 PSI panel; the y title is suppressed.
base_ra_rs10 = (
    alt.Chart(topVars, title="7001-10000 PSI")
    .transform_filter(datum.RS_binned == '7001-10000')
    .mark_circle()
    .encode(
        alt.X('SpecimenAgeTested:Q', title='Specimen Age (Days)'),
        alt.Y('MRratio:Q', title=None),
    )
)

In [None]:
# Layer order: deviation bars, mean ticks, log-regression line, y = 1 rule.
ra_rs10_plot = (
    error_bars_rs10
    + means_rs10
    + base_ra_rs10.transform_regression(
        on='SpecimenAgeTested',
        regression='MRratio',
        method="log",
    ).mark_line(size=1.7)
    + line
)

In [None]:
# Mean +/- one standard deviation of MRratio per specimen age, 10001-12000 bin.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_rs12 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '10001-12000')
    .transform_aggregate(
        mean='mean(MRratio)',
        stdev='stdev(MRratio)',
        groupby=['SpecimenAgeTested'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)

In [None]:
# Tick at the mean MRratio for each specimen age, 10001-12000 bin.
means_rs12 = (
    alt.Chart(topVars)
    .transform_filter(datum.RS_binned == '10001-12000')
    .mark_tick(size=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('SpecimenAgeTested:Q'),
        alt.Y('mean(MRratio)'),
    )
)

In [None]:
# Base chart for the 10001-12000 PSI panel; the y title is suppressed.
base_ra_rs12 = (
    alt.Chart(topVars, title="10001-12000 PSI")
    .transform_filter(datum.RS_binned == '10001-12000')
    .mark_circle()
    .encode(
        alt.X('SpecimenAgeTested:Q', title='Specimen Age (Days)'),
        alt.Y('MRratio:Q', title=None),
    )
)

# Layer order: deviation bars, mean ticks, log-regression line, y = 1 rule.
ra_rs12_plot = (
    error_bars_rs12
    + means_rs12
    + base_ra_rs12.transform_regression(
        on='SpecimenAgeTested',
        regression='MRratio',
        method="log",
    ).mark_line(size=1.7)
    + line
)

In [None]:
# Place the four required-strength panels side by side on shared x and y scales.
# The configure_* calls set independent config sections, so their order is free.
(
    alt.hconcat(ra_rs5_plot, ra_rs7_plot, ra_rs10_plot, ra_rs12_plot)
    .resolve_scale(x='shared', y='shared')
    .configure_axisY(titleAlign="center", titleX=-30)
    .configure_axis(titleFontSize=16)
    .configure_title(fontSize=18)
)

Here we provide an overall picture of how the specimens cure over time. We plot the measured-to-required strength ratio against specimen age, with one panel per range of required strength. The curves are logarithmic regression lines, which is the standard relationship between these two features. The blue ticks mark the mean ratio at each specimen age within that required-strength range, and the bars show one standard deviation on either side of the mean.

Our observation is that specimens with a required strength of 10,000 psi or less behave similarly, with most of them meeting the required strength within the first 28 days. In contrast, specimens with a required strength above 10,000 psi need at least 56 days to meet the required strength. However, if the requirement is that this ratio exceed 1.3, the plot suggests allowing more days for the concrete to cure: the majority of specimens with a required strength of 10,000 psi or less do not reach the 1.3 ratio within the first 28 days, and those with a required strength above 10,000 psi do not reach it by the 56th day.


### Poster version

In [None]:
# Display order of the required-strength bins (also reused by later cells).
order = ["2500-5000", "5001-7000", "7001-10000", "10001-12000"]

# Base chart for the poster figure. The calculated "order" field is the negated
# index of each row's bin label within `order`; it feeds the order channel.
base_d_p = (
    alt.Chart(
        topVars,
        title="Measured to Required Strength Ratio vs Specimen Age",
    )
    .transform_calculate(order=f"-indexof({order}, datum.RS_binned)")
    .mark_circle(clip=True)
    .encode(
        alt.X('SpecimenAgeTested:Q', title='Specimen Age (Days)'),
        alt.Y(
            'MRratio:Q',
            title='Measured to Required Strength Ratio',
            scale=alt.Scale(domain=(0.4, 1.4)),
        ),
        alt.Color(
            'RS_binned:N',
            sort=order,
            title="Required Strength (PSI)",
            legend=alt.Legend(
                orient='none',
                legendX=240,
                legendY=240,
                titleAnchor='middle',
            ),
        ),
        alt.Order("order:Q"),
    )
)

In [None]:
# Dashed rule at ratio = 1.3. It is encoded on y only, so despite the "v" in
# its name it is drawn as a horizontal line.
linev = (
    alt.Chart(pd.DataFrame({'y': [1.3]}))
    .mark_rule(strokeDash=[5, 3], opacity=0.5)
    .encode(y='y')
)

In [None]:
# Dashed rules at day 28 and day 56. They are encoded on x only, so despite the
# "h" in their names they are drawn as vertical lines.
lineh28 = (
    alt.Chart(pd.DataFrame({'x': [28]}))
    .mark_rule(strokeDash=[5, 3], opacity=0.5)
    .encode(x='x')
)

lineh56 = (
    alt.Chart(pd.DataFrame({'x': [56]}))
    .mark_rule(strokeDash=[5, 3], opacity=0.5)
    .encode(x='x')
)

In [None]:
# One log-regression curve per required-strength bin, plus the reference rules
# at ratio 1 and 1.3 and at days 28 and 56.
# NOTE(review): mark_line replaces the base chart's mark definition, so the
# clip=True set on base_d_p is presumably not carried over to the lines;
# confirm the curves stay inside the fixed y domain.
poster = (
    base_d_p.transform_regression(
        on='SpecimenAgeTested',
        regression='MRratio',
        method="log",
        groupby=['RS_binned'],
    ).mark_line(size=3)
    + line
    + linev
    + lineh28
    + lineh56
)

In [None]:
# Poster-sized styling: 400x400 canvas with enlarged axis, legend and title
# fonts. Each configure_* call sets its own config section.
chart = (
    poster
    .properties(width=400, height=400)
    .configure_axis(labelFontSize=20, titleFontSize=20)
    .configure_axisY(titleAlign="center", titleX=-100)
    .configure_legend(titleFontSize=14, labelFontSize=14)
    .configure_title(fontSize=24)
)

In [None]:
# Render the poster chart.
chart

This plot helps compare the logarithmic regressions among the different required-strength ranges. It provides the same insight as the previous plot.

### Plot2:  Required vs Measured Strength on the 7th, 28th, and 56th Days

In [None]:
# Day-7 observations; in this cell the chart serves as the source for the
# linear regression line and supplies the panel and axis titles.
base_mr7 = (
    alt.Chart(topVars, title="On the 7th Day")
    .transform_filter(datum.SpecimenAgeTested == 7)
    .mark_circle(opacity=0.3)
    .encode(
        alt.X('BatchRequiredStrength:Q', title='Required Strength (PSI)'),
        alt.Y('SpecimenMeasuredStrength:Q', title='Measured Strength (PSI)'),
    )
)

# Mean +/- one standard deviation of measured strength per required strength.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_7 = (
    alt.Chart(topVars)
    .transform_filter(datum.SpecimenAgeTested == 7)
    .transform_aggregate(
        mean='mean(SpecimenMeasuredStrength)',
        stdev='stdev(SpecimenMeasuredStrength)',
        groupby=['BatchRequiredStrength'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('BatchRequiredStrength:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)

# Horizontal tick at the mean measured strength for each required strength.
means_7 = (
    alt.Chart(topVars)
    .transform_filter(datum.SpecimenAgeTested == 7)
    .transform_aggregate(
        count='count()',
        mean='mean(SpecimenMeasuredStrength)',
        groupby=['BatchRequiredStrength'],
    )
    .mark_tick(orient="horizontal", width=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('BatchRequiredStrength:Q'),
        alt.Y('mean:Q'),
    )
)

# Two end points of the y = x reference line (measured == required); the
# per-day cells below reuse xyline_plot.
xyline = pd.DataFrame({'dx': [0, 16000], 'dy': [0, 16000]})

xyline_plot = (
    alt.Chart(xyline)
    .mark_line(color='black', opacity=0.3, size=1.5, strokeDash=[5, 3])
    .encode(x='dx', y='dy')
)

# Layer order: deviation bars, mean ticks, linear regression line, y = x line.
mr7_plot = (
    error_bars_7
    + means_7
    + base_mr7.transform_regression(
        on='BatchRequiredStrength',
        regression='SpecimenMeasuredStrength',
    ).mark_line(size=2, opacity=0.8, color='black')
    + xyline_plot
)

In [None]:
# Day-28 observations; in this cell the chart serves as the source for the
# linear regression line and supplies the panel and axis titles.
# Uses mark_circle(opacity=0.3) like base_mr7 and base_mr56. The previous
# light-blue mark_rule settings were a copy of the error-bar mark.
base_mr28 = alt.Chart(topVars,
          title="On the 28th Day"
).transform_filter(
    datum.SpecimenAgeTested == 28
).mark_circle(
    opacity = 0.3
).encode(
    x=alt.X('BatchRequiredStrength:Q', title = 'Required Strength (PSI)'),
    y=alt.Y('SpecimenMeasuredStrength:Q', title = None)
)

# Mean +/- one standard deviation of measured strength per required strength.
error_bars_28 = (alt.Chart(
    topVars
).transform_filter(
    datum.SpecimenAgeTested == 28
).mark_rule(
    size = 10,
    opacity = 1,
    color = 'lightblue'
).encode(
    x='BatchRequiredStrength:Q',
    y='error_lower:Q',
    y2='error_upper:Q'
).transform_aggregate(
    mean = 'mean(SpecimenMeasuredStrength)',
    stdev = 'stdev(SpecimenMeasuredStrength)',
    groupby=['BatchRequiredStrength']
).transform_calculate(
    error_lower = 'datum.mean - 1 * datum.stdev',
    error_upper = 'datum.mean + 1 * datum.stdev')
)

# Horizontal tick at the mean measured strength for each required strength.
means_28 = alt.Chart(
    topVars
).transform_filter(
    datum.SpecimenAgeTested == 28
).transform_aggregate(
    count='count()',
    mean = 'mean(SpecimenMeasuredStrength)',
    groupby=['BatchRequiredStrength']
).mark_tick(
    orient = "horizontal",
    width = 15,
    thickness = 3,
    opacity = 0.8
).encode(
    x='BatchRequiredStrength:Q',
    y= 'mean:Q'
)

# Layer order: deviation bars, mean ticks, linear regression line, y = x line.
mr28_plot = error_bars_28 + means_28 + base_mr28.transform_regression(
    'BatchRequiredStrength', 
    'SpecimenMeasuredStrength'
).mark_line(size=2, opacity = 0.8, color='black') + xyline_plot


In [None]:
# Day-56 observations; in this cell the chart serves as the source for the
# linear regression line and supplies the panel and axis titles.
base_mr56 = (
    alt.Chart(topVars, title="On the 56th Day")
    .transform_filter(datum.SpecimenAgeTested == 56)
    .mark_circle(opacity=0.3)
    .encode(
        alt.X('BatchRequiredStrength:Q', title='Required Strength (PSI)'),
        alt.Y('SpecimenMeasuredStrength:Q', title=None),
    )
)

# Mean +/- one standard deviation of measured strength per required strength.
# Transform order is unchanged: filter -> aggregate -> calculate.
error_bars_56 = (
    alt.Chart(topVars)
    .transform_filter(datum.SpecimenAgeTested == 56)
    .transform_aggregate(
        mean='mean(SpecimenMeasuredStrength)',
        stdev='stdev(SpecimenMeasuredStrength)',
        groupby=['BatchRequiredStrength'],
    )
    .transform_calculate(
        error_lower='datum.mean - 1 * datum.stdev',
        error_upper='datum.mean + 1 * datum.stdev',
    )
    .mark_rule(size=10, opacity=1, color='lightblue')
    .encode(
        alt.X('BatchRequiredStrength:Q'),
        alt.Y('error_lower:Q'),
        alt.Y2('error_upper:Q'),
    )
)

# Horizontal tick at the mean measured strength for each required strength.
means_56 = (
    alt.Chart(topVars)
    .transform_filter(datum.SpecimenAgeTested == 56)
    .transform_aggregate(
        count='count()',
        mean='mean(SpecimenMeasuredStrength)',
        groupby=['BatchRequiredStrength'],
    )
    .mark_tick(orient="horizontal", width=15, thickness=3, opacity=0.8)
    .encode(
        alt.X('BatchRequiredStrength:Q'),
        alt.Y('mean:Q'),
    )
)

# Layer order: deviation bars, mean ticks, linear regression line, y = x line.
mr56_plot = (
    error_bars_56
    + means_56
    + base_mr56.transform_regression(
        on='BatchRequiredStrength',
        regression='SpecimenMeasuredStrength',
    ).mark_line(size=2, opacity=0.8, color='black')
    + xyline_plot
)

In [None]:
# Show the 7-, 28- and 56-day panels side by side (| concatenates horizontally).
mr7_plot | mr28_plot | mr56_plot

Since the 7th, 28th, and 56th days are the standard test days and have the most data points, we have closely studied the measured strength vs required strength on these days. The linear regression lines indicate the trend, while the blue ticks represent the mean measured strength for each required strength, and the bars show one standard deviation on either side of the mean.

Our visualization reveals that only specimens with very low required strength could meet the required strength on the 7th day. Most specimens would meet the required strength on the 28th day, except for those with the required strength of 12 thousand. The specimens continue to cure and attain higher strength, and we can expect all of them, regardless of their required strength, to exceed the required strength on the 56th day.


### Plot3: Ratio vs Batch Temperature by Binned Age Days, Colored by Required Strength

In [None]:
# Error band of MRratio (mean +/- one standard deviation) for day-28 specimens.
# NOTE(review): no later cell visible here references error_bars_st28.
# NOTE(review): the aggregate below has no groupby, so it presumably collapses
# the data to a single row holding only `mean` and `stdev`; the x
# (BatchTemperature) and order encodings would then have no field to read.
# Confirm whether this chart is still needed, or whether a groupby on
# BatchTemperature was intended.
error_bars_st28 = alt.Chart(
    topVars
).transform_calculate(
  order=f"-indexof({order}, datum.RS_binned)"
).transform_filter(
    datum.SpecimenAgeTested == 28
).mark_errorband(
).encode(
    x='BatchTemperature:Q',
    y='error_lower:Q',
    y2='error_upper:Q',
    order="order:Q"
).transform_aggregate(
    mean = 'mean(MRratio)',
    stdev = 'stdev(MRratio)'
).transform_calculate(
    error_lower = 'datum.mean - 1 * datum.stdev',
    error_upper = 'datum.mean + 1 * datum.stdev')



# Ratio vs batch temperature for specimens in the "0-14" age bin, colored by
# required-strength bin. Rows with BatchTemperature below 40 are filtered out.
base_BT_AD14 = (
    alt.Chart(topVars, title="0-14 Days")
    .transform_filter(datum.AD_bins2 == '0-14')
    .transform_filter(datum.BatchTemperature >= 40)
    .mark_circle(opacity=0.3, clip=True)
    .encode(
        alt.X(
            'BatchTemperature:Q',
            title='BatchTemperature(F)',
            scale=alt.Scale(domain=(40, 100)),
        ),
        alt.Y(
            'MRratio:Q',
            title='Measured to Required Strength Ratio',
            scale=alt.Scale(domain=(0, 3.0)),
        ),
        alt.Color('RS_binned:N', sort=order),
    )
)

# Scatter plus one linear regression line per required-strength bin.
BT_AD14_plot = base_BT_AD14 + base_BT_AD14.transform_regression(
    on='BatchTemperature',
    regression='MRratio',
    groupby=['RS_binned'],
).mark_line(size=2, opacity=0.7)


In [None]:
# Ratio vs batch temperature for specimens in the "15-28" age bin, colored by
# required-strength bin. Rows with BatchTemperature below 40 are filtered out.
base_BT_AD28 = (
    alt.Chart(topVars, title="15-28 Days")
    .transform_filter(datum.AD_bins2 == '15-28')
    .transform_filter(datum.BatchTemperature >= 40)
    .mark_circle(opacity=0.3, clip=True)
    .encode(
        alt.X(
            'BatchTemperature:Q',
            title='BatchTemperature(F)',
            scale=alt.Scale(domain=(40, 100)),
        ),
        alt.Y(
            'MRratio:Q',
            title=None,
            scale=alt.Scale(domain=(0, 3.0)),
        ),
        alt.Color('RS_binned:N', sort=order),
    )
)

# Scatter plus one linear regression line per required-strength bin.
BT_AD28_plot = base_BT_AD28 + base_BT_AD28.transform_regression(
    on='BatchTemperature',
    regression='MRratio',
    groupby=['RS_binned'],
).mark_line(size=2, opacity=0.7)

In [None]:
# Ratio vs batch temperature for specimens in the "29-90" age bin, colored by
# required-strength bin. This panel also carries the manually positioned legend.
base_BT_AD90 = (
    alt.Chart(topVars, title="29-90 Days")
    .transform_filter(datum.AD_bins2 == '29-90')
    .transform_filter(datum.BatchTemperature >= 40)
    .mark_circle(opacity=0.3, clip=True)
    .encode(
        alt.X(
            'BatchTemperature:Q',
            title='BatchTemperature(F)',
            scale=alt.Scale(domain=(40, 100)),
        ),
        alt.Y(
            'MRratio:Q',
            title=None,
            scale=alt.Scale(domain=(0, 3.0)),
        ),
        alt.Color(
            'RS_binned:N',
            sort=order,
            title="Required Strength (PSI)",
            legend=alt.Legend(
                orient='none',
                legendX=10,
                legendY=20,
                titleAnchor='middle',
            ),
        ),
    )
)

# Scatter plus one linear regression line per required-strength bin.
BT_AD90_plot = base_BT_AD90 + base_BT_AD90.transform_regression(
    on='BatchTemperature',
    regression='MRratio',
    groupby=['RS_binned'],
).mark_line(size=2, opacity=0.7)

In [None]:
# Show only the per-bin regression lines for the three age panels, side by
# side. The scatter layers in the BT_AD*_plot charts are not used here.
(
    base_BT_AD14.transform_regression(
        on='BatchTemperature',
        regression='MRratio',
        groupby=['RS_binned'],
    ).mark_line(size=2, opacity=0.7)
    | base_BT_AD28.transform_regression(
        on='BatchTemperature',
        regression='MRratio',
        groupby=['RS_binned'],
    ).mark_line(size=2, opacity=0.7)
    | base_BT_AD90.transform_regression(
        on='BatchTemperature',
        regression='MRratio',
        groupby=['RS_binned'],
    ).mark_line(size=2, opacity=0.7)
)

We then include the third influential feature, the batch temperature, which is the temperature of the specimens themselves. We can observe that the batch temperature has a positive relationship with the ratio in the first 14 days, except for specimens with required strength higher than 10 thousand psi. After that, the batch temperature starts to have a negative relationship with the ratio except for those with required strength between 5 and 7 thousand. 

We learned that the batch temperature is currently being manipulated by adding hot water or ice, but this could potentially have a negative influence on the measured strength. Because data on the water added is missing, this project cannot study these relationships or give advice on when to decrease or increase the batch temperature. We strongly recommend collecting information about the water added in the future, so that future studies can discover the relationship among water added, batch temperature, and measured strength. Revealing this relationship could provide suggestions for a better curing process in order to achieve higher measured strength within the same number of days.
