# Read Files

Press the >> symbol above to begin or restart, then confirm by pressing "Restart and Run All Cells".

Alternatively, click on the first cell (or on "Read Files") and keep pressing "> Run".

In [1]:
testing = False

In [2]:
%%javascript
// Replace the output area's `_should_scroll` check with a function that always
// answers false, whatever `lines` is.
// NOTE(review): presumably this stops long cell outputs from being placed in a
// scrolling box in the classic Notebook UI — confirm for the front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
# Outside of testing mode, inject a stylesheet that hides every code-input area
# and render a "Show inputs" button that makes them visible again.
if testing == False:
    # `IPython.display` is the public import location; importing `display` from
    # `IPython.core.display` is deprecated.
    from IPython.display import display, HTML
    display(HTML(r"""<style id=hide>div.input{display:none;}</style><button type="button"onclick="var myStyle = document.getElementById('hide').sheet;myStyle.insertRule('div.input{display:inherit !important;}', 0);">Show inputs</button>"""))



In [4]:
import io
import os
from datetime import datetime
from getpass import getpass

import msoffcrypto
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

%matplotlib inline


In [5]:

# Switch into the "ExcelFiles" folder unless the working directory path already
# ends with that name, then print every file below it whose name ends in "xlsx".
cwd = os.getcwd()
if not cwd.endswith("ExcelFiles"):
    os.chdir("ExcelFiles")
print("local Excel Files: \n ")
for subdir, dirs, files in os.walk('./'):
    for file in (entry for entry in files if entry.endswith("xlsx")):
        print (file)

local Excel Files: 
 
CleanedRawDairyData.xlsx
FinalOutput.xlsx
Gallon Per Cow Example.xlsx
Hans Dairy Data 7-19-21.xlsx
July- Gallon Per Cow.xlsx
testdairydata - Copy2.xlsx
testdairydata.xlsx


In [6]:
#example implementation https://mybinder.org/v2/gh/SteveGrey759/MilkOScan/HEAD?filepath=MilkOScan%20V2.ipynb

In [7]:
#What global variables may be desired for future manipulation of this data?
#data grabbing:
#relevant sheet #, 
#data cleaning: 
#just general oversight over how cleaning pipeline process is going, with warnings signs for dramatic errors
#merging data:
#derived columns/frames:
#is days ago really a worthy column? 
#aggregates may take some interesting arguments
#graphs: 
#date ranges, minInclusion stats, graph types, etc
#display variables: "would you like data cleaning process displayed? raw data analytics? etc"


In [8]:
dairyDataScanned = False
RawDairySheetName = "Raw Dairy (V#)"

In [9]:

# Keep asking for a file name and password until the password-protected workbook
# has been decrypted and the sheet named by RawDairySheetName has been read into
# `RawDairy`. In testing mode fixed values are used instead of prompting.
while not dairyDataScanned:
    if testing == False:
        fileName = input("Please enter the DairyData's Excel files full name: ")
    else:
        fileName = "Hans Dairy Data 7-19-21.xlsx"
    fileName = fileName.strip()
    if testing == False:
        # getpass keeps the typed password out of the saved cell output,
        # whereas input() would echo and store it.
        password = getpass("Please enter the Excel password for DairyData: ")
    else:
        password = "replacethistext"

    try:
        # The `with` block closes the workbook handle even when the password is
        # wrong or decryption fails; decryption must finish while it is open.
        with open(fileName, "rb") as encryptedWorkbook:
            file = msoffcrypto.OfficeFile(encryptedWorkbook)
            file.load_key(password=password) # Use password
            decrypted = io.BytesIO()
            file.decrypt(decrypted)
        RawDairy = pd.read_excel(decrypted, sheet_name=RawDairySheetName)
        dairyDataScanned = True
        break
    except Exception as e:
        print(e)
        if testing != False:
            # The hard-coded test values would fail identically on every
            # retry, so surface the error instead of looping forever.
            raise
        print("Please try again, or restart")

KeyboardInterrupt: Interrupted by user

In [None]:
#make all columns lowercase, and strip unecessary whitespaces
RawDairy.columns= RawDairy.columns.str.lower()
RawDairy.columns= RawDairy.columns.str.strip()
print("Found columns: ")
RawDairy.columns

In [None]:
#example of how to drop unnecessary columns
#MilkoscanPage2 = MilkoscanPage2.drop(columns = ['comments'])
#MilkoscanPage2.columns

In [None]:
# Independent snapshot of the un-cleaned data for later reference. `.copy()` is
# needed because a plain assignment would only create a second name for the same
# frame, so in-place changes to RawDairy would also show up here.
RawDairyInit = RawDairy.copy()

# Cleaning raw dairy data: 
### Possible errors: 
* incorrect category (defatted, non V sampleid), 
* slightly incorrect entries (sampleid, date), inconsistent capitalization
* outliers. (explicitly stated, or undetected)
* repeated entries (accidentally entering the same sample id twice, or systematically entering wrong sample IDs for a whole batch, e.g. an off-by-one error)
* Flagging potential errors for later analysis (e.g. two entries are unreasonably similar)

Delete Null/outlier values:
* Some columns have missing entries, or text such as "#N/A" or "Outlier"
* after converting those entries into a standard null value, we can display the number of nulls per column

In [None]:
# Turn the textual markers "#N/A" and "Outlier" into real missing values, then
# show how many missing values each column holds.
RawDairy = RawDairy.replace(['#N/A', 'Outlier'], np.nan)
print("Number of explicit null values by column")
RawDairy.isnull().sum()

### Delete all null entries from a specific column

In [None]:
def dropNullsFromColumn(df, column):
    """Return the rows of `df` whose entry in `column` is not null.

    :param df: DataFrame to filter.
    :param column: name of the column that must be non-null.
    :return: the filtered selection of `df`; row labels are kept as they were.
    """
    hasValue = df[column].notna()
    return df[hasValue]

In [None]:
#dropNullsFromColumn(df = MilkoscanPage2, column = "moisture, %").isnull().sum() #it seems to work. 
# Columns that are allowed to stay empty; every other column must hold a value.
acceptibleNullColumns = ["comments","exampletext"]

# Remove, one required column at a time, the rows that have no value in it.
for column in [label for label in RawDairy.columns if label not in acceptibleNullColumns]:
    RawDairy = dropNullsFromColumn(df = RawDairy, column = column)
print("Null Entries in specific categories have been removed: ")
RawDairy.isnull().sum() 

### convert columns with small errors to correct type. delete outliers

In [None]:
def coercedateTime(df, columnName):
    """Convert one column of `df` to pandas datetimes.

    Entries that cannot be parsed become null (errors="coerce"). A line is
    printed reporting how many additional nulls the conversion produced.

    :param df: DataFrame holding the column.
    :param columnName: name of the column to convert.
    :return: the converted column; `df` itself is not modified.
    """
    preNull = df[columnName].isnull().sum()
    dateTime = pd.to_datetime(df[columnName], errors = "coerce")
    createdNulls = dateTime.isnull().sum() - preNull
    # Name the column actually being converted: the message used to say
    # "analysis date/time" even when another column was passed in.
    print("coercing", columnName, "to datetime format has caused", createdNulls, "Number of null values")
    return dateTime
#test.isnull().values.sum()
RawDairy['analysis date/time'] = coercedateTime(df = RawDairy, columnName = 'analysis date/time')





In [None]:
def coerceNumericOrString(df, columnsToNumeric):
    """Return a copy of `df` with numeric and text columns normalised.

    Columns named in `columnsToNumeric` are converted with pd.to_numeric
    (unparseable entries become null, result downcast to a float type).
    Every other column of object dtype is converted to text, stripped of
    surrounding whitespace and lower-cased. Columns of any other dtype are
    left untouched.

    :param df: DataFrame to convert; it is not modified.
    :param columnsToNumeric: list of column names to convert to numbers.
    :return: the converted copy.
    """
    # Work on a copy: the caller often passes a filtered selection of another
    # frame, and writing columns into it in place would silently change (or
    # warn about) data the caller still holds.
    df = df.copy()
    for column in df.columns:
        if column in columnsToNumeric:
            print(column, "converted to float")
            df[column] = pd.to_numeric(df[column], errors='coerce', downcast='float')
        elif df[column].dtypes != "O":
            # already coerced into timestamp or another non-object dtype
            continue
        else:
            # NOTE(review): astype(str) may turn missing values into the text
            # "nan" depending on the pandas version — confirm if nulls matter here.
            df[column] = df[column].astype(str).str.strip().str.lower()
    return df
RawDairy = coerceNumericOrString(df = RawDairy, columnsToNumeric = ['igg, as-is %', "moisture, %", "igg, dry base %"])

Nulls created by coercing to correct type:

In [None]:
RawDairy.isnull().sum() 

In [None]:
# For every required column that picked up nulls during type coercion, print the
# matching rows as they looked in the un-cleaned snapshot (RawDairyInit).
acceptibleNullColumns = ["comments","exampletext"]
print("After Coercing igg, moisture, and dry igg%, to integers, these are the anomalous entries for deletion: ")
for column in RawDairy.columns:
    if column not in acceptibleNullColumns and RawDairy[column].isnull().sum() :
        # The values collected here are row labels, so look them up with .loc.
        # .iloc treats them as positions and only returns the same rows while
        # RawDairyInit has a default 0..n-1 index.
        anomalousLabels = RawDairy.index[RawDairy[column].isnull()]
        print(RawDairyInit.loc[anomalousLabels])




### Again, drop all unacceptable nulls

In [None]:
# Second pass after type coercion: rows with a null in any column other than
# "comments" / "exampletext" are removed, then the remaining null counts are shown.
for column in RawDairy.columns:
    if column in ["comments","exampletext"]:
        continue
    RawDairy = dropNullsFromColumn(df = RawDairy, column = column)
RawDairy.isnull().sum() 

### Delete incorrect category

#### Removes all entries whose sample id does not start with a V or is shorter than 5 characters. Longer sample ids are truncated to their first 5 characters

In [None]:
MinimumSampleIdLength = 5
MaximumSampleIdLength = 6

def filterNonRaw(x):
    """Validate one sample id and return its normalised form.

    :param x: the sample id to check.
    :return: np.nan when `x` is not a string, is empty, does not start with
        "v"/"V", or is shorter than MinimumSampleIdLength; otherwise the first
        MaximumSampleIdLength-1 characters of `x`.
    """
    # Non-strings (e.g. a leftover NaN) and empty strings used to raise on
    # x[0]; they are simply invalid ids.
    if not isinstance(x, str) or not x:
        return np.nan
    if x[0].lower() != "v":
        return np.nan
    if len(x) < MinimumSampleIdLength:
        return np.nan
    return x[:MaximumSampleIdLength-1]
result = [filterNonRaw(x) for x in RawDairy['sample id']]
#RawDairy['sample id'] = result

In [None]:
#a sample of wrong ID samples
printThisMany= 10

print("there are:", RawDairy[RawDairy['sample id'] != result].shape[0], "possible misclassifications")
RawDairy[RawDairy['sample id'] != result].head(printThisMany)

In [None]:
RawDairy['sample id'] = result

In [None]:
print(RawDairy['sample id'].isnull().sum(), "samples have invalid sample IDs")

In [None]:
RawDairy = dropNullsFromColumn(df = RawDairy, column = 'sample id')
#deleted entries with invalid sample IDs

### delete duplicate entries

In [None]:
# Flag probable duplicate entries: walk the rows in time order and mark a row
# when one of the most recently kept rows has the same name and a timestamp less
# than `suspiciousMinutes` minutes away.
RawDairy = RawDairy.sort_values(by=['analysis date/time'])
capacity = 5 # maximum number of recent entries remembered in popList
suspiciousMinutes = 10 #minutes
popList = [] #[[name, date], [name, date]] #recent non-duplicate rows; the oldest is dropped once there are more than `capacity`
duplicates = [] #positions (0-based, in the sorted frame) of rows flagged as duplicates, e.g. [412, 443, ...]

for row, index in zip(RawDairy.itertuples(), range(len(RawDairy))) :
    # itertuples() puts the row label at position 0, so these are the 6th, 8th
    # and 10th columns of RawDairy.
    # NOTE(review): presumably analysis date/time, sample id and igg as-is —
    # confirm against the actual column order; a reordered sheet would break this.
    date = row[6]
    name = row[8]
    iggAsIs = row[10] # read but not used in this loop
    #print(row) #Pandas(Index=291, _1=Timestamp('2021-04-01 11:28:00'), _2='AZ43101083', _3=2.240000009536743, _4=80.08999633789062, _5=11.25, _6='DELBERT DENURE', _7=100.0, Region='WI')
    for recent in popList:
        if name == recent[0]: 
            if abs((recent[1] - date).total_seconds()) < 60*suspiciousMinutes: 
                duplicates.append(index)
                break
    else:
        # for/else: reached only when the inner loop did not `break`, i.e. the
        # row was not flagged — remember it as a recent entry.
        popList.append([name, date]) 
    if len(popList) > capacity:
        popList = popList[1:]


print("there are ", len(duplicates), "probable duplicates in the data, these will be removed")
print(duplicates[:20])
#print("here are a sample of duplicates")
RawDairy.head(10)

In [None]:
RawDairy = RawDairy.drop(RawDairy.index[duplicates])

### add derived columns: to be implemented later

* Adding a seconds/days ago column, an error flags column, igg standard deviation, sample count...

In [None]:
#adding: potential integer column: errorFlags, If a single entry triggers multiple error flags as analysis progresses, we may #want to delete it. column value is initiated at 0.

In [None]:
def dateTime_to_seconds_from_today(x, now=None):
    """Return how many seconds lie between `x` and a reference time.

    :param x: the earlier datetime/timestamp.
    :param now: reference time; defaults to datetime.now() at call time. Pass
        one fixed value when converting many rows so they all share the same
        reference and the result is reproducible.
    :return: (now - x) in seconds, as a float.
    """
    if now is None:
        now = datetime.now()
    return (now - x).total_seconds()
    

In [None]:
result = [dateTime_to_seconds_from_today(x) for x in RawDairy['analysis date/time']]

In [None]:
RawDairy["seconds ago"] = result
#RawDairy["days ago"] = result

In [None]:
RawDairy["days ago"] = RawDairy["seconds ago"]/(60*60*24)

In [None]:
frequenciesDairyNames = RawDairy['dairy name'].value_counts()
frequenciesDairyNames = frequenciesDairyNames.reset_index()
frequenciesDairyNames.columns = ["dairy name","sample count"]
#frequenciesDairyNames

In [None]:
merger = pd.merge(frequenciesDairyNames, RawDairy)
# merger.shape

In [None]:
RawDairy = merger

In [None]:
iggStandardDev = RawDairy.groupby("dairy name")["igg, as-is %"].agg(['std'])
#iggStandardDev

In [None]:
iggStandardDev = iggStandardDev.reset_index()
iggStandardDev
iggStandardDev.columns = ["dairy name","igg standard dev"]

In [None]:
merger = pd.merge(iggStandardDev, RawDairy)
#merger

In [None]:
RawDairy = merger

In [None]:
RawDairy["error flags"] = 0 #E.g. if we average out potential mix ups, these may be revealed to diverge largely from other data
#which may be considered a second flag, allowing us to remove the datapoint entirely, 

In [None]:
#NOTE: it is more proper to add counts/STD after grouping?  
#the file size isnt exactly an issue, missing values isnt really one either 

## importing and merging gallon per cow

In [None]:
#This does not become useful until we have aggregated samples based off of sampleID

In [None]:
# Same listing as at the top of the notebook: make sure we are inside the
# "ExcelFiles" folder, then print every file below it whose name ends in "xlsx".
cwd = os.getcwd()
if not cwd.endswith("ExcelFiles"):
    os.chdir("ExcelFiles")
print("local Excel Files: \n ")
for subdir, dirs, files in os.walk('./'):
    for file in (entry for entry in files if entry.endswith("xlsx")):
        print (file)

In [None]:
gallonsPerCowScanned = False
gallonsPerCowScannedSheetName = "MachineFriendly"

In [None]:
# Keep asking for a file name and password until the password-protected workbook
# has been decrypted and the sheet named by gallonsPerCowScannedSheetName has
# been read into `gallonsPerCow`. In testing mode fixed values are used instead.
while not gallonsPerCowScanned:
    if testing == False:
        fileName = input("Please enter the GallonsPerCow Excel files full name: ")
    else: 
        fileName = "Gallon Per Cow Example.xlsx"
    fileName = fileName.strip()
    if testing == False:
        # getpass keeps the typed password out of the saved cell output,
        # whereas input() would echo and store it.
        password = getpass("Please enter the Excel password for GallonPerCow: ")
    else: 
        password = "replacethistext"

    try:
        # The `with` block closes the workbook handle even when the password is
        # wrong or decryption fails; decryption must finish while it is open.
        with open(fileName, "rb") as encryptedWorkbook:
            file = msoffcrypto.OfficeFile(encryptedWorkbook)
            file.load_key(password=password) # Use password
            decrypted = io.BytesIO()
            file.decrypt(decrypted)
        gallonsPerCow = pd.read_excel(decrypted, sheet_name=gallonsPerCowScannedSheetName)
        gallonsPerCowScanned = True
        break
    except Exception as e:
        print(e)
        if testing != False:
            # The hard-coded test values would fail identically on every
            # retry, so surface the error instead of looping forever.
            raise
        print("Please try again, or restart")

In [None]:
gallonsPerCow.columns= gallonsPerCow.columns.str.lower()
gallonsPerCow.columns= gallonsPerCow.columns.str.strip()
print("Found columns: ")
gallonsPerCow.columns

In [None]:
gallonsPerCow['date(month)'] = coercedateTime(df = gallonsPerCow, columnName = 'date(month)')

In [None]:
gallonsPerCow = coerceNumericOrString(df = gallonsPerCow, columnsToNumeric = ['total cow herd', 'total gallons'])

In [None]:
for column in gallonsPerCow.columns:
    if column not in []:
        gallonsPerCow = dropNullsFromColumn(df = gallonsPerCow, column = column)
#gallonsPerCow.isnull().sum() 

all nulls dropped 

Gallon per cow example has some fake values appended for testing. namely the last 4 entries

In [None]:
gallonsPerCow.tail(8)

Assume a linear change in total gallons between measurements? Apply it to the RawDairy samples as a new column, estimated gallons per cow (and make the linear estimator flat at both ends of the data).
Also add total cow herd according to this linear estimator. Finally, add region and dairy name.

This is a TODO item, until method is verified by sharon/dave. for now just do something simple, like groupby average

In [None]:
#referenceGroup = gallonsPerCow.groupby(["vendor code"]).mean()
#referenceGroup

In [None]:
#referenceGroup.loc[referenceGroup.index == "v1545"]["total cow herd"][0]

In [None]:
#referenceGroup = gallonsPerCow.groupby(["vendor code"]).mean()
#def retrieveHerdAndGallons(x, referenceGroup):
#    val = referenceGroup.loc[referenceGroup.index == x]["total cow herd"]
#    if val.shape[0] > 0:
#        return val[0]
#    return np.nan
#result = [retrieveHerdAndGallons(x, referenceGroup) for x in RawDairy['sample id']]
#result

In [None]:
#RawDairy["cow herd size"] = result  #on this data there is already cow herd size

In [None]:
#RawDairy.head()

In [None]:
#missingCowHerdEntries = RawDairy["cow herd size"].isnull().sum() 
#print(RawDairy.shape[0]-missingCowHerdEntries, "out of", RawDairy.shape[0], "entries have gallon per cow estimates")

In [None]:
# Per-vendor averages of the numeric columns. numeric_only=True is stated
# explicitly because recent pandas versions no longer skip text columns by
# default and raise when asked to average them.
referenceGroup = gallonsPerCow.groupby(["vendor code"]).mean(numeric_only=True)
def retrieveHerdAndGallons(x, referenceGroup):
    """Look up the "total gallons" value stored for index label `x`.

    :param x: label to look for in `referenceGroup.index`.
    :param referenceGroup: DataFrame with a "total gallons" column.
    :return: the first matching "total gallons" value, or np.nan when no row
        of `referenceGroup` carries the label `x`.
    """
    val = referenceGroup.loc[referenceGroup.index == x, "total gallons"]
    if val.shape[0] > 0:
        # .iloc[0] takes the first match by position; val[0] would be treated
        # as a label lookup on this label-indexed Series.
        return val.iloc[0]
    return np.nan
result = [retrieveHerdAndGallons(x, referenceGroup) for x in RawDairy['sample id']]
#result

In [None]:
RawDairy["gallons this month"] = result  #on this data there is already cow herd size

In [None]:
missingCowHerdEntries = RawDairy["gallons this month"].isnull().sum() 
print(RawDairy.shape[0]-missingCowHerdEntries, "out of", RawDairy.shape[0], "entries have gallon per cow estimates")

## Raw data histograms and scatterplots


columns worth graphing: 
'analysis date/time', 'igg, as-is %', 'moisture, %', 'igg, dry base %'
     

In [None]:
def InverseBadColumns(df, listBadColumns):
    """Return the column names of `df` that are not in `listBadColumns`.

    :param df: DataFrame whose columns are inspected.
    :param listBadColumns: names to leave out.
    :return: list of the remaining column names, in `df`'s column order.
    """
    return [column for column in df.columns if column not in listBadColumns]

In [None]:
RawDairy.columns


In [None]:
listBadColumns = ['location',
       'material', 'category', 'subcategory', 'username', 
       'analysis id',  'comments', 'seconds ago',
       'error flags']

In [None]:
plotRawData = True

In [None]:
#RawDairy.columns
# 'analysis id', 'sample id', 'comments',
#       'igg, as-is %', 'moisture, %', 'igg, dry base %', 'dairy name',
#       'cow herd', 'region', 'days ago', 

In [None]:
if plotRawData == True:
    fig = px.scatter(RawDairy, x="igg, as-is %", y = "igg, dry base %",trendline="ols")
    fig.show()

In [None]:
if plotRawData == True:
    sns.set(rc={'figure.figsize':(11.7,8.27)})

In [None]:
#sns.scatterplot(data=RawDairy, x="moisture, %", y = "igg, as-is %", hue ="days ago", legend = "brief")

fig = px.scatter(RawDairy, x="moisture, %", y = "igg, as-is %", trendline="ols")
fig.show()

In [None]:
#sns.scatterplot(data=RawDairy, x="moisture, %", y = "igg, dry base %", hue ="days ago", legend = "brief")

In [None]:
fig = px.scatter(RawDairy, x="moisture, %", y = "igg, dry base %", trendline="ols")
fig.show()
#note, dry base igg is a derived column from moisture and igg as is, and is not directly measured. 

In [None]:
fig = px.scatter(RawDairy,  x="days ago", y = "igg, as-is %",  marginal_y="violin",
           marginal_x="violin", trendline="ols", template="simple_white", title="Overall Trend of IGG as is %")
fig.show()


#if plotRawData == True:
 #   sns.scatterplot(data=RawDairy, x="days ago", y = "igg, as-is %",)

In [None]:
#while not gallonsPerCowScanned:
#    requestRawPlots = input("Would you like the histograms and scatterplots of #the cleaned raw data?")
#    try:
#        gallonsPerCowScanned = True
#        break
#    except Exception as e: 
#        print(e)
#        print("Please try again, or restart")

#requestRawPlots = input("Would you like the histograms and scatterplots of the cleaned raw data?")

## gallons per cow data
### note: may represent only half of the data, there also may be large variation in gallons per month estimates

In [None]:
RawDairyWithGallons = dropNullsFromColumn(df = RawDairy, column = "gallons this month")

In [None]:
print(len(RawDairyWithGallons["dairy name"].unique()), "out of", len(RawDairy["dairy name"].unique()), "dairies have gallon data")

In [None]:
# RawDairyWithGallons is a filtered selection of RawDairy, so build a new frame
# with .assign instead of writing a column into that selection (which pandas
# flags with SettingWithCopyWarning).
RawDairyWithGallons = RawDairyWithGallons.assign(
    **{"gallons per cow": RawDairyWithGallons["gallons this month"]/RawDairyWithGallons["cow herd"]*12}
)
#RawDairyWithGallons.head()

Columns worth graphing: 'igg, as-is %', 'moisture, %', 'igg, dry base %', 'dairy name',
       'cow herd', 'region', 'days ago', 'gallons', 'gallons per cow'  (*)gallons per cow * cow herd? 

In [None]:
goodColumns = ['igg, as-is %', 'moisture, %', 'igg, dry base %', 'dairy name', 'cow herd', 'region', 'days ago', 'gallons this month', 'gallons per cow'] 

In [None]:
fig = px.scatter(RawDairyWithGallons, hover_data=goodColumns, title="IGG as is % to Gallons per Cow", x="gallons per cow", y = 'igg, as-is %', color = "region", size = 'cow herd', hover_name="dairy name", trendline="ols")
fig.show()

In [None]:
# One row per dairy holding the mean of each numeric column. numeric_only=True
# is stated explicitly because recent pandas versions raise when asked to
# average text columns instead of skipping them.
RawDairyWithGallonsGroupedByDairy = RawDairyWithGallons.groupby(["dairy name"]).mean(numeric_only=True)
RawDairyWithGallonsGroupedByDairy = RawDairyWithGallonsGroupedByDairy.reset_index()


fig = px.scatter(RawDairyWithGallonsGroupedByDairy, title="IGG as is % to Gallons per Cow (Grouped by Dairy)", color = "igg, dry base %", marginal_x="violin", \
                 x="gallons per cow", y = 'igg, as-is %', size = 'cow herd', \
                 hover_data=["sample count", "igg standard dev"], hover_name="dairy name", trendline="ols")
fig.show()

In [None]:
fig = px.scatter(RawDairyWithGallonsGroupedByDairy, title="IGG as is % to Gallons per Cow (Grouped by Dairy)", color = "igg standard dev", marginal_x="violin", x="gallons per cow", y = 'igg, as-is %', size = 'cow herd', hover_name="dairy name", trendline="ols")
fig.show()

In [None]:
fig = px.scatter(RawDairyWithGallonsGroupedByDairy, title="", color = "sample count", marginal_x="violin", x="gallons per cow", y = 'moisture, %', size = 'cow herd', hover_name="dairy name", trendline="ols")
fig.show()

## Finished with gallon data 
#### (returned to the full dataset)

In [None]:
# One row per dairy holding the mean of each numeric column. numeric_only=True
# is stated explicitly because recent pandas versions raise when asked to
# average text columns instead of skipping them.
RawDairyGroupedByDairy = RawDairy.groupby(["dairy name"]).mean(numeric_only=True)
RawDairyGroupedByDairy = RawDairyGroupedByDairy.reset_index()
#RawDairyGroupedByDairy

In [None]:
#RawDairyGroupedByDairy = pd.merge(RawDairyGroupedByDairy, RawDairy)
#RawDairyGroupedByDairy  #Why does it set sample counts to 1??

In [None]:
fig = px.scatter(RawDairyGroupedByDairy,title = "igg as is % to cow herd size",  x="cow herd", y = 'igg, as-is %', color = "sample count", hover_name="dairy name", trendline="ols")
fig.show()

In [None]:
fig = px.scatter(RawDairyGroupedByDairy, title = "mean values for dairy igg and moisture", x="moisture, %", y = 'igg, as-is %', hover_name="dairy name", trendline="ols", size = "cow herd", color = "sample count")
fig.show()

In [None]:
def filter_by_freq(df: pd.DataFrame, column: str, min_freq: int) -> pd.DataFrame:
    """Keep only the rows whose `column` value occurs often enough.

    :param df: DataFrame to be filtered.
    :param column: Column whose value frequencies decide which rows stay.
    :param min_freq: Smallest number of occurrences a value needs for its rows to be kept.
    :return: The rows of `df` whose `column` value occurs at least `min_freq` times.
    """
    # value_counts() is indexed by the distinct values of the column.
    occurrences = df[column].value_counts()
    kept_values = occurrences.loc[occurrences >= min_freq].index
    row_is_kept = df[column].isin(kept_values)
    return df[row_is_kept]

In [None]:

minInclusion = 2
minInclusionRawDairy = filter_by_freq(RawDairy, column = "dairy name", min_freq = minInclusion)

In [None]:
print("double click a dairy name to see it alone, double click again to revert")
fig = px.scatter(minInclusionRawDairy,  x="days ago", y = "igg, as-is %",
            trendline="ols", color = "dairy name", title="Individual Dairy Trends for IGG as is %")
fig.show()

## Build a Graph

In [None]:


def UserRequest(viableAnswers):
    """Prompt on stdin until the user gives an accepted answer or cancels.

    :param viableAnswers: list of answers that are accepted as typed.
    :return: the accepted answer exactly as typed, or False when the user
        types one of the stop words (compared case-insensitively).
    """
    stopWords = ["escape","finish","end","break","done","stop"]
    print("please enter one of these viable answers, or type finish: ", str(viableAnswers)[1:-1])
    while True:
        reply = input()
        # An exact viable answer wins even if it is also a stop word.
        if reply in viableAnswers:
            return reply
        if reply.lower() in stopWords:
            return False
        print("invalid user input, please try again")

In [None]:
def get_df_name(df):
    """Return the first global variable name that is bound to the object `df`.

    Matching is by identity (`is`), not by equality. Raises IndexError when no
    global name refers to `df`.
    """
    matchingNames = [label for label, value in globals().items() if value is df]
    return matchingNames[0]

In [None]:
def InverseBadColumns(df, listBadColumns):
    """Return the column names of `df` that are not in `listBadColumns`.

    This cell re-defines a function of the same name that appears earlier in
    the notebook.

    :param df: DataFrame whose columns are inspected.
    :param listBadColumns: names to leave out.
    :return: list of the remaining column names, in `df`'s column order.
    """
    return [column for column in df.columns if column not in listBadColumns]

#InverseBadColumns(RawDairy, ["location","material","category",'subcategory', \
#               'username','analysis id', 'sample id', 'comments','seconds ago'])

In [None]:
def drawScatter(df, listOfAttributes, listBadColumns): #e.g. ["dairy name", ""]
    """Draw a plotly scatter plot of `df` with an OLS trendline and show it.

    :param df: DataFrame to plot.
    :param listOfAttributes: four strings [x, y, color, size]; each is a column
        name, or "None" (any capitalisation) to leave that setting unset.
    :param listBadColumns: column names to leave out of the hover data.
    """
    goodColumns = InverseBadColumns(df, listBadColumns)
    # Plot the frame that was passed in; this used to plot the global RawDairy
    # no matter which dataset the caller had selected.
    fig = px.scatter(df, x=isStrNull(listOfAttributes[0]), y = isStrNull(listOfAttributes[1]), \
                     color = isStrNull(listOfAttributes[2]),\
                      size = isStrNull(listOfAttributes[3]), \
                      hover_data=goodColumns, trendline="ols", title = "size = "+listOfAttributes[3])
    fig.show()
    
    #hover_name=listOfAttributes[4]
#listBadColumns = ["location","material","category",'subcategory', \
               #'username','analysis id', 'sample id', 'comments','seconds ago']
####                   x =            y =        color =     size =              hovername    sidechart = violin or box or...
#listOfAttributes = ["sample count", "cow herd", "region", "moisture, %","dairy name", "violin"]
#drawScatter(RawDairy, listOfAttributes, listBadColumns)

In [None]:
def isStrNull(str1):
    """Map the text "none" (in any capitalisation) to None; return any other string unchanged."""
    return None if str1.lower() == "none" else str1


In [None]:
def BuildAScatter(datasets, listBadColumns):
    """Interactively pick a dataset and four plot attributes, then draw the scatter.

    The user first chooses one of `datasets` by its global variable name, then
    an X column, a Y column, a color column and a size column (or "None" for
    any of them). A column that has been chosen is not offered again.

    :param datasets: list of DataFrames; each must be bound to a global name.
    :param listBadColumns: column names that are never offered or shown on hover.
    :return: False when the user cancels at any prompt; None after the plot
        has been drawn.
    """
    print("Select a dataset to graph")
    # Map each dataset's global name to the frame so the answer can be looked
    # up directly.
    datasetsByName = {get_df_name(df): df for df in datasets}
    userInput = UserRequest(list(datasetsByName))
    if userInput is False:
        return False
    chosenDF = datasetsByName[userInput]
    usefulColumns = InverseBadColumns(chosenDF, listBadColumns)

    print("\nuseful columns: ")
    listOfAttributes = []
    prompts = ["Enter an X value",
               "Enter a Y value",
               "Enter a color value, or None",
               "Enter a size value, or None"]
    for prompt in prompts:
        print(prompt)
        userInput = UserRequest(usefulColumns + ["None"])
        if userInput is False:
            # The user cancelled part-way through; stop here rather than hand
            # False to drawScatter, which expects four strings.
            return False
        listOfAttributes.append(userInput)
        if userInput in usefulColumns:
            usefulColumns.remove(userInput)

    #repeat this for x number of scatter inputs, e.g. x val, y val, color, size, error bars, violin/box side plots etc.
    #plug this list of entries into a new function that draws the scatter based off of a list.

    drawScatter(chosenDF, listOfAttributes, listBadColumns)
    
    
print("warning, some entries have missing entries and will throw an error\
 if used, or are invalid value types (e.g. dairy name as an x value)")
BuildAScatter([RawDairy, minInclusionRawDairy, RawDairyWithGallonsGroupedByDairy, RawDairyWithGallons], \
              ["location","material","category",'subcategory', \
               'username','analysis id', 'sample id', 'comments','seconds ago','igg standard dev'])

## Output an excel file with cleaned and derived data

In [None]:
RawDairy.to_excel("FinalOutput.xlsx",
             sheet_name='Sheet_name_1')  

## TODO

create an estimator for each dairy of how gallons per month varies throughout the year, use that to correct the gallons per cow per month estimate.

create several "build your own graph" interfaces with user input.

learn how to use toggles/scales on graphs

estimate land in costs

should merge on analysis ID?

weighted variance? 
