In [90]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [91]:
# Data source: https://www.kaggle.com/datasets/mohitpoudel/honey-production-in-us-20102021
# Comma-delimited file: the first row supplies the column names and the
# first column supplies the row ids used as the index.
honeyDF = pd.read_csv(
    "US_honey_production_dataset.csv",
    sep=",",
    header=0,
    index_col=0,
)

In [92]:
# Preview the first ten rows of the freshly loaded frame.
honeyDF.head(n=10)

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
0,Alabama,9000,54,73000,73000,240.0,1166000,2010
1,Arizona,24000,77,665000,665000,152.0,2809000,2010
2,Arkansas,25000,60,360000,360000,147.0,2205000,2010
3,California,410000,67,6318000,6318000,155.0,42579000,2010
4,Colorado,34000,56,533000,533000,152.0,2894000,2010
5,Florida,200000,69,1794000,1794000,156.0,21528000,2010
6,Georgia,55000,46,152000,152000,167.0,4225000,2010
7,Hawaii,10000,77,239000,239000,275.0,2118000,2010
8,Idaho,97000,27,1179000,1179000,161.0,4217000,2010
9,Illinois,9000,41,92000,92000,278.0,1026000,2010


In [93]:
# Preview the last ten rows of the freshly loaded frame.
honeyDF.tail(n=10)

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
469,SouthDakota,250000,49,5268000,5268000,2.27,27808000,2021
470,Tennessee,8000,56,81000,81000,5.0,2240000,2021
471,Texas,137000,56,384000,384000,2.3,17646000,2021
472,Utah,31000,33,92000,92000,2.18,2230000,2021
473,Vermont,7000,47,76000,76000,3.28,1079000,2021
474,Virginia,6000,40,79000,79000,8.23,1975000,2021
475,Washington,96000,32,1206000,1206000,2.52,7741000,2021
476,WestVirginia,6000,43,136000,136000,4.8,1238000,2021
477,Wisconsin,42000,47,750000,750000,2.81,5547000,2021
478,Wyoming,38000,58,242000,242000,2.07,4562000,2021


In [94]:
# Check the imported CSV for messiness: summary statistics for every column,
# non-numeric ones included (include='all').
honeyDF.describe(include='all')

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
count,479,479.0,479.0,479.0,479.0,479.0,479.0,479.0
unique,41,,,,,,,
top,Alabama,,,,,,,
freq,12,,,,,,,
mean,,66578.2881,53.167015,912772.4,912772.4,173.884614,7832610.0,2015.505219
std,,99696.073858,17.066853,1698062.0,1698062.0,147.478891,12093930.0,3.457372
min,,3000.0,27.0,12000.0,12000.0,1.3,238000.0,2010.0
25%,,10000.0,41.0,90000.0,90000.0,3.85,1611500.0,2012.5
50%,,26000.0,50.0,255000.0,255000.0,192.0,3252000.0,2016.0
75%,,77500.0,62.0,1001500.0,1001500.0,243.5,8350500.0,2018.5


<h1> Replacing Qualitative State Data with Quantitative Data </h1>

In [95]:
# One-hot encoding with pd.get_dummies produced 41 separate columns (too messy),
# so every state name is replaced by a single integer code instead.
# The codes are arbitrary labels that follow the list order below; they do not
# express any ranking between states.
# Assistance on replacement/dummy variables found via
# https://www.geeksforgeeks.org/how-to-convert-categorical-variable-to-numeric-in-pandas/#:~:text=Method%201%3A%20Using%20replace(),education%20levels%20into%20numeric%20terms.
STATE_NAMES = [
    'Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Florida', 'Georgia', 'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'NewJersey',
    'NewMexico', 'NewYork', 'NorthCarolina', 'NorthDakota', 'Ohio', 'Oregon', 'Pennsylvania', 'SouthDakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'WestVirginia', 'Wisconsin',
    'Wyoming', 'SouthCarolina',
]
# Position in STATE_NAMES is the code: 'Alabama' -> 0, ..., 'SouthCarolina' -> 40.
STATE_CODES = {name: code for code, name in enumerate(STATE_NAMES)}

# Assign the encoded column back to the frame. Calling an inplace method on
# honeyDF['state'] is a chained operation that acts on an intermediate object
# and, under pandas Copy-on-Write, leaves honeyDF itself unchanged.
# Values with no entry in STATE_CODES (including integer codes left by an earlier
# run of this cell) pass through untouched, so re-running the cell is harmless.
honeyDF['state'] = honeyDF['state'].map(lambda value: STATE_CODES.get(value, value))

In [96]:
# Re-inspect the first ten rows following the state re-coding step.
honeyDF.head(n=10)

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
0,0,9000,54,73000,73000,240.0,1166000,2010
1,1,24000,77,665000,665000,152.0,2809000,2010
2,2,25000,60,360000,360000,147.0,2205000,2010
3,3,410000,67,6318000,6318000,155.0,42579000,2010
4,4,34000,56,533000,533000,152.0,2894000,2010
5,5,200000,69,1794000,1794000,156.0,21528000,2010
6,6,55000,46,152000,152000,167.0,4225000,2010
7,7,10000,77,239000,239000,275.0,2118000,2010
8,8,97000,27,1179000,1179000,161.0,4217000,2010
9,9,9000,41,92000,92000,278.0,1026000,2010


In [97]:
# Drop any rows containing N/A values.
# dropna() returns a new frame and does not modify its input, so the result is
# assigned back to honeyDF; without the assignment the rows would only be
# missing from the displayed copy.
honeyDF = honeyDF.dropna()
honeyDF


Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
0,0,9000,54,73000,73000,240.00,1166000,2010
1,1,24000,77,665000,665000,152.00,2809000,2010
2,2,25000,60,360000,360000,147.00,2205000,2010
3,3,410000,67,6318000,6318000,155.00,42579000,2010
4,4,34000,56,533000,533000,152.00,2894000,2010
...,...,...,...,...,...,...,...,...
474,35,6000,40,79000,79000,8.23,1975000,2021
475,36,96000,32,1206000,1206000,2.52,7741000,2021
476,37,6000,43,136000,136000,4.80,1238000,2021
477,38,42000,47,750000,750000,2.81,5547000,2021


In [98]:
# Summary statistics for every column of the frame in its current state.
honeyDF.describe(include='all')

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
count,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0
mean,19.776618,66578.2881,53.167015,912772.4,912772.4,173.884614,7832610.0,2015.505219
std,11.85469,99696.073858,17.066853,1698062.0,1698062.0,147.478891,12093930.0,3.457372
min,0.0,3000.0,27.0,12000.0,12000.0,1.3,238000.0,2010.0
25%,9.5,10000.0,41.0,90000.0,90000.0,3.85,1611500.0,2012.5
50%,19.0,26000.0,50.0,255000.0,255000.0,192.0,3252000.0,2016.0
75%,30.0,77500.0,62.0,1001500.0,1001500.0,243.5,8350500.0,2018.5
max,40.0,550000.0,131.0,12995000.0,12995000.0,874.0,83859000.0,2021.0


<h1> Correlation Matrix of the Honey Data: Pairwise Correlations Between Columns </h1>

In [70]:
# Pairwise Pearson correlation between the columns of honeyDF, drawn as a
# heatmap; text_auto=True prints each coefficient inside its cell.
correlation_matrix = honeyDF.corr(method="pearson")
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
)
fig.show()


In [78]:
# Plotting a histogram (not a line chart): rows are binned by colony_number and,
# because a y column is supplied, each bar shows the sum of value_of_prod for
# the rows in that bin (plotly express' default histfunc when y is given).
fig = px.histogram(
    honeyDF,
    x="colony_number",
    y="value_of_prod",
    title="Summed value of production by number of colonies",
)

# showing the plot
fig.show()

In [None]:
# Plotting a line chart over the years, one line per state code.
# The y column was previously "sepal_width", which is not a column of honeyDF
# (it belongs to the iris example data), so the cell could not run.
# NOTE(review): value_of_prod is used here as the plotted metric — confirm this
# is the intended one; any other numeric column of honeyDF can be substituted.
fig = px.line(
    honeyDF,
    x="year",
    y="value_of_prod",
    color="state",
    title="Value of production per year by state",
)

# showing the plot
fig.show()

In [104]:

# Group the rows by (year, state), then show up to the first five rows of
# every group (five is the default for GroupBy.head).
grouped_multiple = honeyDF.groupby(by=["year", "state"])
grouped_multiple.head(n=5)

Unnamed: 0,state,colony_number,yield_per_colony,productions,stocks,average_price,value_of_prod,year
0,0,9000,54,73000,73000,240.00,1166000,2010
1,1,24000,77,665000,665000,152.00,2809000,2010
2,2,25000,60,360000,360000,147.00,2205000,2010
3,3,410000,67,6318000,6318000,155.00,42579000,2010
4,4,34000,56,533000,533000,152.00,2894000,2010
...,...,...,...,...,...,...,...,...
474,35,6000,40,79000,79000,8.23,1975000,2021
475,36,96000,32,1206000,1206000,2.52,7741000,2021
476,37,6000,43,136000,136000,4.80,1238000,2021
477,38,42000,47,750000,750000,2.81,5547000,2021
