# Analysis

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Dataset reference: https://www.kaggle.com/datasets/sudipsamanta35/vegetable-market
data = pd.read_csv(filepath_or_buffer="../resources/Vegetable_market.csv")

# Show the loaded frame as the cell output.
data

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


In [4]:
# Number of missing values in each column.
data.isna().sum()

Vegetable                          0
Season                             0
Month                              0
Temp                               0
Deasaster Happen in last 3month    0
Vegetable condition                0
Price per kg                       0
dtype: int64

In [5]:
# How often each (vegetable, condition, season) combination occurs.
data.loc[:, ["Vegetable", "Vegetable condition", "Season"]].value_counts()

Vegetable        Vegetable condition  Season 
ginger           fresh                winter     7
tomato           fresh                winter     5
pumkin           fresh                winter     5
chilly           fresh                winter     4
peas             fresh                summer     4
okra             fresh                monsoon    4
garlic           fresh                winter     4
radish           fresh                summer     4
potato           fresh                summer     4
brinjal          fresh                winter     4
Raddish          fresh                winter     4
pointed grourd   fresh                monsoon    3
                 avarage              summer     3
peas             scarp                winter     3
Bitter gourd     scarp                winter     3
peas             fresh                monsoon    3
potato           scrap                winter     3
onion            avarage              winter     3
cabage           fresh              

In [6]:
# Row count per vegetable, flattened into a frame so the plotting cell can
# address the "Vegetable" and "count" columns by name.
count_vegetable = (
    data["Vegetable"]
    .value_counts()
    .reset_index()
)

In [7]:
# Bar chart of how many rows each vegetable has. Every bar is labelled with
# that vegetable's share of all rows, as a percentage rounded to 2 decimals.
fig = go.Figure(
    data=go.Bar(
        x=count_vegetable["Vegetable"],
        y=count_vegetable["count"],
        text=(count_vegetable["count"] / count_vegetable["count"].sum() * 100)
        .round(2)
        .apply(lambda pct: f"{pct}%"),
        textposition="outside",
    )
)

fig.update_layout(
    height=700,
    barcornerradius=10,
    title="Vegetable Distribution",
    xaxis_title="Vegetable",
    yaxis_title="Count",
)

fig

In [8]:
# Row count for every (season, disaster flag) pair, as a flat frame with a
# "count" column.
count_season = (
    data[["Season", "Deasaster Happen in last 3month"]]
    .value_counts()
    .reset_index()
)
count_season

Unnamed: 0,Season,Deasaster Happen in last 3month,count
0,winter,no,50
1,summer,no,27
2,monsoon,yes,11
3,monsoon,no,10
4,summer,yes,10
5,winter,yes,8
6,spring,yes,3
7,autumn,no,2


In [9]:
# Bar chart of row counts per season, colored by the disaster flag. Each bar
# segment is labelled with its share of all rows (percentage, 2 decimals).
fig = px.bar(
    count_season,
    x="Season",
    y="count",
    color="Deasaster Happen in last 3month",
    text=(count_season["count"] / count_season["count"].sum() * 100)
    .round(2)
    .apply(lambda pct: f"{pct}%"),
)

fig.update_layout(
    height=700,
    barcornerradius=10,
    title="Deasaster Distribution",
    xaxis_title="Season",
    yaxis_title="Count",
)

fig

In [10]:
# Keep only the per-season counts where a disaster was reported ("yes").
count_season_yes = count_season[
    count_season["Deasaster Happen in last 3month"].eq("yes")
]
count_season_yes

Unnamed: 0,Season,Deasaster Happen in last 3month,count
2,monsoon,yes,11
4,summer,yes,10
5,winter,yes,8
6,spring,yes,3


In [11]:
# Average number of rows per season among the disaster == "yes" groups.
count_season_yes["count"].mean()

8.0

In [12]:
# Keep only the per-season counts where no disaster was reported ("no").
count_season_no = count_season[
    count_season["Deasaster Happen in last 3month"].eq("no")
]
count_season_no

Unnamed: 0,Season,Deasaster Happen in last 3month,count
0,winter,no,50
1,summer,no,27
3,monsoon,no,10
7,autumn,no,2


In [13]:
# Average number of rows per season among the disaster == "no" groups.
count_season_no["count"].mean()

22.25

In [14]:
# How many rows share each (temperature, season) pair.
data.loc[:, ["Temp", "Season"]].value_counts().reset_index()

Unnamed: 0,Temp,Season,count
0,15,winter,51
1,32,summer,11
2,37,summer,7
3,30,monsoon,6
4,35,summer,6
5,31,monsoon,6
6,29,monsoon,4
7,27,monsoon,4
8,33,summer,4
9,30,spring,3


In [15]:
# Row count for every distinct (vegetable, temperature, price) triple.
data_count_veg_temp_price = (
    data[["Vegetable", "Temp", "Price per kg"]]
    .value_counts()
    .reset_index()
)
data_count_veg_temp_price

Unnamed: 0,Vegetable,Temp,Price per kg,count
0,califlower,37,60,2
1,tomato,15,30,2
2,pointed grourd,30,25,2
3,ginger,15,90,2
4,peas,33,100,2
...,...,...,...,...
104,cucumber,31,21,1
105,cucumber,31,15,1
106,cucumber,31,12,1
107,cucumber,15,20,1


In [16]:
# Bar chart of price per kg for each vegetable, colored by temperature.
# The text on each bar segment is how many rows share that exact
# (vegetable, temperature, price) combination.
fig = px.bar(
    data_count_veg_temp_price,
    x="Vegetable",
    y="Price per kg",
    color="Temp",
    text=data_count_veg_temp_price["count"].round().apply(lambda n: f"{n}"),
)

fig.update_layout(
    height=700,
    barcornerradius=10,
    title="Vegetable x Temperature x Price",
    xaxis_title="Vegetable",
    yaxis_title="Price",
)

fig

In [17]:
# Column subset used by the month/vegetable scatter plot. Despite the name,
# this holds the raw rows rather than counts.
count_month_vegetable = data.loc[:, ["Vegetable", "Month", "Price per kg"]]
count_month_vegetable

Unnamed: 0,Vegetable,Month,Price per kg
0,potato,jan,20
1,tomato,jan,50
2,peas,jan,70
3,pumkin,jan,25
4,cucumber,jan,20
...,...,...,...
116,brinjal,jan,33
117,ginger,jan,88
118,potato,apr,24
119,peas,apr,33


In [18]:
# Scatter of month (x axis) against vegetable (y axis); the marker color
# encodes the price per kg.
fig = px.scatter(
    data_frame=count_month_vegetable,
    x="Month",
    y="Vegetable",
    color="Price per kg"
)

# The title and axis labels name what is actually plotted: x is "Month" and
# y is "Vegetable". No bar-specific layout option is set because this figure
# contains only scatter markers.
fig.update_layout(
    title="Vegetable x Month x Price",
    xaxis_title="Month",
    yaxis_title="Vegetable",
    height=700
)

fig

# Pre-processing

In [19]:
# Features: the five categorical columns (the numeric "Temp" column is not
# part of the feature matrix). Target: the price per kg.
data_X = data[
    [
        "Vegetable",
        "Season",
        "Month",
        "Deasaster Happen in last 3month",
        "Vegetable condition",
    ]
].values
data_Y = data["Price per kg"].values

In [20]:
# Display the target values (price per kg).
data_Y

array([ 20,  50,  70,  25,  20, 130,  10,  35,  35,  45, 150,  45,  20,
        80,  30,  20,  70,  20,  25, 100,  30,  80,  50,  60,  25,  70,
        70,  20, 130, 170,  40,  20, 200,  15,  10,  40, 200,  40, 250,
        90,  16,  30,  40,  15,  12,  50,  15,  25,  28,  35, 120,  75,
        18,  80,  40,  20,  70,  70,  25, 100,  30, 120,  50,  60,  25,
        80,  15,  45, 190,  50, 210, 130,  10,  25,  42,  20,  15,  55,
        20,  29,  32,  20, 132,  60,  21,  75,  35,  19,  32,  90,  22,
        35,  21,  90,  33,  55,  30,  45, 170, 150,  24,  30,  35,  23,
        21,  53,  27,  32,  24,  22, 123,  55,  21,  90,  25,  21,  33,
        88,  24,  33,   9])

In [21]:
# Display the feature matrix while it still holds the raw category strings.
data_X

array([['potato', 'winter', 'jan', 'no', 'fresh'],
       ['tomato ', 'winter', 'jan', 'no', 'fresh'],
       ['peas', 'winter', 'jan', 'no', 'fresh'],
       ['pumkin', 'winter', 'jan', 'no', 'fresh'],
       ['cucumber', 'winter', 'jan', 'no', 'fresh'],
       ['pointed grourd ', 'winter', 'jan', 'yes', 'fresh'],
       ['Raddish', 'winter', 'jan', 'no', 'fresh'],
       ['Bitter gourd', 'winter', 'jan', 'no', 'fresh'],
       ['onion', 'winter', 'jan', 'no', 'fresh'],
       ['tomato ', 'winter', 'jan', 'no', 'fresh'],
       ['garlic', 'winter', 'jan', 'no', 'fresh'],
       ['cabage', 'winter', 'jan', 'yes', 'fresh'],
       ['califlower', 'winter', 'jan', 'no', 'fresh'],
       ['chilly', 'winter', 'jan', 'no', 'fresh'],
       ['okra', 'winter', 'jan', 'no', 'scrap'],
       ['pumkin', 'winter', 'jan', 'no', 'fresh'],
       ['brinjal', 'winter', 'jan', 'no', 'fresh'],
       ['ginger', 'winter', 'jan', 'no', 'fresh'],
       ['potato', 'summer', 'apr', 'no', 'fresh'],
       ['

In [22]:
# Display the target values again; data_Y has not been modified since it was
# last shown.
data_Y

array([ 20,  50,  70,  25,  20, 130,  10,  35,  35,  45, 150,  45,  20,
        80,  30,  20,  70,  20,  25, 100,  30,  80,  50,  60,  25,  70,
        70,  20, 130, 170,  40,  20, 200,  15,  10,  40, 200,  40, 250,
        90,  16,  30,  40,  15,  12,  50,  15,  25,  28,  35, 120,  75,
        18,  80,  40,  20,  70,  70,  25, 100,  30, 120,  50,  60,  25,
        80,  15,  45, 190,  50, 210, 130,  10,  25,  42,  20,  15,  55,
        20,  29,  32,  20, 132,  60,  21,  75,  35,  19,  32,  90,  22,
        35,  21,  90,  33,  55,  30,  45, 170, 150,  24,  30,  35,  23,
        21,  53,  27,  32,  24,  22, 123,  55,  21,  90,  25,  21,  33,
        88,  24,  33,   9])

In [23]:
# One independent LabelEncoder per categorical feature column, so each keeps
# its own string-to-integer mapping.
(
    vegetables_encoder,
    season_encoder,
    month_encoder,
    desaster_encoder,
    condition_encoder,
) = (LabelEncoder() for _ in range(5))

In [24]:
# Replace every categorical column of data_X with integer codes, using the
# encoder that belongs to that column position.
# Gotcha: data_X is overwritten in place, so running this cell a second time
# refits the encoders on the integer codes instead of the original strings.
# NOTE(review): the strings are encoded exactly as they appear in the file.
# The outputs above show variants such as 'tomato ' (trailing space) and
# 'scarp' / 'scrap', which end up as separate labels -- consider normalizing
# them before encoding.
for column_index, column_encoder in enumerate(
    (
        vegetables_encoder,
        season_encoder,
        month_encoder,
        desaster_encoder,
        condition_encoder,
    )
):
    data_X[:, column_index] = column_encoder.fit_transform(data_X[:, column_index])

In [25]:
# Display the feature matrix after the in-place label encoding.
data_X

array([[13, 4, 4, 0, 1],
       [16, 4, 4, 0, 1],
       [11, 4, 4, 0, 1],
       [14, 4, 4, 0, 1],
       [6, 4, 4, 0, 1],
       [12, 4, 4, 1, 1],
       [1, 4, 4, 0, 1],
       [0, 4, 4, 0, 1],
       [10, 4, 4, 0, 1],
       [16, 4, 4, 0, 1],
       [7, 4, 4, 0, 1],
       [3, 4, 4, 1, 1],
       [4, 4, 4, 0, 1],
       [5, 4, 4, 0, 1],
       [9, 4, 4, 0, 3],
       [14, 4, 4, 0, 1],
       [2, 4, 4, 0, 1],
       [8, 4, 4, 0, 1],
       [13, 3, 1, 0, 1],
       [11, 3, 1, 0, 1],
       [15, 3, 1, 0, 1],
       [16, 3, 1, 0, 0],
       [3, 3, 1, 0, 1],
       [4, 3, 1, 0, 1],
       [12, 1, 5, 0, 1],
       [1, 3, 1, 1, 0],
       [0, 0, 10, 0, 1],
       [10, 3, 1, 0, 1],
       [16, 1, 5, 1, 1],
       [7, 0, 9, 0, 1],
       [3, 1, 5, 1, 3],
       [4, 4, 3, 0, 1],
       [5, 3, 8, 1, 1],
       [9, 1, 2, 0, 1],
       [14, 3, 8, 0, 1],
       [2, 3, 8, 0, 1],
       [8, 3, 6, 1, 3],
       [13, 1, 2, 1, 1],
       [11, 1, 5, 1, 1],
       [8, 4, 4, 1, 1],
       [13, 4, 3, 0, 

In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# One-hot encode columns 0 to 4 of data_X with OneHotEncoder().
# These are the same columns that went through the LabelEncoder step.
# remainder="passthrough" tells the transformer to keep any column that is
# not listed instead of dropping it.
# NOTE(review): despite its name, data_X_scalled holds the one-hot matrix
# (no scaling is applied here), and no later cell in this notebook reads it.
onehotencoder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), list(range(5)))],
    remainder="passthrough",
)

data_X_scalled = onehotencoder.fit_transform(data_X).toarray()

data_X_scalled

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [27]:
# Standardize the feature matrix column by column.
# NOTE(review): the input is the label-encoded data_X (5 columns), not the
# one-hot matrix built in the previous cell -- confirm this is intended,
# since the integer codes of nominal categories carry no numeric meaning.
scaler_X = StandardScaler()
data_X_scaled = scaler_X.fit(data_X).transform(data_X)

data_X_scaled

array([[ 0.84580954,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 1.45019707,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 0.44288453,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 1.04727205,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [-0.56442802,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 0.64434703,  0.81275288,  0.15119177,  1.66770801, -0.06729266],
       [-1.57174057,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [-1.77320308,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 0.24142202,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [ 1.45019707,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [-0.36296551,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [-1.16881555,  0.81275288,  0.15119177,  1.66770801, -0.06729266],
       [-0.96735304,  0.81275288,  0.15119177, -0.59962535, -0.06729266],
       [-0.76589053,  0.81275288,  0.1

In [28]:
# Standardize the target. The price vector is first reshaped into a
# single-column 2-D array, which is the layout passed to the scaler.
data_Y = data_Y.reshape((-1, 1))

scaler_y = StandardScaler()
data_Y_scaled = scaler_y.fit(data_Y).transform(data_Y)

data_Y_scaled

array([[-0.7274458 ],
       [-0.10975498],
       [ 0.3020389 ],
       [-0.62449733],
       [-0.7274458 ],
       [ 1.53742054],
       [-0.93334274],
       [-0.41860039],
       [-0.41860039],
       [-0.21270345],
       [ 1.94921442],
       [-0.21270345],
       [-0.7274458 ],
       [ 0.50793584],
       [-0.52154886],
       [-0.7274458 ],
       [ 0.3020389 ],
       [-0.7274458 ],
       [-0.62449733],
       [ 0.91972972],
       [-0.52154886],
       [ 0.50793584],
       [-0.10975498],
       [ 0.09614196],
       [-0.62449733],
       [ 0.3020389 ],
       [ 0.3020389 ],
       [-0.7274458 ],
       [ 1.53742054],
       [ 2.3610083 ],
       [-0.31565192],
       [-0.7274458 ],
       [ 2.97869913],
       [-0.83039427],
       [-0.93334274],
       [-0.31565192],
       [ 2.97869913],
       [-0.31565192],
       [ 4.00818383],
       [ 0.71383278],
       [-0.80980458],
       [-0.52154886],
       [-0.31565192],
       [-0.83039427],
       [-0.89216335],
       [-0

In [29]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# The split used to pass train_size=0.25, which left only a quarter of the
# rows for training and evaluated on the other three quarters.
# random_state=0 keeps the shuffle reproducible between runs.
# NOTE(review): both scalers were fitted on the full dataset before this
# split, so the test rows contribute to the scaling statistics.
data_X_train, data_X_test, data_Y_train, data_Y_test = train_test_split(
    data_X_scaled, data_Y_scaled,
    test_size=0.25, random_state=0
)

In [30]:
import pickle

# Persist the split plus the pre-scaling arrays for later use. The list order
# is the contract for whoever unpickles the file:
# X train, X test, Y train, Y test, label-encoded X, Y as a single column.
learning_payload = [data_X_train, data_X_test, data_Y_train, data_Y_test, data_X, data_Y]

with open("../resources/data_to_learn.pkl", "wb") as handle:
    pickle.dump(learning_payload, handle)