In [63]:
import pickle as pkl
from os.path import join

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [64]:
# Path of the raw vegetable-market CSV, relative to this notebook.
DATA_PATH = join("..", "..", "resources", "database", "Vegetable_market.csv")

# Read the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [65]:
# Show the loaded frame as the cell output for a quick visual check.
data

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [66]:
# Features: every column except the last one (the last column is the target).
data_x = data.loc[:, data.columns[:-1]]

data_x

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition
0,potato,winter,jan,15,no,fresh
1,tomato,winter,jan,15,no,fresh
2,peas,winter,jan,15,no,fresh
3,pumkin,winter,jan,15,no,fresh
4,cucumber,winter,jan,15,no,fresh
...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh
117,ginger,winter,jan,15,no,fresh
118,potato,summer,apr,32,no,fresh
119,peas,summer,apr,33,no,fresh


In [67]:
# Target: the last column, reshaped to a single-column 2-D array (n_rows, 1).
data_y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

data_y

array([[ 20],
       [ 50],
       [ 70],
       [ 25],
       [ 20],
       [130],
       [ 10],
       [ 35],
       [ 35],
       [ 45],
       [150],
       [ 45],
       [ 20],
       [ 80],
       [ 30],
       [ 20],
       [ 70],
       [ 20],
       [ 25],
       [100],
       [ 30],
       [ 80],
       [ 50],
       [ 60],
       [ 25],
       [ 70],
       [ 70],
       [ 20],
       [130],
       [170],
       [ 40],
       [ 20],
       [200],
       [ 15],
       [ 10],
       [ 40],
       [200],
       [ 40],
       [250],
       [ 90],
       [ 16],
       [ 30],
       [ 40],
       [ 15],
       [ 12],
       [ 50],
       [ 15],
       [ 25],
       [ 28],
       [ 35],
       [120],
       [ 75],
       [ 18],
       [ 80],
       [ 40],
       [ 20],
       [ 70],
       [ 70],
       [ 25],
       [100],
       [ 30],
       [120],
       [ 50],
       [ 60],
       [ 25],
       [ 80],
       [ 15],
       [ 45],
       [190],
       [ 50],
       [210],
      

In [68]:
def encode_data(data: pd.Series) -> np.ndarray:
    """Label-encode one categorical column as integer codes.

    A fresh ``LabelEncoder`` is fitted on ``data`` on every call and is not
    returned, so the value-to-code mapping cannot be reused or inverted by
    the caller afterwards.

    Parameters
    ----------
    data : pd.Series
        Values of a single categorical column.

    Returns
    -------
    np.ndarray
        1-D array holding one integer code per element of ``data``.
    """
    encoder = LabelEncoder()

    return encoder.fit_transform(data)

In [69]:
def scale_data(data: "np.ndarray | pd.Series | pd.DataFrame") -> np.ndarray:
    """Standardize ``data`` column-wise with a freshly fitted ``StandardScaler``.

    The scaler is created and fitted inside this function and is not
    returned, so the transformation cannot be inverted by the caller.

    Parameters
    ----------
    data : np.ndarray | pd.Series | pd.DataFrame
        Values to scale. 2-D input is scaled per column. 1-D input (for
        example a ``pd.Series``) is treated as a single column.

    Returns
    -------
    np.ndarray
        Scaled values. 2-D input yields a 2-D array of the same shape;
        1-D input yields a 1-D array of the same length.
    """
    # StandardScaler rejects 1-D input, so view a 1-D input as one column.
    is_one_dimensional = getattr(data, "ndim", 2) == 1
    features = np.asarray(data).reshape(-1, 1) if is_one_dimensional else data

    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(features)

    # Hand 1-D callers back the dimensionality they passed in.
    return data_scaled.ravel() if is_one_dimensional else data_scaled

In [70]:
def one_hot_encoding(data: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode every column of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are all treated as categorical.

    Returns
    -------
    pd.DataFrame
        Dense indicator columns named by ``OneHotEncoder.get_feature_names_out``.
        The row index of ``data`` is preserved so the result stays aligned
        with the input when the two are joined or concatenated.
    """
    # sparse_output=False makes fit_transform return a dense array.
    encoder = OneHotEncoder(sparse_output=False)
    indicator_values = encoder.fit_transform(data)

    return pd.DataFrame(
        data=indicator_values,
        columns=encoder.get_feature_names_out(),
        # Keep the caller's row labels; fall back to a default index when the
        # input carries none (e.g. a plain array).
        index=getattr(data, "index", None),
    )

In [71]:
# Names of the categorical feature columns.
x_categorical = [
    "Vegetable",
    "Season",
    "Month",
    "Deasaster Happen in last 3month",
    "Vegetable condition",
]

# Names of the numerical feature columns.
x_numerical = ["Temp"]

In [72]:
# Label-encode each categorical column independently (one encoder per column).
categorical_features = data_x[x_categorical]
x_encoded = categorical_features.apply(encode_data)

x_encoded

Unnamed: 0,Vegetable,Season,Month,Deasaster Happen in last 3month,Vegetable condition
0,13,4,4,0,1
1,16,4,4,0,1
2,11,4,4,0,1
3,14,4,4,0,1
4,6,4,4,0,1
...,...,...,...,...,...
116,2,4,4,1,1
117,8,4,4,0,1
118,13,3,1,0,1
119,11,3,1,0,1


In [73]:
# Work on a copy: plain assignment would alias x_encoded, so the in-place
# insert below would also modify x_encoded and make this cell fail on a
# second run ("Temp" would already exist).
data_x_transformed = x_encoded.copy()

# Put the numeric "Temp" column back at position 3 (right after "Month").
data_x_transformed.insert(3, "Temp", data_x["Temp"])

data_x_transformed

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition
0,13,4,4,15,0,1
1,16,4,4,15,0,1
2,11,4,4,15,0,1
3,14,4,4,15,0,1
4,6,4,4,15,0,1
...,...,...,...,...,...,...
116,2,4,4,15,1,1
117,8,4,4,15,0,1
118,13,3,1,32,0,1
119,11,3,1,33,0,1


In [74]:
# Standardize every feature column on its own: each column is reshaped to
# (n_rows, 1) for the scaler and squeezed back to 1-D afterwards.
data_x_scaled = data_x_transformed.apply(
    lambda column: scale_data(column.to_numpy().reshape(-1, 1)).squeeze()
)

In [75]:
# Show the standardized feature frame as the cell output.
data_x_scaled

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition
0,0.845810,0.812753,0.151192,-1.065944,-0.599625,-0.067293
1,1.450197,0.812753,0.151192,-1.065944,-0.599625,-0.067293
2,0.442885,0.812753,0.151192,-1.065944,-0.599625,-0.067293
3,1.047272,0.812753,0.151192,-1.065944,-0.599625,-0.067293
4,-0.564428,0.812753,0.151192,-1.065944,-0.599625,-0.067293
...,...,...,...,...,...,...
116,-1.370278,0.812753,0.151192,-1.065944,1.667708,-0.067293
117,-0.161503,0.812753,0.151192,-1.065944,-0.599625,-0.067293
118,0.845810,-0.049906,-1.293088,0.765841,-0.599625,-0.067293
119,0.442885,-0.049906,-1.293088,0.873593,-0.599625,-0.067293


In [76]:
# Standardize the target column.
# NOTE(review): scale_data does not return its fitted scaler, so values in
# this scaled space cannot be mapped back to prices later — confirm intended.
data_y_scaled = scale_data(data=data_y)

In [77]:
# Show the standardized target array as the cell output.
data_y_scaled

array([[-0.7274458 ],
       [-0.10975498],
       [ 0.3020389 ],
       [-0.62449733],
       [-0.7274458 ],
       [ 1.53742054],
       [-0.93334274],
       [-0.41860039],
       [-0.41860039],
       [-0.21270345],
       [ 1.94921442],
       [-0.21270345],
       [-0.7274458 ],
       [ 0.50793584],
       [-0.52154886],
       [-0.7274458 ],
       [ 0.3020389 ],
       [-0.7274458 ],
       [-0.62449733],
       [ 0.91972972],
       [-0.52154886],
       [ 0.50793584],
       [-0.10975498],
       [ 0.09614196],
       [-0.62449733],
       [ 0.3020389 ],
       [ 0.3020389 ],
       [-0.7274458 ],
       [ 1.53742054],
       [ 2.3610083 ],
       [-0.31565192],
       [-0.7274458 ],
       [ 2.97869913],
       [-0.83039427],
       [-0.93334274],
       [-0.31565192],
       [ 2.97869913],
       [-0.31565192],
       [ 4.00818383],
       [ 0.71383278],
       [-0.80980458],
       [-0.52154886],
       [-0.31565192],
       [-0.83039427],
       [-0.89216335],
       [-0

In [78]:
# 75/25 train/test split of scaled features against the UNSCALED target;
# the fixed seed keeps the split reproducible.
# NOTE(review): data_x_scaled was standardized over all rows before this
# split, so the test rows contributed to the scaling statistics.
x_train, x_test, y_train, y_test = train_test_split(
    data_x_scaled,
    data_y,
    random_state=0,
    test_size=0.25,
)

In [79]:
# Destination of the pickled split that uses the unscaled target.
KNN_BASED_DATA_PATH = join("..", "..", "resources", "database", "knn_classifier_data.pkl")

# Persist the four split parts as one list, in the order
# [x_train, x_test, y_train, y_test].
with open(KNN_BASED_DATA_PATH, mode="wb") as file:
    pkl.dump(
        [x_train, x_test, y_train, y_test],
        file,
    )

In [80]:
# Same 75/25 seeded split, this time against the SCALED target. This
# rebinds x_train / x_test / y_train / y_test from the previous split.
# NOTE(review): both feature and target scaling were fitted on all rows
# before this split, so the test rows contributed to the scaling statistics.
x_train, x_test, y_train, y_test = train_test_split(
    data_x_scaled,
    data_y_scaled,
    random_state=0,
    test_size=0.25,
)

In [81]:
# Destination of the pickled split that uses the scaled target
# (rebinds KNN_BASED_DATA_PATH to a new file).
KNN_BASED_DATA_PATH = join("..", "..", "resources", "database", "knn_regressor_data.pkl")

# Persist the four split parts as one list, in the order
# [x_train, x_test, y_train, y_test].
with open(KNN_BASED_DATA_PATH, mode="wb") as file:
    pkl.dump(
        [x_train, x_test, y_train, y_test],
        file,
    )