In [38]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import re
import os
from path_info import path as p
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv(p.path_to_nutrition_final_dataset)
print(df.head())
print(df.info())
print(df.isnull().sum())

   Unnamed: 0           category         product             name  \
0           0             Grains      Cornstarch       Cornstarch   
1           1  Legumes and seeds     Nuts pecans     Nuts, pecans   
2           2         Vegetables    Eggplant raw    Eggplant, raw   
3           3             Grains   Teff uncooked   Teff, uncooked   
4           4             Fruits  Sherbet orange  Sherbet, orange   

  natural_form serving_size  calories total_fat saturated_fat cholesterol  \
0       edible        100 g       381      0.1g           NaN           0   
1       edible        100 g       691       72g          6.2g           0   
2          raw        100 g        25      0.2g           NaN           0   
3          raw        100 g       367      2.4g          0.4g           0   
4       edible        100 g       144        2g          1.2g         1mg   

   ...      fat saturated_fatty_acids monounsaturated_fatty_acids  \
0  ...   0.05 g               0.009 g                

In [2]:
nutrients = df[['category', 'product', 'serving_size', 'calories', 'protein', 'total_fat', 'carbohydrate', 'fiber', 'sugars']]
nutrients.head(-5)

Unnamed: 0,category,product,serving_size,calories,protein,total_fat,carbohydrate,fiber,sugars
0,Grains,Cornstarch,100 g,381,0.26 g,0.1g,91.27 g,0.9 g,0.00 g
1,Legumes and seeds,Nuts pecans,100 g,691,9.17 g,72g,13.86 g,9.6 g,3.97 g
2,Vegetables,Eggplant raw,100 g,25,0.98 g,0.2g,5.88 g,3.0 g,3.53 g
3,Grains,Teff uncooked,100 g,367,13.30 g,2.4g,73.13 g,8.0 g,1.84 g
4,Fruits,Sherbet orange,100 g,144,1.10 g,2g,30.40 g,1.3 g,24.32 g
...,...,...,...,...,...,...,...,...,...
8779,Red Meat,"Beef braised cooked all grades trimmed to 1/8""...",100 g,289,28.82 g,18g,0.00 g,0.0 g,0.00 g
8780,Red Meat,"Beef raw select trimmed to 1/8"" fat separable ...",100 g,148,22.55 g,6.4g,0.00 g,0.0 g,0.00 g
8781,Red Meat,"Beef raw choice trimmed to 1/8"" fat separable ...",100 g,161,21.62 g,8.3g,0.00 g,0.0 g,0.00 g
8782,Oil,Oil uses similar to 95 degree hard butter conf...,100 g,884,0.00 g,100g,0.00 g,0.0 g,0.00 g


In [3]:
columns_to_convert = ['serving_size' ,'protein', 'total_fat', 'carbohydrate', 'fiber', 'sugars']

for column in columns_to_convert:
    nutrients[column] = pd.to_numeric(nutrients[column].apply(lambda x: float(re.sub(r'\s*g$', '', x)) if isinstance(x, str) and re.search(r'\s*g$', x) else x))
    
nutrients.head()  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nutrients[column] = pd.to_numeric(nutrients[column].apply(lambda x: float(re.sub(r'\s*g$', '', x)) if isinstance(x, str) and re.search(r'\s*g$', x) else x))


Unnamed: 0,category,product,serving_size,calories,protein,total_fat,carbohydrate,fiber,sugars
0,Grains,Cornstarch,100.0,381,0.26,0.1,91.27,0.9,0.0
1,Legumes and seeds,Nuts pecans,100.0,691,9.17,72.0,13.86,9.6,3.97
2,Vegetables,Eggplant raw,100.0,25,0.98,0.2,5.88,3.0,3.53
3,Grains,Teff uncooked,100.0,367,13.3,2.4,73.13,8.0,1.84
4,Fruits,Sherbet orange,100.0,144,1.1,2.0,30.4,1.3,24.32


In [4]:
# nutrients = nutrients.drop(['product'], axis=1)

In [7]:
nutrients = nutrients.dropna()

In [21]:
nutrients.to_csv('nutrients_data')

In [151]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
X = nutrients[['calories', 'protein', 'total_fat', 'carbohydrate', 'fiber', 'sugars']]
y = nutrients[['category']]

In [141]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

In [147]:
clf = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
clf.fit(X_train, y_train)

In [149]:
clf = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 33   0   0   0   0  14   0   0   0   0   1  13   0   6  53]
 [ 15   0   0   0   0  13   0   0   1   0   3  70   0   2  13]
 [  2   0   0   0   0  11   0   0   0   0  10 100   0  15  13]
 [  0   0   0   0   0  19   0   0   0   0   6  44   2   5   2]
 [  0   0   0   0   0  27   0   0   0   0   0  14   0   6 128]
 [  0   0   0   0   0 147   0   0   0   0   0  13   0  29  19]
 [  0   0   0   0   0  10   0   0   1   0   2  61   3   4  67]
 [  1   0   0   0   0  41   0   0   4   0   0  18   0  31   7]
 [  1   0   0   0   0   0   0   0  17   0   0   1   0   0   0]
 [  1   0   0   0   0   2   0   0   3   0 107  46  17   0   1]
 [  2   0   0   0   0   1   0   0   3   0 537  43  15   0   2]
 [  5   0   0   0   0  16   0   0   0   0   0 109   0   0  39]
 [  0   0   0   0   0   1   0   0   1   0  41  23  34   0   7]
 [  0   0   0   0   0  56   0   0   0   0   1  35   0 199   2]
 [  2   0   0   0   0   5   0   0   0   0   0  25   0   0 138]]
Accuracy: 0.4603716344330679
                         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
nutrients_data = pd.get_dummies(nutrients, columns=['category', 'product', 'serving_size'], dtype=int)

In [25]:
nutrients_data

Unnamed: 0,calories,protein,total_fat,carbohydrate,fiber,sugars,category_Beverages,category_Condiments,category_Dairy,category_Eggs,...,product_Yogurt parfait with fruit and granola lowfat,product_Yogurt sweetened with low calorie sweetener lowfat milk vanilla flavor,product_Yogurt sweetened with low-calorie sweetener nonfat milk vanilla or lemon flavor,product_Yogurt whole milk fruit Greek,product_Yogurt whole milk plain Greek,product_Yogurt with low calorie sweetener lowfat fruit,product_Yogurt with low-calorie sweetener nonfat milk flavors not chocolate frozen,product_Yokan prepared from adzuki beans and sugar,product_Zwieback,serving_size_100.0
0,381,0.26,0.1,91.27,0.9,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,691,9.17,72.0,13.86,9.6,3.97,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,25,0.98,0.2,5.88,3.0,3.53,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,367,13.30,2.4,73.13,8.0,1.84,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,144,1.10,2.0,30.40,1.3,24.32,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,125,23.45,3.5,0.00,0.0,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8785,206,29.59,8.9,0.00,0.0,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8786,277,16.74,23.0,0.00,0.0,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8787,121,23.37,3.0,0.00,0.0,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
