In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [4]:
df = pd.read_csv("chocolate_bars.csv")

In [5]:
df.head(10)

Unnamed: 0,id,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.0,"B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.0,"B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.0,"B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.0,"B,S,C","chewy, off, rubbery",3.0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.0,"B,S,C","fatty, earthy, moss, nutty,chalky",3.0
5,2546,5150,U.S.A.,2021,Uganda,"Semuliki Forest, batch 1",80.0,3.0,"B,S,C","mildly bitter, basic cocoa, fatty",3.25
6,2542,5150,U.S.A.,2021,India,"Anamalai, batch 1",68.0,3.0,"B,S,C","milk brownie, macadamia,chewy",3.5
7,797,A. Morin,France,2012,Bolivia,Bolivia,70.0,4.0,"B,S,C,L","vegetal, nutty",3.5
8,797,A. Morin,France,2012,Peru,Peru,63.0,4.0,"B,S,C,L","fruity, melon, roasty",3.75
9,1011,A. Morin,France,2013,Panama,Panama,70.0,4.0,"B,S,C,L","brief fruit note, earthy, nutty",2.75


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2530 entries, 0 to 2529
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2530 non-null   int64  
 1   manufacturer      2530 non-null   object 
 2   company_location  2530 non-null   object 
 3   year_reviewed     2530 non-null   int64  
 4   bean_origin       2530 non-null   object 
 5   bar_name          2530 non-null   object 
 6   cocoa_percent     2530 non-null   float64
 7   num_ingredients   2443 non-null   float64
 8   ingredients       2443 non-null   object 
 9   review            2530 non-null   object 
 10  rating            2530 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 217.6+ KB


In [7]:
df.describe()

Unnamed: 0,id,year_reviewed,cocoa_percent,num_ingredients,rating
count,2530.0,2530.0,2530.0,2443.0,2530.0
mean,1429.800791,2014.374308,71.639723,3.041343,3.196344
std,757.648556,3.968267,5.616724,0.913728,0.445321
min,5.0,2006.0,42.0,1.0,1.0
25%,802.0,2012.0,70.0,2.0,3.0
50%,1454.0,2015.0,70.0,3.0,3.25
75%,2079.0,2018.0,74.0,4.0,3.5
max,2712.0,2021.0,100.0,6.0,4.0


In [8]:
df.isna().sum()

id                   0
manufacturer         0
company_location     0
year_reviewed        0
bean_origin          0
bar_name             0
cocoa_percent        0
num_ingredients     87
ingredients         87
review               0
rating               0
dtype: int64

In [10]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f'Унікальні значення для стовбця "{column}": {unique_values}')

Унікальні значення для стовбця "id": [2454 2458 2542 2546  797 1011 1015 1019 1315 1319 1676 1680 1704 1876
 2206 2648 1462 1470 2462 2470  705 2438 2442  370  316  502  508  636
 1061 1173 1215 1992 1944 1125 1133 1129 1732 1728 2044  147  129  175
  304  363  544  470  725  327  464  322 1964 1145 1494 1498   75  123
  170  979 2088 2092 2434  572 1065 1259 1852 2586 1379 1375 1602 1534
 1598 1904 1928 1724 1900 1908 1924 2020 2028 2024 2068 2254 2450 2330
 2166 2162  300  355  486  600  531  745  729  947 1193 1181 2562 2566
  141  331  647  661 1780 2056 2672  999  995 1474 2146 1454 2290 2294
  983 1295 1554 1980  955 1880 1840 1868 2374 1948 1784 1788  586 1804
 1800 1864 2190 1768 2246  773  757 1141 1482 1486 2084  233  256  414
  423  431  558  565  478  963 2108 2114 2422 2574 1331 2590 1046  911
 1752 1756 1740 1996   81   24   32   48  199  336  395  761  629  672
 1042 1038 1418 1339 1912 2250 2554  341 1267 1271 2194 2096 1255 1355
 1984 1518 1514 1149 1235 1231 1638 2052