In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn


: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# assigning into a copy) raised by later cells — consider narrowing it.
warnings.simplefilter('ignore')

# Plot appearance: apply the matplotlib stylesheet first, then override the
# colour cycle with seaborn's "husl" palette (order matters: both touch the cycle).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

In [None]:
# Read the raw ratings table from the CSV next to the notebook and preview it.
csv_path = "flavors_of_cacao.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Tidy the header row: drop non-breaking spaces, turn embedded newlines into
# single spaces, and trim surrounding whitespace.
cleaned_columns = df.columns.str.replace('\xa0', '', regex=True)
cleaned_columns = cleaned_columns.str.replace('\n', ' ')
df.columns = cleaned_columns.str.strip()

# Map the tidied headers to short snake_case-style names used by later cells.
# Headers not listed here keep their tidied name.
column_renames = {
    "Company (Maker-if known)": "Company",
    "Review Date": "Review_Year",
    "Company Location": "Company_Location",
    "Cocoa Percent": "Cocoa_Percent",
    "Bean Type": "Bean_Type",
    "Broad Bean Origin": "Broad_Bean_Origin",
}
df.rename(columns=column_renames, inplace=True)

df.head()

In [None]:
# Drop every row that contains at least one NaN.
# The explicit .copy() makes df_clean an independent frame: later cells add and
# overwrite columns on it, and without the copy pandas may flag those writes
# with SettingWithCopyWarning (silenced by the global warning filter, so the
# problem would go unnoticed).
df_clean = df.dropna().copy()
print("✅ Missing values removed!")

In [None]:
# Headline counts for the cleaned data.
n_rows = len(df_clean)
n_companies = df_clean['Company'].nunique()
n_reviews_2013 = int((df_clean['Review_Year'] == 2013).sum())

# Counted on the raw frame on purpose: df_clean has already had its NaN rows dropped.
# NOTE(review): isna() only sees real NaN; blank/whitespace-only cells are not
# counted — confirm how the CSV encodes a missing bean type.
n_missing_bean_type = df['Bean_Type'].isna().sum()

print("📊 Total tuples:", n_rows)
print("🏭 Unique companies:", n_companies)
print("📅 Reviews in 2013:", n_reviews_2013)
print("❓ Missing 'Bean_Type' values:", n_missing_bean_type)

In [None]:
# Histogram of ratings (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_clean['Rating'], bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Chocolate Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Convert cocoa percentages such as "70%" into floats (70.0).
# astype(str) first makes the cell safe to re-run: on a second execution the
# column is already float, and calling .str on it directly would raise
# AttributeError. assign() returns a new frame instead of writing into
# df_clean in place, so the conversion cannot trip SettingWithCopyWarning.
cocoa_text = df_clean['Cocoa_Percent'].astype(str).str.replace('%', '', regex=False)
df_clean = df_clean.assign(Cocoa_Percent=cocoa_text.astype(float))
df_clean.head()

In [None]:
# Scatter of cocoa percentage against rating; low alpha so overlapping points
# show up as darker regions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df_clean['Cocoa_Percent'], df_clean['Rating'], alpha=0.3)
ax.set_title("Cocoa % vs Rating")
ax.set_xlabel("Cocoa Percent")
ax.set_ylabel("Rating")
ax.grid(True)
plt.show()

In [None]:
# Min-max scale the ratings into a new column (MinMaxScaler's default output
# range is [0, 1]). The scaler expects 2-D input, hence the one-column frame.
scaler = MinMaxScaler()
rating_frame = df_clean[['Rating']]
df_clean['Normalized_Rating'] = scaler.fit_transform(rating_frame)

# Show original and scaled values side by side.
df_clean[['Rating', 'Normalized_Rating']].head()

In [None]:
# Mean rating per company, best first; display the top ten.
ratings_by_company = df_clean.groupby('Company')['Rating']
avg_ratings = ratings_by_company.mean().sort_values(ascending=False)
avg_ratings.head(10)

In [None]:
# One encoder per column so each fitted mapping stays available afterwards
# (e.g. for inverse_transform).
le_company, le_location = LabelEncoder(), LabelEncoder()

# Replace each distinct label with an integer code, stored in a new column.
company_codes = le_company.fit_transform(df_clean['Company'])
location_codes = le_location.fit_transform(df_clean['Company_Location'])
df_clean['Company_Encoded'] = company_codes
df_clean['Location_Encoded'] = location_codes

print("✅ Categorical encoding complete!")

# Preview each original column next to its encoded counterpart.
preview_columns = ['Company', 'Company_Encoded', 'Company_Location', 'Location_Encoded']
df_clean[preview_columns].head()