# Exploratory Data Analysis (EDA) of the Wine Dataset

- Q1 (£12.99): 25% of prices are at or below this value
- Q3 (£29.99): 75% of prices are at or below this value
- IQR (£17.00): Width of the range containing the middle 50% of prices (Q3 - Q1)

Red and White wines are the clear market leaders, each with almost 550 units.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MinMaxScaler
import category_encoders as ce
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from ftfy import fix_text




# Global plot styling: seaborn's "whitegrid" theme with the "muted" palette,
# plus larger default figure and font sizes for every figure drawn afterwards.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update(
    {
        "figure.figsize": (12, 6),
        "axes.titlesize": 16,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
    }
)

# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv('../datasets/WineDataset.csv')

# Run ftfy's fix_text over every string cell of the object-dtype columns to
# repair broken Unicode; non-string cells (e.g. NaN) are passed through as-is.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(lambda x: fix_text(x) if isinstance(x, str) else x)

# Sanity check: row count after the text repair only. No rows have been
# dropped yet -- that happens at the end of this block.
print("\nAfter cleaning:")
print("Number of rows:", len(df))

# Keep only the columns used in this analysis. The explicit .copy() makes the
# subset an independent frame, so the column assignments below cannot raise
# SettingWithCopyWarning or depend on pandas' view-vs-copy behaviour.
df = df[['Type', 'Grape', 'Price']].copy()

# Strip the currency symbol, the 'per bottle' suffix and surrounding
# whitespace so that only the numeric text remains.
# NOTE(review): assumes prices look like "£9.99 per bottle"; any other format
# (e.g. thousands separators) is coerced to NaN below and dropped -- confirm
# against the raw data.
df['Price'] = (
    df['Price']
    .str.replace('£', '', regex=False)
    .str.replace('per bottle', '', regex=False)
    .str.strip()
)

# Convert to float; values that are still non-numeric become NaN.
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Drop rows with a missing or invalid price.
df = df.dropna(subset=['Price'])

# Drop rows with missing values in the categorical columns.
df = df.dropna(subset=['Type', 'Grape'])



# One-hot encode 'Type' as a dense array; drop='first' omits the first
# category's column.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
type_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(df[['Type']]),
    columns=one_hot_encoder.get_feature_names_out(['Type']),
    # fit_transform returns a bare array, so the frame would otherwise get a
    # fresh 0..n-1 index. df's index has gaps after the earlier dropna calls,
    # and pd.concat(axis=1) aligns on index labels: without this, one-hot rows
    # would be attached to the wrong wines and unmatched rows silently lost.
    index=df.index,
)

# Initialise the binary encoder for the 'Grape' column.
binary_encoder = ce.BinaryEncoder(cols=['Grape'])

# Apply the binary encoding.
grape_encoded = binary_encoder.fit_transform(df['Grape'])

# Combine price and all encoded features into a single frame (aligned on index).
df_encoded = pd.concat([df[['Price']], type_encoded, grape_encoded], axis=1)
# Safety net: discard any row that still contains a missing value.
df_encoded = df_encoded.dropna()

# Rescale Price to the [0, 1] range (MinMaxScaler's default feature range).
scaler = MinMaxScaler()
df_encoded['Price'] = scaler.fit_transform(df_encoded[['Price']])

print(df_encoded)

# Histogram of the cleaned prices (30 bins) with a KDE curve overlaid,
# drawn on the current axes.
price_ax = plt.gca()
sns.histplot(df['Price'], kde=True, bins=30, color='blue', ax=price_ax)
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
price_ax.figure.tight_layout()
plt.show()

# Bar chart with the number of rows per wine 'Type', drawn on the current axes.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version.
type_ax = plt.gca()
sns.countplot(data=df, x='Type', palette='cool', ax=type_ax)
type_ax.set_title('Type Distribution')
type_ax.set_xlabel('Type')
type_ax.set_ylabel('Count')
type_ax.figure.tight_layout()
plt.show()

# Pairwise correlations between Price and every encoded feature column,
# rendered as an annotated heatmap (values shown with two decimals).
correlation_matrix = df_encoded.corr()
corr_ax = plt.gca()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
corr_ax.figure.tight_layout()
plt.show()

# Outlier inspection for Price: IQR statistics followed by a boxplot.

# Quartiles and interquartile range of the cleaned prices.
prices = df['Price']
Q1 = prices.quantile(0.25)
Q3 = prices.quantile(0.75)
IQR = Q3 - Q1

# Whisker boundaries: 1.5 * IQR beyond each quartile.
whisker_min = Q1 - 1.5 * IQR
whisker_max = Q3 + 1.5 * IQR

# Report the statistics.
print(f'Q1: {Q1}')
print(f'Q3: {Q3}')
print(f'IQR: {IQR}')
print(f'Lower whisker: {whisker_min}')
print(f'Upper whisker: {whisker_max}')

# Horizontal boxplot of the prices on a dedicated 10x6 figure.
max_price = prices.max()
box_fig = plt.figure(figsize=(10, 6))
box_ax = box_fig.gca()
# Ticks every 50 price units, from 0 up to (at least) the maximum price.
box_ax.set_xticks(np.arange(0, max_price + 50, 50))
sns.boxplot(x=df['Price'], color='red', ax=box_ax)
box_ax.set_title('Outliers in Price')
box_ax.set_xlabel('Price')
box_fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Put the cleaned, unscaled prices back into df_encoded (the assignment aligns
# on the row index), then keep only complete rows.
df_encoded['Price'] = df['Price']
df_encoded_clean = df_encoded.dropna()

# Price is the label used to group points; every other column is a feature.
labels = df_encoded_clean['Price'].values
features = df_encoded_clean.drop(columns=['Price']).values

# Project the encoded features onto their first two principal components.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(features)

unique_labels = np.unique(labels)
step = 10  # number of distinct prices drawn per figure

# One figure per chunk of `step` distinct prices; within a figure each price
# gets its own scatter series and legend entry.
for start in range(0, len(unique_labels), step):
    fig, ax = plt.subplots(figsize=(8, 6))

    for price in unique_labels[start:start + step]:
        same_price = labels == price  # boolean mask of the rows with this price
        ax.scatter(
            df_pca[same_price, 0],
            df_pca[same_price, 1],
            label=f'Price: {price:.4f}',
        )

    ax.set_title('PCA (2D) - Coloreado por Precio')
    ax.set_xlabel('Componente Principal 1')
    ax.set_ylabel('Componente Principal 2')
    # Anchor the legend outside the axes, to the right of the plot.
    ax.legend(title='Rango de Precios', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

In [None]:
# Discard incomplete rows, then rescale Price with a freshly fitted MinMaxScaler.
df_encoded = df_encoded.dropna()

scaler = MinMaxScaler()
scaled_price = scaler.fit_transform(df_encoded[['Price']])
df_encoded['Price'] = scaled_price

print(df_encoded)