In [None]:
import ast
import os

import pandas as pd

# Folder that holds the CSV files (relative to the notebook's working directory)
folder_path = 'Wine_Stats'

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the row order of the combined frame (and therefore which copy of a
# duplicate row is kept later) reproducible between runs.
csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
if not csv_names:
    # pd.concat() on an empty list fails with "No objects to concatenate";
    # raise the same exception type with a message that names the folder.
    raise ValueError(f"No .csv files found in {folder_path!r}")

# One DataFrame per CSV file
dataframes = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

# Stack all files into a single frame with a fresh 0..n-1 index
wine_df = pd.concat(dataframes, ignore_index=True)

# Preview the first few rows
print(wine_df.head())


: 

In [None]:
# Inspect column dtypes / non-null counts and the summary statistics.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
wine_df.info()
print(wine_df.describe())

# Drop exact duplicate rows (the first occurrence of each is kept)
wine_df = wine_df.drop_duplicates()

# Per-column count of missing values before they are removed
print(wine_df.isnull().sum())

# Drop every row that has a missing value in any column
wine_df = wine_df.dropna()

# Confirm the structure of the cleaned frame
wine_df.info()


In [None]:
import numpy as np

# Remove 'Price' outliers with the 1.5 * IQR rule
q1 = wine_df['Price'].quantile(0.25)
q3 = wine_df['Price'].quantile(0.75)
iqr = q3 - q1

# Anything outside [q1 - 1.5*IQR, q3 + 1.5*IQR] is treated as an outlier
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep only in-range rows. .copy() makes the result an independent frame, so
# assigning columns on it afterwards does not raise SettingWithCopyWarning.
wine_df = wine_df[(wine_df['Price'] >= lower_bound) & (wine_df['Price'] <= upper_bound)].copy()

# Safety net: fill any missing 'Rating' with the column mean.
# This is a no-op when rows with nulls have already been dropped.
if wine_df['Rating'].isnull().any():
    wine_df['Rating'] = wine_df['Rating'].fillna(wine_df['Rating'].mean())


In [None]:
# Split 'Region' on the first ' / ' into a country part and the remainder.
# reindex(columns=[0, 1]) guarantees both result columns exist: when no value
# contains the separator, str.split(expand=True) yields a single column and a
# direct two-column assignment would raise a ValueError.
region_parts = (
    wine_df['Region']
    .str.split(' / ', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# One boolean column per food: True when the food name occurs anywhere in the
# text form of the row's 'Food pairings' value (plain substring match).
food_items = ['Beef', 'Pasta', 'Lamb', 'Poultry', 'Cheese', 'Fish', 'Seafood']  # TODO: add all 21 items
pairings_text = wine_df['Food pairings'].astype(str)
food_flags = {food: pairings_text.str.contains(food, regex=False) for food in food_items}

# assign() returns a new frame, so this works without SettingWithCopyWarning
# even when wine_df is the result of an earlier boolean filter.
wine_df = wine_df.assign(
    Country=region_parts[0],
    Country_region=region_parts[1],
    **food_flags,
)

# Preview the frame with the new columns
print(wine_df.head())


In [None]:
# Drop 'Region'. errors='ignore' keeps this cell safe to re-run once the
# column is already gone (a plain drop would raise KeyError the second time).
wine_df = wine_df.drop(columns=['Region'], errors='ignore')

# Final structure. info() prints its own report and returns None, so it is
# not wrapped in print().
wine_df.info()


In [None]:
# Per-column count of missing values (isna is the same check as isnull)
missing_per_column = wine_df.isna().sum()
print(missing_per_column)


In [None]:
# Drop every row that still has a missing value in any column. The result is
# rebound to wine_df instead of using inplace=True, so the frame is not mutated.
# NOTE(review): if 'Country_region' was produced by splitting 'Region' on
# ' / ', rows without that separator have a null there and are removed here
# as well - confirm that this is intended.
wine_df = wine_df.dropna()
print(wine_df.isnull().sum())




## Detect outliers

Box plots of `Price` and `Rating` make any remaining extreme values visible.

In [None]:
import matplotlib.pyplot as plt
# Box plot of the two columns of interest, drawn on the current axes
price_and_rating = wine_df[['Price', 'Rating']]
price_and_rating.boxplot()
plt.show()


## Handle outliers

Keep only the rows whose `Price` lies within 1.5 × IQR of the first and third quartiles.

In [None]:
# Quartiles of 'Price' on the current frame, computed in a single call
q1, q3 = wine_df['Price'].quantile([0.25, 0.75])
iqr = q3 - q1

# Limits of the 1.5 * IQR rule
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep rows whose price lies inside the limits (both ends inclusive)
price_in_range = wine_df['Price'].between(lower_bound, upper_bound)
wine_df = wine_df[price_in_range]


In [None]:
# Same box plot again, now on the filtered frame
price_and_rating = wine_df[['Price', 'Rating']]
price_and_rating.boxplot()
plt.show()


In [None]:
# Structure and summary statistics of the frame. info() prints its own report
# and returns None, so it is called directly to avoid a stray "None" line.
wine_df.info()
print(wine_df.describe())


In [None]:
# Step 2: Handle 'Food pairings' to create binary columns
# Collect the distinct food names across all rows.
# Each text cell is assumed to hold a Python-literal collection such as
# "['Beef', 'Lamb']" - TODO confirm against the CSV files.
# SECURITY: ast.literal_eval is used rather than eval() so that cell contents
# coming from the data files are parsed as literals and never executed as code.
food_items = set()
for pairing in wine_df['Food pairings']:
    if isinstance(pairing, str):  # skip NaN and other non-text cells
        food_items.update(ast.literal_eval(pairing))

# Sorted so the list order is the same on every run; the iteration order of a
# set of strings is not stable across interpreter sessions.
food_items = sorted(food_items)
