In [None]:
import pandas as pd
from pathlib import Path

def load_country_csv(csv_file: Path) -> pd.DataFrame:
    """Read one CSV file and return a tidied frame tagged with its country.

    Steps, in order:
      1. Parse the file as comma-separated UTF-8 text.
      2. Drop every column whose name starts with "Unnamed" (pandas gives
         that name to header-less columns, e.g. a saved index column).
      3. Strip leading/trailing whitespace from the remaining column names.
      4. Add a ``country`` column holding the file stem with its first
         character upper-cased and the rest lower-cased.

    Parameters
    ----------
    csv_file : Path
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The cleaned rows of the file plus the ``country`` column.
    """
    # Assume comma-separated (standard CSV)
    df = pd.read_csv(csv_file, encoding="utf-8", sep=",")

    # Remove unnamed index columns (e.g., from Excel). The explicit copy
    # makes the result an independent frame, so the column assignment below
    # never writes into a slice of the frame returned by read_csv.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

    # Clean up column names
    df.columns = df.columns.str.strip()

    # Add a country column based on the file name
    df['country'] = csv_file.stem.capitalize()
    return df


# Define the path to the CSV folder
csv_path = Path("data")

# Only the first 8 CSV files (in sorted path order) are loaded.
csv_files = sorted(csv_path.glob("*.csv"))[:8]

dataframes = []
for csv_file in csv_files:
    print(f"Loading: {csv_file.name}")
    dataframes.append(load_country_csv(csv_file))

# Combine all DataFrames once, after the loop. Concatenating inside the loop
# would rebuild the growing frame (and re-print it) for every single file.
if dataframes:
    wine_df = pd.concat(dataframes, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep wine_df defined so that later
    # cells do not fail with a NameError when the folder holds no CSV files.
    print(f"No CSV files found in '{csv_path}'; wine_df is empty.")
    wine_df = pd.DataFrame()

# Print the final result
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
    print(wine_df)


In [None]:
# Inspect the combined wine_df, then drop duplicate rows and rows with
# missing values. wine_df is rebound to the cleaned frame at the end.

# 1. Examine structure and data types
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
wine_df.info()

# Show basic stats
print("\nSummary Statistics:")
print(wine_df.describe(include='all'))

# Show unique columns
print("\nColumns in the DataFrame:")
print(wine_df.columns.tolist())

# 2. Remove duplicates
before = wine_df.shape[0]
wine_df = wine_df.drop_duplicates()
after = wine_df.shape[0]
print(f"\nRemoved {before - after} duplicate rows.")

# 3. Remove rows with any null values (NaNs)
# Count individual missing cells (summed over all columns) before dropping.
null_count = wine_df.isnull().sum().sum()
print(f"\nTotal missing values before cleaning: {null_count}")

# A row is dropped if at least one of its columns is missing.
wine_df = wine_df.dropna()

print(f"Final shape after cleaning: {wine_df.shape}")

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
    print(wine_df)