# Dataset Cleaning

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
import numpy as np

In [2]:
def clean_wine_data(file_path, output_path='processed_wine_quality.csv'):
    """
    Cleans the wine quality dataset by fixing the column headers and reformatting the data.

    The raw file is expected to be ';'-delimited with a single header row whose
    column names may carry stray double quotes. The header is parsed by hand
    (quotes stripped) and the remaining rows are loaded with pandas using those
    cleaned names.

    Parameters:
    file_path (str): The file path to the original wine quality data CSV.
    output_path (str): Where the cleaned, comma-separated CSV is written.
        Defaults to 'processed_wine_quality.csv' in the working directory.

    Returns:
    None: This function prints the first rows and saves the cleaned data into a CSV file.

    Raises:
    ValueError: If the input file is empty or contains only whitespace.
    """

    # Read the whole file once; the same text is used for the header and the data rows
    with open(file_path, 'r') as file:
        raw_text = file.read()

    # Fail early with a clear message instead of an IndexError on the header lookup
    if not raw_text.strip():
        raise ValueError(f"'{file_path}' is empty; no header row to parse.")

    # Manually process the header line: split on ';' and remove extra quotes
    header_line = raw_text.split('\n', 1)[0]
    fixed_columns = [col.replace('"', '') for col in header_line.strip().split(';')]

    # Reload the data from memory with the corrected column names;
    # skiprows=1 drops the original (unclean) header row
    data_corrected = pd.read_csv(
        io.StringIO(raw_text),
        delimiter=';',
        names=fixed_columns,
        skiprows=1,
    )

    # Display the data to ensure it's loaded correctly
    print(data_corrected.head())

    # Save the processed DataFrame as a CSV file (without the positional index)
    data_corrected.to_csv(output_path, index=False)

In [3]:
# Clean the raw CSV; the result is written to 'processed_wine_quality.csv'
# in the current working directory.
clean_wine_data('winequality-red-3.csv')

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [4]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import numpy as np

def explore_wine_data(csv_path):
    """
    Explore the wine quality data through various visualizations and save the results in structured folders.

    All output goes under 'wine_quality_exploration/' in the working directory,
    with one subfolder per plot type plus an (initially empty) 'combination'
    folder. The data must contain a 'quality' column; the pair plot additionally
    needs 'alcohol', 'volatile acidity' and 'sulphates'.

    Parameters:
    csv_path (str): The file path to the processed wine quality data CSV.

    Returns:
    None: This function generates and saves plots and CSV files for data exploration.

    Raises:
    ValueError: If the data has no 'quality' column.
    """

    # Read the cleaned data from the CSV file
    data = pd.read_csv(csv_path)

    # Every per-quality plot and the averages table group on this column,
    # so reject unusable input before any output is written
    if 'quality' not in data.columns:
        raise ValueError("Input data must contain a 'quality' column.")

    # Select feature columns by name rather than assuming 'quality' is last
    feature_columns = [column for column in data.columns if column != 'quality']

    # Define main directory for plots
    main_plots_dir = 'wine_quality_exploration'

    # Define subdirectories for different types of plots
    subfolders = {
        'distributions': 'Distribution Plots',
        'correlations': 'Correlation Matrix',
        'boxplots': 'Boxplots by Quality',
        'violin_plots': 'Violin Plots by Quality',
        'log_transforms': 'Log Transformed Distributions',
        'pair_plots': 'Pair Plots',
        'averages': 'Average Values by Quality'
    }

    # Create the main directory, its subdirectories and the combination directory.
    # exist_ok=True makes re-runs safe without a separate existence check.
    os.makedirs(main_plots_dir, exist_ok=True)
    for folder in subfolders.values():
        os.makedirs(os.path.join(main_plots_dir, folder), exist_ok=True)
    os.makedirs(os.path.join(main_plots_dir, 'combination'), exist_ok=True)

    # Generate and save plots for each data exploration method
    # Distribution plots (one per column, including 'quality')
    for column in data.columns:
        plt.figure(figsize=(10, 4))
        sns.histplot(data[column], kde=True, element="step", color='blue')
        plt.title(f'Distribution of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.grid(True)
        plot_path = os.path.join(main_plots_dir, subfolders['distributions'], f'{column}_distribution.png')
        plt.savefig(plot_path)
        plt.close()

    # Correlation matrix plot
    correlation_matrix = data.corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title('Correlation Matrix of Wine Quality Attributes')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    heatmap_path = os.path.join(main_plots_dir, subfolders['correlations'], 'correlation_matrix_heatmap.png')
    plt.savefig(heatmap_path)
    plt.close()

    # Boxplots and violin plots of every feature, grouped by wine quality
    for column in feature_columns:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x='quality', y=column, data=data)
        plt.title(f'Boxplot of {column} by Wine Quality')
        boxplot_path = os.path.join(main_plots_dir, subfolders['boxplots'], f'{column}_boxplot.png')
        plt.savefig(boxplot_path)
        plt.close()

        plt.figure(figsize=(8, 4))
        sns.violinplot(x='quality', y=column, data=data)
        plt.title(f'Violin Plot of {column} by Wine Quality')
        violinplot_path = os.path.join(main_plots_dir, subfolders['violin_plots'], f'{column}_violinplot.png')
        plt.savefig(violinplot_path)
        plt.close()

    # Log transformed histograms.
    # NOTE: the log_<column> series are stored on `data`, so they are also
    # included in the per-quality averages computed below.
    for column in feature_columns:
        data[f'log_{column}'] = np.log1p(data[column])
        plt.figure(figsize=(10, 4))
        sns.histplot(data[f'log_{column}'], kde=True, element="step", bins=30)
        plt.title(f'Log Transformed Distribution of {column}')
        log_dist_path = os.path.join(main_plots_dir, subfolders['log_transforms'], f'log_{column}_distribution.png')
        plt.savefig(log_dist_path)
        plt.close()

    # Pair plots for selected features
    sns.pairplot(data[['alcohol', 'volatile acidity', 'sulphates', 'quality']], hue='quality', corner=True)
    pairplot_path = os.path.join(main_plots_dir, subfolders['pair_plots'], 'pairplot_selected_features.png')
    plt.savefig(pairplot_path)
    plt.close()

    # Calculate averages by quality and save to CSV.
    # The group key ('quality') lives in the index after groupby().mean(), so the
    # index must be written; otherwise the rows cannot be matched to a quality.
    quality_grouped = data.groupby('quality').mean()
    quality_means_path = os.path.join(main_plots_dir, subfolders['averages'], 'quality_grouped_means.csv')
    quality_grouped.to_csv(quality_means_path)

def combine_plots_to_montage(source_folder, output_folder, output_filename, columns=6):
    """
    Combine all .png images in a source folder into a single montage image and save it to the specified output folder.

    Images are tiled left-to-right, top-to-bottom in sorted filename order. The
    grid cell size is taken from the first image; other images are pasted at
    their cell's top-left corner without resizing.

    Parameters:
    source_folder (str): The folder containing the individual .png image files.
    output_folder (str): The folder where the combined image will be saved (created if missing).
    output_filename (str): The filename for the combined image.
    columns (int): Number of images per row in the montage. Must be at least 1.

    Returns:
    None: The function saves the combined image as a file.

    Raises:
    ValueError: If columns is less than 1.
    """
    # A non-positive column count would divide by zero or give a nonsensical layout
    if columns < 1:
        raise ValueError(f"columns must be at least 1, got {columns}")

    # List all PNG files in the source folder; sort because os.listdir order is
    # arbitrary and the tile order should be reproducible
    plot_files = sorted(
        os.path.join(source_folder, name)
        for name in os.listdir(source_folder)
        if name.endswith('.png')
    )
    if not plot_files:
        print("No PNG files found in the directory.")
        return

    # Determine the size of each grid cell from the first image
    with Image.open(plot_files[0]) as first_image:
        width, height = first_image.size

    # Calculate the dimensions of the montage image
    num_images = len(plot_files)
    rows = (num_images + columns - 1) // columns  # Ceiling division
    montage_width = width * columns
    montage_height = height * rows

    # Create a new image to hold the montage
    montage_image = Image.new('RGB', (montage_width, montage_height))

    # Paste images into the montage one at a time, closing each file handle
    # as soon as its pixels have been copied
    for i, image_path in enumerate(plot_files):
        x = (i % columns) * width
        y = (i // columns) * height
        with Image.open(image_path) as image:
            montage_image.paste(image, (x, y))

    # Save the montage image, making sure the destination folder exists
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, output_filename)
    montage_image.save(output_path)
    print(f"Montage image saved to {output_path}")

In [5]:
# Generate all exploration plots from the cleaned CSV, then tile the
# distribution plots into a single montage image in the 'combination' folder.
explore_wine_data('processed_wine_quality.csv')
combine_plots_to_montage('wine_quality_exploration/Distribution Plots', 'wine_quality_exploration/combination', 'combined_distribution.png')

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mo

Montage image saved to wine_quality_exploration/combination\combined_distribution.png
