# Pre Data Preprocessing - Task [002]

## Purpose

1. Splitting text components for better granularity.
2. Normalizing numerical values to ensure consistency.
3. Removing unnecessary symbols or irrelevant text.

The preprocessing will address these specific attributes in the dataset:
- **Style**
- **Characteristics**
- **Price**
- **Capacity**
- **ABV (Alcohol by Volume)**
- **Vintage**

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import datetime
import re

# Load the raw wine dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
file_path = '../datasets/WineDataset.csv'
df = pd.read_csv(file_path)

def convert_to_liters(capacity):
    """Convert a capacity label such as '75CL', '750ML' or '1.5LTR' to liters.

    The unit is detected case-insensitively by substring. 'CL' and 'ML' are
    checked before the bare 'L' so centiliter/milliliter labels are not read
    as liters; 'LITRE' and 'LTR' both contain 'L', so one liter branch covers
    every liter spelling.

    Args:
        capacity: Raw capacity value. It is passed through ``str()`` first,
            so non-string inputs such as NaN are accepted.

    Returns:
        The capacity in liters as a float, or '' when no known unit is
        present or the numeric part cannot be parsed.
    """
    text = str(capacity).strip().upper()

    if 'CL' in text:  # Centiliters to liters
        divisor = 100
    elif 'ML' in text:  # Milliliters to liters
        divisor = 1000
    elif 'L' in text:  # Liters already ('L', 'LTR', 'LITRE', ...)
        divisor = 1
    else:
        return ''  # Unknown format

    try:
        return float(re.sub(r'[^\d.]', '', text)) / divisor
    except ValueError:
        # A unit was found but the remaining digits do not form a number
        # (e.g. 'CL' alone or '1.5.0L'); treat it like an unknown format.
        return ''

def _split_into_columns(df, column, strip_pattern, separator, prefix):
    """Expand delimited text in *column* into '<prefix> 1', '<prefix> 2', ... columns.

    Characters matching *strip_pattern* are removed, the text is split on
    *separator* and each piece is trimmed. Piece k lands in '<prefix> k';
    rows with fewer pieces (or non-text values) get ''. Returns a frame
    without the original *column*.
    """
    pieces = (
        df[column]
        .str.replace(strip_pattern, '', regex=True)
        .str.split(separator)
        .apply(lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts)
    )
    widest = pieces.apply(lambda parts: len(parts) if isinstance(parts, list) else 0).max()

    for position in range(1, widest + 1):
        # Bind `position` as a default so each lambda keeps its own value.
        df[f'{prefix} {position}'] = pieces.apply(
            lambda parts, position=position: parts[position - 1]
            if isinstance(parts, list) and len(parts) >= position
            else ''
        )

    return df.drop(columns=[column])


def _parse_vintage_year(value, current_year):
    """Return *current_year* for 'NV', else the first 4-digit run in *value*, else NaN."""
    text = str(value)
    if text.strip().upper() == 'NV':
        return current_year
    match = re.search(r'\d{4}', text)
    return int(match.group(0)) if match else np.nan


def preprocess_data(df):
    """Clean and normalise the wine dataset and return the result.

    Steps, each applied only when the relevant column exists:
      * 'Capacity' is converted to liters with ``convert_to_liters``.
      * 'Price', 'ABV' and 'Capacity' are stripped of non-numeric characters,
        min-max scaled and rounded to 3 decimals.
      * 'Style' (split on '&') and 'Characteristics' (split on ',') are
        expanded into numbered columns and the originals are dropped.
      * 'Vintage' is mapped to a 0..1 age score: 0 for the current year
        (and for 'NV'), 1 for the oldest year after 1900 in the data.

    Args:
        df: Raw dataset. The caller's frame is not modified.

    Returns:
        A new, preprocessed DataFrame.
    """
    # Work on a copy so the caller's frame is not left half-transformed.
    df = df.copy()

    numeric_cols = ['Price', 'ABV', 'Capacity']

    # Guarded like every other column so a missing 'Capacity' is not a KeyError.
    if 'Capacity' in df.columns:
        df['Capacity'] = df['Capacity'].apply(convert_to_liters)

    if df.empty:
        return df

    for col in numeric_cols:
        if col not in df.columns:
            continue

        # Remove non-numeric characters and convert to float
        df[col] = df[col].apply(lambda x: re.sub(r'[^\d.]', '', str(x)).strip() if str(x).strip() else np.nan)
        df[col] = pd.to_numeric(df[col], errors='coerce')

        if df[col].notnull().any():  # Only scale when there is valid data
            scaler = MinMaxScaler()
            df[col] = scaler.fit_transform(df[[col]])

        df[col] = df[col].round(3)

    # Clean and split the 'Style' column into 'Style 1', 'Style 2', ...
    if 'Style' in df.columns:
        df = _split_into_columns(df, 'Style', r'[^\w\s&]', '&', 'Style')

    # Clean and split the 'Characteristics' column into 'Characteristic 1', ...
    if 'Characteristics' in df.columns:
        df = _split_into_columns(df, 'Characteristics', r'[^\w\s,]', ',', 'Characteristic')

    # Clean and normalize the 'Vintage' column
    if 'Vintage' in df.columns:
        current_year = datetime.datetime.now().year
        df['Vintage'] = df['Vintage'].apply(lambda x: _parse_vintage_year(x, current_year))

        # Years at or before 1900 are ignored when looking for the oldest vintage.
        valid_years = df['Vintage'][df['Vintage'] > 1900]
        if not valid_years.empty:
            min_year = valid_years.min()
            max_year = current_year
            span = min_year - max_year

            def _age_score(year):
                if pd.isna(year):
                    return np.nan
                if span == 0:
                    # Oldest valid year is the current year: there is no range
                    # to scale over, so avoid dividing by zero.
                    return 0.0
                # 0 at the current year, 1 at the oldest valid year;
                # years after the current year are clamped to 0.
                return max(0.0, (year - max_year) / span)

            df['Vintage'] = df['Vintage'].apply(_age_score)

            # Round the 'Vintage' values to 2 decimal places
            df['Vintage'] = df['Vintage'].round(2)

    return df

# Run the preprocessing pipeline on the loaded dataset
df_cleaned = preprocess_data(df)

# Write the cleaned dataset to CSV (index column omitted) and preview its first rows
df_cleaned.to_csv('../datasets/cleaned_wines.csv', index=False)
df_cleaned.head()


# Pre Data Preprocessing - Task [92]

## Purpose

1. Merge `updated_wines.csv`, which contains the mean ratings, with `merged_wine_dataset.csv`, the result of Report 2 (id: 65), by adding the ratings from the first dataset to the second.

In [None]:
import pandas as pd

# Locations of the two inputs and of the combined output.
# file1 supplies the 'Ratings' column; file2 is the dataset being enriched.
file1 = "../datasets/updated_wines.csv"
file2 = "../datasets/merged_wine_dataset.csv"
output_file = "../datasets/PLNTD_dataset.csv"

df1, df2 = pd.read_csv(file1), pd.read_csv(file2)

# Left join on (WineName, WineryName): every row of df2 is kept, and rows
# without a match in df1 end up with a missing 'Ratings' value.
ratings_lookup = df1[['WineName', 'WineryName', 'Ratings']]
merged_df = pd.merge(df2, ratings_lookup, how='left', on=['WineName', 'WineryName'])

# Save the new dataset
merged_df.to_csv(output_file, index=False)
print(f"PLNTD_dataset created and saved to {output_file}")

# Testing purposes: report any rows that did not receive a rating
missing_ratings = merged_df.loc[merged_df['Ratings'].isna()]
if missing_ratings.empty:
    print("SUCCESS: All rows have a rating.")
else:
    print("WARNING: Some rows in the dataset are missing a rating.")
    print(missing_ratings)


# ELECTRE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define the criteria used by ELECTRE.
# The later concordance/discordance cells treat `benefit_criteria` as
# "higher is better" and `cost_criteria` as "lower is better".
benefit_criteria = ['ABV', 'Ratings', 'Body']
cost_criteria = ['Price', 'Acidity']
print("Benefit Criteria:", benefit_criteria)
print("Cost Criteria:", cost_criteria)

# Convert categorical 'Acidity' and 'Body' to numeric values (ordinal encoding)
acidity_mapping = {'Low': 1, 'Medium': 2, 'High': 3}
body_mapping = {'Very light-bodied': 1, 'Light-bodied': 2, 'Medium-bodied': 3, 'Full-bodied': 4, 'Very full-bodied': 5}

# Direct dictionary lookups: any value that is not a key of the mapping
# (including a missing value) raises KeyError.
# NOTE(review): re-running this cell fails for the same reason, because the
# columns then already hold the numeric codes.
merged_df['Acidity'] = [acidity_mapping[val] for val in merged_df['Acidity']]
merged_df['Body'] = [body_mapping[val] for val in merged_df['Body']]

# Strip the textual decoration ('ABV ' prefix and '%'; '£' and 'per bottle') and parse as float
merged_df['ABV'] = merged_df['ABV'].str.replace('ABV ', '').str.replace('%', '').astype(float)
merged_df['Price'] = merged_df['Price'].str.replace('£', '').str.replace('per bottle', '').astype(float)

# Keep only the criteria columns; every other column is dropped from merged_df
columns_to_keep = benefit_criteria + cost_criteria
merged_df = merged_df[columns_to_keep]


# Density plot of each criterion before scaling.
# NOTE(review): only the left slot of the 1x2 grid is drawn; the right half of each figure stays empty.
for column in merged_df.columns:
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    sns.kdeplot(merged_df[column])
    plt.title(f'Before normalization - {column}')
    plt.tight_layout()
    plt.show()

print(merged_df)


# Min-max scale every criterion column. MinMaxScaler with default settings
# maps each column to the [0, 1] range. The result is rebuilt as a DataFrame
# with the same column names and a fresh default (0..n-1) index.
print("Normalizing DataFrame:")
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(merged_df), columns=merged_df.columns)
print(normalized_df)

In [None]:
# Step 2: Calculate concordance matrix
print("Calculating Concordance Matrix...")

# Criterion weights (customize these based on your preferences); they sum to 1,
# so the concordance of a pair lies between 0 and 1.
weights = {
    'ABV': 0.15,
    'Ratings': 0.25,
    'Body': 0.2,
    'Price': 0.25,
    'Acidity': 0.15
}
# Thresholds for each criterion, expressed on the min-max normalized scale:
#   q = indifference threshold, p = preference threshold, v = veto threshold
thresholds = {
    'ABV': {
        'q': 0.05,  # Small ABV differences matter
        'p': 0.15,  # Clear preference for notably different ABVs
        'v': 0.3    # Veto for very different ABVs
    },
    'Ratings': {
        'q': 0.02,  # Even small rating differences matter
        'p': 0.1,   # Clear preference for better rated wines
        'v': 0.25   # Strong veto for very different ratings
    },
    'Body': {
        'q': 0.1,   # Some body difference can be negligible
        'p': 0.2,   # Clear preference for desired body
        'v': 0.4    # Veto only for extreme differences
    },
    'Price': {
        'q': 0.03,  # Small price differences matter
        'p': 0.12,  # Clear preference for better value
        'v': 0.3    # Veto for very expensive differences
    },
    'Acidity': {
        'q': 0.15,  # Less sensitive to acidity differences
        'p': 0.25,  # Need bigger difference for preference
        'v': 0.5    # Veto only for extreme differences
    }
}

# Get the number of alternatives
alternatives = normalized_df.index
n = len(alternatives)

# Accumulate the weighted partial concordance of every criterion.
# Each criterion is handled for all (i, j) pairs at once via broadcasting,
# replacing a per-pair Python loop that built two Series per comparison.
concordance_values = np.zeros((n, n))
for criterion in normalized_df.columns:
    scores = normalized_df[criterion].to_numpy()
    # diff[i, j] = value of alternative i minus value of alternative j
    diff = scores[:, None] - scores[None, :]

    q = thresholds[criterion]['q']  # indifference threshold
    p = thresholds[criterion]['p']  # preference threshold
    weight = weights[criterion]

    if criterion in benefit_criteria:
        # Full weight unless i is worse than j by more than q;
        # zero once it is worse by more than p; linear in between.
        partial = np.where(
            diff >= -q,
            weight,
            np.where(diff < -p, 0.0, weight * (diff + p) / (p - q)),
        )
    else:  # cost criteria: a larger value is worse, so the signs flip
        partial = np.where(
            diff <= q,
            weight,
            np.where(diff > p, 0.0, weight * (p - diff) / (p - q)),
        )

    concordance_values += partial

# An alternative is not compared with itself
np.fill_diagonal(concordance_values, 0.0)

concordance_matrix = pd.DataFrame(concordance_values, index=alternatives, columns=alternatives)


In [None]:
# Step 3: Calculate discordance matrix
print("Calculating Discordance Matrix...")
# Range (d_k) of each criterion; not used by the computation below
ranges = normalized_df.max() - normalized_df.min()

# One (n x n) discordance matrix per criterion, keyed by criterion name
discordance_matrices = {}

for criterion in normalized_df.columns:
    v = thresholds[criterion]['v']  # veto threshold
    p = thresholds[criterion]['p']  # preference threshold

    scores = normalized_df[criterion].to_numpy()
    # diff[i, j] = value of alternative i minus value of alternative j
    diff = scores[:, None] - scores[None, :]

    # Amount by which i is worse than j on this criterion:
    # lower is worse for benefit criteria, higher is worse for cost criteria.
    shortfall = -diff if criterion in benefit_criteria else diff

    # 0 below the preference threshold, then rising linearly and capped at 1
    # once the veto threshold is reached. All pairs are computed at once;
    # the diagonal is 0 because a zero shortfall never reaches p.
    discordance_matrix = np.where(
        shortfall >= p,
        np.minimum(1, (shortfall - p) / (v - p)),
        0.0,
    )

    discordance_matrices[criterion] = discordance_matrix


In [None]:
# Calculate credibility matrix
print("Calculating Credibility Matrix...")

# Credibility starts from the concordance and is weakened by every criterion
# whose discordance exceeds it:
#     credibility = C * prod over {k : d_k > C} of (1 - d_k) / (1 - C)
# A discordance of 1 (veto reached) therefore drives the credibility to 0,
# while partial discordance only reduces it. Previously any d_k > C set the
# credibility straight to 0, which left the product above unreachable.
conc = np.asarray(concordance_matrix, dtype=float)

weakening = np.ones_like(conc)
for disc in discordance_matrices.values():
    # Where C == 1 the ratio divides by zero, but d_k <= 1 can never exceed
    # such a C, so those cells are discarded by the mask below.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = (1 - disc) / (1 - conc)
    weakening *= np.where(disc > conc, ratio, 1.0)

credibility_matrix = conc * weakening
# An alternative is not compared with itself
np.fill_diagonal(credibility_matrix, 0.0)

In [None]:
# Step 4: Generate dominance matrix
print("Generating Dominance Matrix...")
c_threshold = 0.6  # Example threshold for concordance
d_threshold = 0.4  # Example threshold for discordance
print(f"Concordance Threshold: {c_threshold}, Discordance Threshold: {d_threshold}")

# Work on array views without rebinding `concordance_matrix`, so cells that
# expect it to be a DataFrame can still be re-run afterwards.
concordance_values = np.asarray(concordance_matrix)

# The discordance of a pair is the largest discordance over all criteria.
# (`discordance_matrix` alone only holds the last criterion processed by the
# discordance cell, so it must not be used as the overall discordance.)
overall_discordance = np.maximum.reduce(list(discordance_matrices.values()))

# Get the number of alternatives
n = concordance_values.shape[0]

# i dominates j when the concordance is high enough and no criterion
# objects too strongly.
dominance_matrix = (
    (concordance_values >= c_threshold) & (overall_discordance <= d_threshold)
).astype(int)
np.fill_diagonal(dominance_matrix, 0)  # Exclude self-comparison

In [None]:
# Score each alternative by how many others it dominates (row sums of the 0/1 dominance matrix)
dominance_scores = dominance_matrix.sum(axis=1)
print("Dominance Scores:")
print(dominance_scores)

# Negating the scores makes argsort return positions from highest to lowest score
ranked_alternatives = np.argsort(-dominance_scores)  # Descending order
print("Ranked Alternatives (Best to Worst):")
print(ranked_alternatives)

In [66]:
# Distillation procedures
def calculate_qualification_score(credibility_matrix, cutoff):
    """Return strength minus weakness for every alternative.

    Strength of i is the number of entries in row i above *cutoff*;
    weakness of i is the number of entries in column i above *cutoff*.

    Args:
        credibility_matrix: Square 2-D NumPy array of pairwise credibility.
        cutoff: Entries strictly greater than this value are counted.

    Returns:
        1-D float array with one qualification score per alternative.
    """
    outranks = credibility_matrix > cutoff
    strength = outranks.sum(axis=1)   # how many alternatives i outranks
    weakness = outranks.sum(axis=0)   # how many alternatives outrank i
    return (strength - weakness).astype(float)

# Descending distillation
def descending_distillation(credibility_matrix):
    """Rank alternatives best-first by repeatedly extracting the top qualifiers.

    In each round the qualification scores of the remaining alternatives are
    computed at a cutoff of 0.7. While all scores are equal, the cutoff is
    lowered in steps of 0.05 until it would drop below 0. The alternatives
    with the highest score are appended to the ranking and removed.

    Args:
        credibility_matrix: Square 2-D NumPy array of pairwise credibility.

    Returns:
        List of alternative positions, best first.
    """
    ordered = []
    pending = list(range(len(credibility_matrix)))

    while pending:
        # Credibility restricted to the alternatives still in play
        submatrix = credibility_matrix[np.ix_(pending, pending)]

        cutoff = 0.7  # Initial cutoff value
        scores = calculate_qualification_score(submatrix, cutoff)
        while not (max(scores) - min(scores) > 0):
            cutoff -= 0.05
            if cutoff < 0:
                break  # Nothing separates them; keep the last scores
            scores = calculate_qualification_score(submatrix, cutoff)

        best = max(scores)
        winners = [alt for alt, score in zip(pending, scores) if score == best]
        ordered.extend(winners)
        pending = [alt for alt in pending if alt not in winners]

    return ordered

# Ascending distillation
def ascending_distillation(credibility_matrix):
    """Rank alternatives by repeatedly extracting the worst qualifiers.

    Uses the same cutoff search as the descending procedure (start at 0.7,
    lower by 0.05 while all scores are equal, stop below 0). Each round
    removes the alternatives with the lowest score and places them in front
    of the alternatives removed earlier.

    Args:
        credibility_matrix: Square 2-D NumPy array of pairwise credibility.

    Returns:
        List of alternative positions; the first group removed ends up last.
    """
    order = []
    candidates = list(range(len(credibility_matrix)))

    while candidates:
        # Credibility restricted to the alternatives still in play
        block = credibility_matrix[np.ix_(candidates, candidates)]

        cutoff = 0.7
        qualification = calculate_qualification_score(block, cutoff)
        while not (max(qualification) - min(qualification) > 0):
            cutoff -= 0.05
            if cutoff < 0:
                break  # Nothing separates them; keep the last scores
            qualification = calculate_qualification_score(block, cutoff)

        worst = min(qualification)
        losers = [alt for alt, score in zip(candidates, qualification) if score == worst]
        order = losers + order
        candidates = [alt for alt in candidates if alt not in losers]

    return order

# Perform both distillations
descending_ranking = descending_distillation(credibility_matrix)
ascending_ranking = ascending_distillation(credibility_matrix)

# Position of every alternative in each ranking. Built once, so the loop
# below does not have to scan both lists with list.index for each alternative.
descending_position = {alt: pos for pos, alt in enumerate(descending_ranking)}
ascending_position = {alt: pos for pos, alt in enumerate(ascending_ranking)}

# Final ranking: mean of the positions in the two rankings (lower is better).
# Entries are (alternative position, mean position) tuples.
final_ranking = [
    (alt, (descending_position[alt] + ascending_position[alt]) / 2)
    for alt in range(len(credibility_matrix))
]

# The sort is stable, so ties keep the original alternative order
final_ranking.sort(key=lambda x: x[1])

In [None]:
# Improved visualization function
def visualize_electre_results(concordance_matrix, credibility_matrix, final_ranking, normalized_df):
    """Plot a five-panel ELECTRE summary and print the ten best alternatives.

    Panels: concordance heatmap, credibility heatmap, bar chart of the final
    ranking scores, criteria values of the top five alternatives, and a
    histogram of qualification scores.

    Args:
        concordance_matrix: Pairwise concordance values (drawn as a heatmap).
        credibility_matrix: Pairwise credibility values; also passed to
            ``calculate_qualification_scores``, which indexes it as
            ``[i, j]`` (i.e. expects a 2-D NumPy array).
        final_ranking: List of ``(alternative_position, score)`` tuples,
            ordered best first.
        normalized_df: Criteria table; alternatives are looked up by row
            position.
    """
    plt.figure(figsize=(20, 10))

    # Concordance matrix heatmap
    plt.subplot(231)
    sns.heatmap(concordance_matrix, cmap='YlOrRd', center=0.5)
    plt.title('Concordance Matrix')

    # Credibility matrix heatmap
    plt.subplot(232)
    sns.heatmap(credibility_matrix, cmap='YlOrRd')
    plt.title('Credibility Matrix')

    # Ranking visualization
    plt.subplot(233)
    ranking_df = pd.DataFrame([x[1] for x in final_ranking],
                            index=[x[0] for x in final_ranking])
    sns.barplot(x=ranking_df.index, y=ranking_df[0], color='skyblue')
    plt.title('Final Ranking (lower is better)')
    plt.xticks(rotation=45)

    # Distribution of criteria values for top 5 alternatives
    plt.subplot(234)
    top_5 = [x[0] for x in final_ranking[:5]]
    # The ranking stores row positions, so select by position (.iloc), the
    # same way the printed details below do, rather than by index label.
    top_rows = normalized_df.iloc[top_5]
    for criterion in normalized_df.columns:
        plt.plot(top_rows[criterion], label=criterion)
    plt.legend()
    plt.title('Criteria Values for Top 5 Alternatives')

    # Qualification scores distribution
    plt.subplot(235)
    qualification_scores = calculate_qualification_scores(credibility_matrix)
    sns.histplot(qualification_scores, bins=20)
    plt.title('Distribution of Qualification Scores')

    plt.tight_layout()
    plt.show()

    # Print detailed ranking information
    print("\nDetailed Ranking Information:")
    print("Top 10 Alternatives:")
    for rank, (alt_idx, score) in enumerate(final_ranking[:10], 1):
        print(f"Rank {rank}:")
        print(f"  Alternative {alt_idx}")
        print(f"  Score: {score:.3f}")
        print("  Criteria values:")
        for criterion in normalized_df.columns:
            print(f"    {criterion}: {normalized_df.iloc[alt_idx][criterion]:.3f}")
        print()
    
def calculate_qualification_scores(credibility_matrix, cutoff=0.6):
    """Return one qualification score (wins minus losses) per alternative.

    A "win" for alternative i is an entry in row i strictly above *cutoff*;
    a "loss" is an entry in column i strictly above *cutoff*.

    Args:
        credibility_matrix: Square 2-D NumPy array of pairwise credibility.
        cutoff: Threshold an entry must exceed to be counted (default 0.6).

    Returns:
        1-D float array of qualification scores.
    """
    above = credibility_matrix > cutoff
    wins = np.count_nonzero(above, axis=1)
    losses = np.count_nonzero(above, axis=0)
    return (wins - losses).astype(float)

# Draw the summary figure and print the details of the top-ranked alternatives
visualize_electre_results(concordance_matrix, credibility_matrix, final_ranking, normalized_df)

In [None]:
# Sensitivity Analysis
def perform_sensitivity_analysis(base_weights, num_variations=10, std=0.05, min_weight=0.01, seed=None):
    """Generate randomly perturbed, re-normalised copies of the criterion weights.

    Each variation adds independent Gaussian noise to every weight, floors
    the result at *min_weight* and rescales so the weights sum to 1. Only
    the weight sets are produced; the ELECTRE ranking is NOT recomputed
    here, so callers must re-run it with each returned set themselves.

    Args:
        base_weights: Mapping of criterion name to weight.
        num_variations: Number of perturbed weight sets to generate.
        std: Standard deviation of the Gaussian noise added to each weight.
        min_weight: Lower bound applied to each perturbed weight before
            re-normalisation; must be positive so the total cannot be zero.
        seed: Optional seed for a dedicated random generator, making the
            output reproducible. When None, NumPy's global random state is
            used.

    Returns:
        List of ``num_variations`` dicts with the same keys (and key order)
        as *base_weights*, each summing to 1.

    Raises:
        ValueError: If *min_weight* is not positive.
    """
    if min_weight <= 0:
        raise ValueError("min_weight must be positive")

    # Create the generator once so successive variations differ.
    draw_noise = np.random.normal if seed is None else np.random.default_rng(seed).normal

    results = []
    for _ in range(num_variations):
        # Create variation in weights
        noise = draw_noise(0, std, len(base_weights))
        perturbed = {
            name: max(min_weight, weight + delta)
            for (name, weight), delta in zip(base_weights.items(), noise)
        }

        # Normalize to sum to 1
        total = sum(perturbed.values())
        results.append({name: weight / total for name, weight in perturbed.items()})

    return results

# Generate perturbed weight sets from the base ELECTRE weights, using the
# function's default number of variations. Only the weights are varied here;
# no ranking is recomputed.
sensitivity_results = perform_sensitivity_analysis(weights)
print("\nSensitivity Analysis Results:")
print("Number of weight variations tested:", len(sensitivity_results))

In [None]:
#   import pandas as pd
#   import numpy as np
#   from sklearn import preprocessing
#   import matplotlib.pyplot as plt
#   import seaborn as sns
#   
#   # Your initial data preparation
#   benefit_criteria = ['ABV', 'Ratings', 'Body']
#   cost_criteria = ['Price', 'Acidity']
#   all_criteria = benefit_criteria + cost_criteria
#   
#   # Create direction dictionary (1 for benefit/maximization, 0 for cost/minimization)
#   direction = {}
#   for criterion in benefit_criteria:
#       direction[criterion] = 1
#   for criterion in cost_criteria:
#       direction[criterion] = 0
#   
#   # Copy the dataframe for normalization
#   n_table = merged_df.copy()
#   
#   # Using normalization rule 2 (adapted from the original code)
#   denom = dict(merged_df.apply(lambda x: x.max() - x.min()))
#   _min = dict(merged_df.apply(lambda x: x.min()))
#   _max = dict(merged_df.apply(lambda x: x.max()))
#   
#   print(_min)
#   print(_max)
#   
#   # Normalize each column based on whether it's a benefit or cost criterion
#   for column in merged_df.columns:
#       if direction[column]:  # benefit criteria (maximize)
#           n_table[column] = merged_df[column].apply(lambda x: (x - _min[column]) / denom[column])
#       else:  # cost criteria (minimize)
#           n_table[column] = merged_df[column].apply(lambda x: (_max[column] - x) / denom[column])
#   
#   _n_denom = dict(n_table.apply(lambda x: x.max() - x.min()))
#   _n_min = dict(n_table.apply(lambda x: x.min()))
#   _n_max = dict(n_table.apply(lambda x: x.max()))
#   
#   print(_n_denom)
#   print(_n_min)
#   print(_n_max)
#   
#   # Visualization of before and after normalization
#   for column in merged_df.columns:
#       plt.figure(figsize=(12, 4))
#       plt.subplot(1, 2, 1)
#       sns.kdeplot(merged_df[column])
#       plt.title(f'Before normalization - {column}')
#       plt.subplot(1, 2, 2)
#       sns.kdeplot(n_table[column])
#       plt.title(f'After normalization - {column}')
#       plt.tight_layout()
#       plt.show()
#   
#   # Define weights (equal weights for simplicity, can be modified)
#   weights = {criterion: 1/len(all_criteria) for criterion in all_criteria}
#   
#   # Calculate concordance matrix
#   c_matrix = pd.DataFrame(columns=n_table.index, index=n_table.index)
#   
#   for option in n_table.index:
#       for option2 in n_table.index:
#           _sum = 0
#           for criterion in n_table.columns:
#               if n_table.loc[option, criterion] > n_table.loc[option2, criterion]:
#                   _sum += weights[criterion]
#               elif np.isclose(n_table.loc[option, criterion], n_table.loc[option2, criterion]):
#                   _sum += 0.5 * weights[criterion]
#           if option == option2:
#               c_matrix.loc[option, option2] = 0
#           else:
#               c_matrix.loc[option, option2] = _sum
#   
#   # Calculate discordance matrix
#   d_matrix = pd.DataFrame(columns=n_table.index, index=n_table.index)
#   
#   for option in n_table.index:
#       for option2 in n_table.index:
#           diffs = list(n_table.loc[option, :] - n_table.loc[option2, :])
#           if not any(diffs):
#               _discordance_index = 0
#           else:
#               n_diffs = [x for x in diffs if x < 0]
#               if not n_diffs:
#                   num = 0
#               else:
#                   num = max(np.abs(n_diffs))
#               denom = max(np.abs(diffs))
#               _discordance_index = num / denom
#           d_matrix.loc[option, option2] = _discordance_index
#   
#   # Calculate aggregated dominance matrix
#   a_matrix = pd.DataFrame(columns=c_matrix.columns, index=c_matrix.index)
#   
#   # Define thresholds (can be modified)
#   concordance_threshold = 0.7
#   discordance_threshold = 0.3
#   
#   for option in c_matrix.columns:
#       for option2 in c_matrix.index:
#           a_matrix.loc[option, option2] = 1 if (c_matrix.loc[option, option2] >= concordance_threshold and 
#                                               d_matrix.loc[option, option2] <= discordance_threshold) else 0
#   
#   # Calculate net dominance scores
#   dominance_scores = a_matrix.sum(axis=1) - a_matrix.sum(axis=0)
#   rankings = dominance_scores.sort_values(ascending=False)
#   
#   print("\nFinal Rankings:")
#   print(rankings)

In [None]:
#   credibility_matrix = np.zeros((num_alternatives, num_alternatives))
#   
#   for i in range(num_alternatives):
#       for j in range(num_alternatives):
#           if i != j:
#               if discordance_matrix[i][j] > concordance_matrix[i][j]:
#                   credibility_matrix[i][j] = concordance_matrix[i][j] * \
#                       (1 - discordance_matrix[i][j]) / (1 - concordance_matrix[i][j])
#               else:
#                   credibility_matrix[i][j] = concordance_matrix[i][j]
#   
#   credibility_df = pd.DataFrame(
#       credibility_matrix,
#       index=alternatives,
#       columns=alternatives
#   )
#   print("\nCredibility Matrix:")
#   print(credibility_df)
#   
#   # Step 4: Calculate Net Credibility Scores
#   net_scores = np.zeros(num_alternatives)
#   
#   for i in range(num_alternatives):
#       outgoing = np.sum(credibility_matrix[i, :])
#       incoming = np.sum(credibility_matrix[:, i])
#       net_scores[i] = outgoing - incoming

In [None]:
# Build the inputs for the plots from the live ELECTRE results. These names
# were previously only assigned in commented-out cells, so this cell raised
# NameError when the notebook was run top to bottom.
credibility_df = pd.DataFrame(credibility_matrix, index=alternatives, columns=alternatives)

# Net credibility of an alternative: how strongly it outranks the others
# (row sum) minus how strongly it is outranked (column sum).
net_scores = credibility_matrix.sum(axis=1) - credibility_matrix.sum(axis=0)

# Alternatives ordered from highest to lowest net credibility
rankings = pd.Series(net_scores, index=alternatives).sort_values(ascending=False)

# 1. Heatmap of Credibility Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(credibility_df, annot=True, cmap='YlOrRd', center=0)
plt.title('Credibility Matrix Heatmap')
plt.tight_layout()
plt.show()

# 2. Bar plot of Net Scores
plt.figure(figsize=(10, 6))
plt.bar(alternatives, net_scores)
plt.title('Net Credibility Scores by Alternative')
plt.xlabel('Alternatives')
plt.ylabel('Net Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Rankings visualization
plt.figure(figsize=(10, 6))
plt.bar(rankings.index, rankings.values)
plt.title('Alternatives Ranked by Net Credibility Score')
plt.xlabel('Alternatives')
plt.ylabel('Net Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()