In [1]:
import pandas as pd

# Load the three probability files
max_prob_df = pd.read_csv('Max_Probability.csv')
min_prob_df = pd.read_csv('Min_Probability.csv')
mean_prob_df = pd.read_csv('Mean_Probability.csv')

# 'Song' identifies a row; every column whose name contains 'Cluster' is treated
# as a per-cluster probability column.
# NOTE(review): names such as 'Cluster_0', 'Cluster_1', ... are assumed, not
# checked - confirm against the CSV headers.
song_names = max_prob_df['Song']
prob_cols = [col for col in max_prob_df.columns if 'Cluster' in col]
if not prob_cols:
    raise ValueError("Max_Probability.csv has no column whose name contains 'Cluster'.")


def _aligned_probabilities(prob_df, file_name):
    """Return the probability columns of ``prob_df`` with one row per entry of ``song_names``.

    When the file lists the songs in exactly the reference order, the columns are
    returned as they are. Otherwise rows are matched on 'Song' (first occurrence
    per song) instead of on row position, so a differently ordered file cannot
    pair one song's probabilities with another song's name.

    Raises:
        ValueError: if a song of the reference file is absent from ``prob_df``.
    """
    if prob_df['Song'].equals(song_names):
        return prob_df[prob_cols]
    by_song = prob_df.drop_duplicates(subset='Song').set_index('Song')[prob_cols]
    missing = song_names[~song_names.isin(by_song.index)]
    if not missing.empty:
        raise ValueError(f"{file_name} has no probabilities for: {missing.tolist()}")
    return by_song.reindex(song_names).reset_index(drop=True)


def _cluster_label(column_name):
    """Return the trailing digits of a column name ('Cluster_12' -> '12').

    Taking only the last character would truncate ids of ten and above.
    A name without trailing digits falls back to its last character.
    """
    stem = column_name.rstrip('0123456789')
    return column_name[len(stem):] or column_name[-1]


# Listed in tie-break order: when several files reach the same top probability
# for a song, the earliest entry wins (Max, then Min, then Mean).
prob_sources = {
    'Max_Probability.csv': max_prob_df,
    'Min_Probability.csv': min_prob_df,
    'Mean_Probability.csv': mean_prob_df,
}
aligned_probs = {
    file_name: _aligned_probabilities(prob_df, file_name)
    for file_name, prob_df in prob_sources.items()
}

# Per song and per file: the highest probability and the column it came from
top_prob = pd.DataFrame({name: probs.max(axis=1) for name, probs in aligned_probs.items()})
top_col = pd.DataFrame({name: probs.idxmax(axis=1) for name, probs in aligned_probs.items()})

# idxmax returns the first column holding the row maximum, which gives the
# tie-break order described above.
winning_file = top_prob.idxmax(axis=1)

result_df = pd.DataFrame({
    'Song': song_names,
    'Source_File': winning_file,
    'GMM Cluster': [
        _cluster_label(top_col.at[row, file_name])
        for row, file_name in winning_file.items()
    ],
})

# Save results to CSV; index=False keeps the positional index out of the file so
# it does not come back as an 'Unnamed: 0' column when the file is read again.
result_df.to_csv('Almost_Final_Cluster_Assignments.csv', index=False)

# Display result
print(result_df)


               Song           Source_File GMM Cluster
0    mfcc_output_49   Max_Probability.csv           5
1    mfcc_output_61   Max_Probability.csv           2
2    mfcc_output_75  Mean_Probability.csv           2
3    mfcc_output_74   Max_Probability.csv           2
4    mfcc_output_60  Mean_Probability.csv           1
..              ...                   ...         ...
110  mfcc_output_93  Mean_Probability.csv           5
111  mfcc_output_87   Max_Probability.csv           2
112  mfcc_output_78   Max_Probability.csv           2
113  mfcc_output_50   Min_Probability.csv           1
114  mfcc_output_44   Min_Probability.csv           3

[115 rows x 3 columns]


In [21]:
import pandas as pd

# Load the three CSV files
df1 = pd.read_csv('Output_GMM_max.csv')
df2 = pd.read_csv('Output_GMM_Min.csv')
df3 = pd.read_csv('Output_GMM_Mean.csv')

# Every file must provide the 'GMM Cluster' and 'Song' columns. Fail loudly:
# merely printing a message would leave any previously written
# 'Majority_Vote.csv' in place for later cells to pick up.
required_columns = {'GMM Cluster', 'Song'}
if not all(required_columns.issubset(df.columns) for df in (df1, df2, df3)):
    raise ValueError("All CSV files must have 'GMM Cluster' and 'Song' columns.")

# NOTE(review): this compares cluster ids across the three files as if they
# shared one labelling - confirm that the ids are comparable between them.
key_columns = ['Song', 'GMM Cluster']


def _agreeing_rows(left, right):
    """Return the (Song, GMM Cluster) pairs that occur in both ``left`` and ``right``."""
    return pd.merge(left[key_columns], right[key_columns], on=key_columns, how='inner')


# Pairwise agreement between the tables
df1_df2 = _agreeing_rows(df1, df2)
df2_df3 = _agreeing_rows(df2, df3)
df1_df3 = _agreeing_rows(df1, df3)

# A pair backed by at least two tables shows up in at least one pairwise match;
# drop the repeats and renumber so the index is a clean 0..n-1 range.
combined_matches = (
    pd.concat([df1_df2, df2_df3, df1_df3])
    .drop_duplicates()
    .reset_index(drop=True)
)
combined_matches.insert(1, 'Source_File', 'Majority_Vote')
print(combined_matches.head())

# Count the number of unique matching rows across at least two tables
matching_count = combined_matches.shape[0]
print(f"Number of rows with at least two tables matching on 'GMM Cluster' and 'Song': {matching_count}")

# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed column.
combined_matches.to_csv('Majority_Vote.csv', index=False)


             Song    Source_File  GMM Cluster
0  mfcc_output_49  Majority_Vote            5
1  mfcc_output_61  Majority_Vote            2
2  mfcc_output_63  Majority_Vote            4
3  mfcc_output_73  Majority_Vote            0
4  mfcc_output_59  Majority_Vote            1
Number of rows with at least two tables matching on 'GMM Cluster' and 'Song': 88


In [10]:
import pandas as pd

# Load the per-song assignments and the majority-vote overrides
almost_final = pd.read_csv('Almost_Final_Cluster_Assignments.csv')
majority_vote = pd.read_csv('Majority_Vote.csv')

# A DataFrame index saved to CSV is read back as an 'Unnamed: N' column; keep
# such columns out of the final file.
almost_final = almost_final.loc[:, ~almost_final.columns.str.startswith('Unnamed:')].copy()

# One vote per song; if a song is listed more than once the first row is used.
vote_by_song = majority_vote.drop_duplicates(subset='Song').set_index('Song')['GMM Cluster']

# Replace the GMM Cluster and Source_File where the Song has a majority vote
has_vote = almost_final['Song'].isin(vote_by_song.index)
almost_final.loc[has_vote, 'GMM Cluster'] = almost_final.loc[has_vote, 'Song'].map(vote_by_song)
almost_final.loc[has_vote, 'Source_File'] = 'Majority_Vote'

# Save under the lower-case name that the message below and the cell that
# reloads the file both use; a differently cased name is a different file on
# case-sensitive filesystems.
almost_final.to_csv('final.csv', index=False)

print("final.csv has been created.")


final.csv has been created.


In [12]:
import pandas as pd

# Read the consolidated assignments back from disk
final_df = pd.read_csv('final.csv')

# Tally how often each value occurs in the two columns of interest
source_file_counts, gmm_cluster_counts = (
    final_df[column].value_counts() for column in ('Source_File', 'GMM Cluster')
)

# Report each tally under its own heading
for heading, counts in (
    ("\nCounts of Source Files:", source_file_counts),
    ("\nCounts of GMM Clusters:", gmm_cluster_counts),
):
    print(heading)
    print(counts)


Counts of Source Files:
Source_File
Majority_Vote           88
Max_Probability.csv     10
Mean_Probability.csv     9
Min_Probability.csv      8
Name: count, dtype: int64

Counts of GMM Clusters:
GMM Cluster
5    28
2    25
0    24
4    22
3    12
1     4
Name: count, dtype: int64
