In [1]:
import pandas as pd

def pick_most_confident_cluster(tables, key='Song'):
    """Pick, for every song, the table whose best cluster probability is highest.

    Parameters
    ----------
    tables : dict[str, pandas.DataFrame]
        Maps a source label (here: the CSV file name) to its probability
        table. Each table has a `key` column plus one column per cluster.
        Dict order is the tie-break priority: when several tables share the
        same best probability, the earliest one wins.
    key : str, default 'Song'
        Name of the identifier column.

    Returns
    -------
    pandas.DataFrame
        Columns `key`, 'Source_File' and 'GMM Cluster Name', one row per row
        of the first table, in that table's order.

    Raises
    ------
    ValueError
        If `tables` is empty, the first table has no cluster columns, or a
        table that is not row-aligned with the first one has duplicated or
        missing identifiers (so it cannot be aligned unambiguously).
    """
    labels = list(tables)
    if not labels:
        raise ValueError("tables must contain at least one probability table")

    reference = tables[labels[0]]
    songs = reference[key].reset_index(drop=True)
    # Cluster columns are taken from the first table; the others must have them too.
    prob_cols = [col for col in reference.columns if col != key]
    if not prob_cols:
        raise ValueError("the first table has no cluster probability columns")

    result = pd.DataFrame({key: songs, 'Source_File': '', 'GMM Cluster Name': ''})
    if songs.empty:
        return result

    def row_aligned(table, label):
        """Return `table`'s cluster columns with rows in the order of `songs`."""
        table_songs = table[key].reset_index(drop=True)
        if table_songs.equals(songs):
            # Same identifiers in the same order: positional use is safe.
            return table[prob_cols].reset_index(drop=True)
        # Row order differs: align on the identifier instead of the position,
        # otherwise probabilities of different songs would be compared.
        if table_songs.duplicated().any():
            raise ValueError(f"{label}: duplicated '{key}' values, cannot align rows")
        missing = songs[~songs.isin(table_songs)]
        if not missing.empty:
            raise ValueError(f"{label}: missing '{key}' values {missing.tolist()[:5]}")
        return table.set_index(key).loc[songs.tolist(), prob_cols].reset_index(drop=True)

    best_value = {}
    best_cluster = {}
    for label in labels:
        scores = row_aligned(tables[label], label)
        best_value[label] = scores.max(axis=1)        # highest probability per song
        best_cluster[label] = scores.idxmax(axis=1)   # cluster holding that probability
    best_value = pd.DataFrame(best_value)
    best_cluster = pd.DataFrame(best_cluster)

    # idxmax returns the first column holding the row maximum, so ties resolve
    # in dict order.
    winner = best_value.idxmax(axis=1)
    result['Source_File'] = winner
    for label in labels:
        won = winner == label
        if won.any():
            result.loc[won, 'GMM Cluster Name'] = best_cluster.loc[won, label]
    return result


# Inside Jupyter __name__ is '__main__', so this section runs as a normal cell;
# the guard only keeps the helper importable without the CSV files on disk.
if __name__ == "__main__":
    # Load the three probability files
    max_prob_df = pd.read_csv('Max_Probability.csv')
    min_prob_df = pd.read_csv('Min_Probability.csv')
    mean_prob_df = pd.read_csv('Mean_Probability.csv')

    # 'Song' is the identifier; every other column is a cluster name.
    # Order matters: on ties Max beats Min, and Min beats Mean.
    result_df = pick_most_confident_cluster({
        'Max_Probability.csv': max_prob_df,
        'Min_Probability.csv': min_prob_df,
        'Mean_Probability.csv': mean_prob_df,
    })

    # Save results to CSV
    result_df.to_csv('Almost_Final_Cluster_Assignments.csv', index=False)

    # Display result
    print(result_df)


               Song           Source_File        GMM Cluster Name
0    mfcc_output_49   Min_Probability.csv       Marathi Bhav Geet
1    mfcc_output_61  Mean_Probability.csv  Indian National Anthem
2    mfcc_output_75  Mean_Probability.csv  Indian National Anthem
3    mfcc_output_74   Max_Probability.csv  Indian National Anthem
4    mfcc_output_60  Mean_Probability.csv       Marathi Bhav Geet
..              ...                   ...                     ...
110  mfcc_output_93   Min_Probability.csv      Asha Bhosale Songs
111  mfcc_output_87   Max_Probability.csv  Indian National Anthem
112  mfcc_output_78   Max_Probability.csv  Indian National Anthem
113  mfcc_output_50   Min_Probability.csv      Kishor Kumar Songs
114  mfcc_output_44   Min_Probability.csv   Michael Jackson Songs

[115 rows x 3 columns]


In [2]:
import pandas as pd

# Read the GMM assignments produced from the max, min and mean features
df1 = pd.read_csv('Output_GMM_max.csv')
df2 = pd.read_csv('Output_GMM_Min.csv')
df3 = pd.read_csv('Output_GMM_Mean.csv')

# Every table must expose both the song identifier and its assigned cluster
required_columns = {'GMM Cluster Name', 'Song'}
keys = ['Song', 'GMM Cluster Name']
tables_ok = all(required_columns.issubset(table.columns) for table in (df1, df2, df3))

if not tables_ok:
    print("All CSV files must have 'GMM Cluster Name' and 'Song' columns.")
else:
    # An inner join on (Song, cluster) keeps only the rows two tables agree on;
    # do it once for each of the three possible pairs.
    df1_df2, df2_df3, df1_df3 = (
        left[keys].merge(right[keys], on=keys, how='inner')
        for left, right in ((df1, df2), (df2, df3), (df1, df3))
    )

    # Stack the pairwise agreements; a row agreed on by all three tables shows
    # up in every pair, so collapse the repeats.
    stacked = pd.concat([df1_df2, df2_df3, df1_df3])
    combined_matches = stacked.drop_duplicates()
    combined_matches.insert(1, 'Source_File', 'Majority_Vote')
    print(combined_matches.head())

    # Number of distinct (Song, cluster) rows backed by at least two tables
    matching_count = combined_matches.shape[0]
    print(f"Number of rows with at least two tables matching on 'GMM Cluster Name' and 'Song': {matching_count}")

    # Persist the majority-vote assignments
    combined_matches.to_csv('Majority_Vote.csv', index=False)


             Song    Source_File        GMM Cluster Name
0  mfcc_output_49  Majority_Vote       Marathi Bhav Geet
1  mfcc_output_61  Majority_Vote  Indian National Anthem
2  mfcc_output_60  Majority_Vote      Asha Bhosale Songs
3  mfcc_output_48  Majority_Vote       Marathi Bhav Geet
4  mfcc_output_76  Majority_Vote       Marathi Bhav Geet
Number of rows with at least two tables matching on 'GMM Cluster Name' and 'Song': 92


In [3]:
import pandas as pd

def apply_majority_vote(assignments, majority, key='Song'):
    """Override cluster assignments with majority-vote labels where available.

    Parameters
    ----------
    assignments : pandas.DataFrame
        Table with `key`, 'Source_File' and 'GMM Cluster Name' columns.
        It is not modified.
    majority : pandas.DataFrame
        Table with `key` and 'GMM Cluster Name' columns. If an identifier
        occurs more than once, its first row is used.
    key : str, default 'Song'
        Name of the identifier column shared by both tables.

    Returns
    -------
    pandas.DataFrame
        Copy of `assignments`. Rows whose identifier occurs in `majority`
        get that table's cluster name and 'Source_File' set to
        'Majority_Vote'; all other rows are unchanged.
    """
    # One label per identifier; keeping the first occurrence makes the lookup unique.
    lookup = majority.drop_duplicates(subset=key).set_index(key)['GMM Cluster Name']

    # isin() would treat NaN as matching NaN, so missing identifiers are
    # excluded explicitly and never count as a match.
    voted = assignments[key].isin(lookup.index) & assignments[key].notna()

    updated = assignments.copy()
    if voted.any():
        # to_numpy() drops the index so the values are written positionally.
        updated.loc[voted, 'GMM Cluster Name'] = assignments.loc[voted, key].map(lookup).to_numpy()
        updated.loc[voted, 'Source_File'] = 'Majority_Vote'
    return updated


# Inside Jupyter __name__ is '__main__', so this section runs as a normal cell;
# the guard only keeps the helper importable without the CSV files on disk.
if __name__ == "__main__":
    # Load the CSV files
    almost_final = pd.read_csv('Almost_Final_Cluster_Assignments.csv')
    majority_vote = pd.read_csv('Majority_Vote.csv')

    # Replace the GMM Cluster Name and Source_File where the Song matches
    almost_final = apply_majority_vote(almost_final, majority_vote)

    # Save the modified DataFrame to a new CSV
    almost_final.to_csv('Final.csv', index=False)

    print("Final.csv has been created.")


Final.csv has been created.


In [4]:
import pandas as pd

# Read back the final assignments written by the previous step
final_df = pd.read_csv('Final.csv')

# Tally how often each source and each cluster name occurs
source_file_counts, gmm_cluster_name_counts = (
    final_df[column].value_counts()
    for column in ('Source_File', 'GMM Cluster Name')
)

# Print each tally under its own heading
for heading, counts in (
    ("\nCounts of Source Files:", source_file_counts),
    ("\nCounts of GMM Cluster Names:", gmm_cluster_name_counts),
):
    print(heading)
    print(counts)



Counts of Source Files:
Source_File
Majority_Vote           92
Min_Probability.csv     12
Max_Probability.csv      8
Mean_Probability.csv     3
Name: count, dtype: int64

Counts of GMM Cluster Names:
GMM Cluster Name
Marathi Bhav Geet         31
Indian National Anthem    24
Marathi Lavni             17
Asha Bhosale Songs        15
Kishor Kumar Songs        15
Michael Jackson Songs     13
Name: count, dtype: int64
