In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


#Text

**SVM**

In [2]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Load the Malayalam dataset
train_data = pd.read_csv("/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Malayalam/ML-AT-test.csv")

# Step 1: Preprocess Malayalam text
def preprocess_text(text):
    # Add more preprocessing steps as needed for Malayalam
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove punctuation
    return text

train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
class_weights = compute_class_weight('balanced', classes=np.unique(train_data['Class Label Short']), y=train_data['Class Label Short'])
class_weight_dict = dict(zip(np.unique(train_data['Class Label Short']), class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract CountVectorizer features from text
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

# Step 6: Train an SVM model
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
svm_model.fit(X_train_cv, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = svm_model.predict(X_test_cv)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'filename' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# Assuming 'File Name' is a column in test_data (adjust if needed)
predictions = test_data[['File Name', 'Predicted_Label']]

# Save to TSV file
predictions.to_csv("CV_SVM_prediction.tsv", sep='\t', index=False)
print("Predictions saved to CV_SVM_prediction.tsv")


Class distribution: Class Label Short
N    406
C    186
P    118
R     91
G     82
Name: count, dtype: int64
Class weights: {'C': 0.9494623655913978, 'G': 2.153658536585366, 'N': 0.4349753694581281, 'P': 1.4966101694915255, 'R': 1.9406593406593406}
Predictions saved to CV_SVM_prediction.tsv


**Logistic Regression**

In [4]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Load the Malayalam dataset
train_data = pd.read_csv("/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Malayalam/ML-AT-test.csv")

# Step 1: Preprocess Malayalam text
def preprocess_text(text):
    # Add more preprocessing steps as needed for Malayalam
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove punctuation
    return text

train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
class_weights = compute_class_weight('balanced', classes=np.unique(train_data['Class Label Short']), y=train_data['Class Label Short'])
class_weight_dict = dict(zip(np.unique(train_data['Class Label Short']), class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract TF-IDF features from text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train an SVM model
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
model.fit(X_train_tfidf, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = svm_model.predict(X_test_tfidf)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'filename' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# Assuming 'File Name' is a column in test_data (adjust if needed)
predictions = test_data[['File Name', 'Predicted_Label']]

# Save to TSV file
predictions.to_csv("Logistic_prediction.tsv", sep='\t', index=False)
print("Predictions saved to Logistic_prediction.tsv")


Class distribution: Class Label Short
N    406
C    186
P    118
R     91
G     82
Name: count, dtype: int64
Class weights: {'C': 0.9494623655913978, 'G': 2.153658536585366, 'N': 0.4349753694581281, 'P': 1.4966101694915255, 'R': 1.9406593406593406}
Predictions saved to Logistic_prediction.tsv


**MLP classifier**

In [5]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier

# Load the Malayalam dataset
train_data = pd.read_csv("/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Malayalam/ML-AT-test.csv")

# Step 1: Preprocess Malayalam text
def preprocess_text(text):
    # Add more preprocessing steps as needed for Malayalam
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove punctuation
    return text

train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
class_weights = compute_class_weight('balanced', classes=np.unique(train_data['Class Label Short']), y=train_data['Class Label Short'])
class_weight_dict = dict(zip(np.unique(train_data['Class Label Short']), class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract TF-IDF features from text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train an MLPClassifier
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=50, random_state=42)
mlp_model.fit(X_train_tfidf, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = mlp_model.predict(X_test_tfidf)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'filename' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# Assuming 'File Name' is a column in test_data (adjust if needed)
predictions = test_data[['File Name', 'Predicted_Label']]

# Save to TSV file
predictions.to_csv("MLP_prediction.tsv", sep='\t', index=False)
print("Predictions saved to MLP_prediction.tsv")


Class distribution: Class Label Short
N    406
C    186
P    118
R     91
G     82
Name: count, dtype: int64
Class weights: {'C': 0.9494623655913978, 'G': 2.153658536585366, 'N': 0.4349753694581281, 'P': 1.4966101694915255, 'R': 1.9406593406593406}
Predictions saved to MLP_prediction.tsv




In [11]:
import pandas as pd

# Load the TSV file
file_path = "/content/ML-AT-test.xlsx - Sheet1.tsv"
df = pd.read_csv(file_path, sep="\t")

# Change column name (e.g., from 'Old_Column_Name' to 'New_Column_Name')
df.rename(columns={"File Name": "File_Name"}, inplace=True)

# Save the updated DataFrame back to the TSV file
output_path = "/content/ML-AT-test.xlsx - Sheet1.tsv"
df.to_csv(output_path, sep="\t", index=False)

print(f"Updated file saved to {output_path}")


Updated file saved to /content/ML-AT-test.xlsx - Sheet1.tsv


**Fusion with weight**

In [13]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
logistic_predictions_path = "/content/Logistic_prediction.tsv"
mlp_predictions_path = "/content/MLP_prediction.tsv"
svm_predictions_path = "/content/CV_SVM_prediction.tsv"  # New file
output_fused_predictions_path ="/content/Text_fusion_weight.tsv"


# Load predictions
logistic_predictions = pd.read_csv(logistic_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")  # Load additional predictions

# Ensure all files have the same filenames in the same order
if not (
    all(logistic_predictions['File_Name'] == mlp_predictions['File_Name']) and
    all(logistic_predictions['File_Name'] == svm_predictions['File_Name'])
):
    raise ValueError("Mismatch in filenames between the prediction files.")

# Majority fusion with different weights
fused_predictions = []
for _, (log_row, mlp_row, svm_row) in enumerate(zip(
    logistic_predictions.itertuples(),
    mlp_predictions.itertuples(),
    svm_predictions.itertuples()
)):
    File_Name = log_row.File_Name
    log_label = log_row.Predicted_Label
    mlp_label = mlp_row.Predicted_Label
    svm_label = svm_row.Predicted_Label

    # Weighted voting
    vote_counter = Counter()
    vote_counter[log_label] += 2 # Weight 1 for Logistic regression
    vote_counter[mlp_label] +=  3 # Weight 2 for MLP
    vote_counter[svm_label] += 1  # Weight 1 for additional model

    # Majority vote
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save fused predictions to a new TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Text_fusion_weight.tsv.


**Fusion without weight**

In [16]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
logistic_predictions_path = "/content/Logistic_prediction.tsv"
mlp_predictions_path = "/content/MLP_prediction.tsv"
svm_predictions_path = "/content/CV_SVM_prediction.tsv"  # New file
output_fused_predictions_path = "/content/Text_fusion.tsv"

# Load predictions
logistic_predictions = pd.read_csv(logistic_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")  # Load additional predictions

# Ensure all files have the same filenames in the same order
if not (
    all(logistic_predictions['File_Name'] == mlp_predictions['File_Name']) and
    all(logistic_predictions['File_Name'] == svm_predictions['File_Name'])
):
    raise ValueError("Mismatch in filenames between the prediction files.")

# Majority fusion without weighting
fused_predictions = []
for _, (log_row, mlp_row, svm_row) in enumerate(zip(
    logistic_predictions.itertuples(),
    mlp_predictions.itertuples(),
    svm_predictions.itertuples()
)):
    File_Name = log_row.File_Name
    log_label = log_row.Predicted_Label
    mlp_label = mlp_row.Predicted_Label
    svm_label = svm_row.Predicted_Label

    # Equal voting
    vote_counter = Counter([log_label, mlp_label, svm_label])

    # Majority vote
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save fused predictions to a new TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Text_fusion.tsv.


#Audio

**Random Forest**

In [18]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Paths (update with actual paths)
train_csv_path = "/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv"
train_audio_dir = "/content/drive/MyDrive/Project/Malayam/audio"
test_audio_dir = "//content/drive/MyDrive/Project/Test/Malayalam/audio"
output_tsv_path = "Random_audio_predictions.tsv"

# Step 1: Load train.csv to map class labels
train_df = pd.read_csv(train_csv_path)
train_df['Class Label Short'] = train_df['Class Label Short'].astype(str)

# Step 2: Preprocessing Malayalam audio (MFCC feature extraction)
def preprocess_audio(file_path, sr=16000):
    try:
        y, original_sr = librosa.load(file_path, sr=None)
        # Resample if necessary
        if original_sr != sr:
            y = librosa.resample(y, orig_sr=original_sr, target_sr=sr)
        # Trim silence from audio
        y, _ = librosa.effects.trim(y)
        # Extract MFCC (Mel Frequency Cepstral Coefficients)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        print(f"MFCC shape for {file_path}: {mfccs.shape}")  # Debugging line
        return np.mean(mfccs, axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

X_train, y_train = [], []
for _, row in train_df.iterrows():
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav")  # Add .wav extension
    if os.path.exists(file_path):
        features = preprocess_audio(file_path)
        if features is not None:
            X_train.append(features)
            y_train.append(row['Class Label Short'])
        else:
            print(f"Skipping file {file_path} due to feature extraction failure.")
    else:
        print(f"File not found: {file_path}")

X_train = np.array(X_train)

# Check if X_train is empty
if X_train.size == 0:
    print("No valid training data found. Please check file paths and data.")
else:
    # Proceed with model training
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Step 3: Compute class weights
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y_train_encoded),
        y=y_train_encoded
    )
    class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
    print(f"Class weights: {class_weight_dict}")

    # Train Random Forest Classifier with class weights
    clf = RandomForestClassifier(class_weight=class_weight_dict)
    clf.fit(X_train, y_train_encoded)
    print("Model trained successfully.")

# Step 4: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]  # Adjust if needed
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is not None:
        predicted_label_encoded = clf.predict([features])
        predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
        test_predictions.append({"File_Name": file_name, "predicted_label": predicted_label})
    else:
        print(f"Skipping file {file_path} due to feature extraction failure.")

# Step 5: Save predictions to a TSV file
test_predictions_df = pd.DataFrame(test_predictions)
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_001.wav: (13, 141)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_002.wav: (13, 252)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_003.wav: (13, 127)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_004.wav: (13, 189)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_005.wav: (13, 127)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_006.wav: (13, 252)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_007.wav: (13, 95)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_008.wav: (13, 127)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_009.wav: (13, 127)
MFCC shape for /content/drive/MyDrive/Project/Malayam/audio/H_ML_001_C_F_044_010.wav: (13, 252)
MFCC shape for /content/drive/MyDrive/Pro

**MLP classifier**

In [19]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Paths (update with actual paths)
train_csv_path = "/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv"
train_audio_dir = "/content/drive/MyDrive/Project/Malayam/audio"  # Corrected path
test_audio_dir = "/content/drive/MyDrive/Project/Test/Malayalam/audio"  # Corrected path
output_tsv_path = "MLP_audio_predictions.tsv"  # Change path as needed

# Step 1: Load train.csv to map class labels
train_df = pd.read_csv(train_csv_path)
train_df['Class Label Short'] = train_df['Class Label Short'].astype(str)

# Step 2: Preprocessing Malayalam audio (MFCC feature extraction)
def preprocess_audio(file_path, sr=16000):
    try:
        y, original_sr = librosa.load(file_path, sr=None)
        # Resample if necessary
        if original_sr != sr:
            y = librosa.resample(y, orig_sr=original_sr, target_sr=sr)
        # Trim silence from audio
        y, _ = librosa.effects.trim(y)
        # Extract MFCC (Mel Frequency Cepstral Coefficients)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        return np.mean(mfccs, axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Step 3: Extract features and labels for training
X_train, y_train = [], []
for _, row in train_df.iterrows():
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav" )  # Add .wav extension
    if os.path.exists(file_path):
        features = preprocess_audio(file_path)
        if features is not None:
            X_train.append(features)
            y_train.append(row['Class Label Short'])
        else:
            print(f"Skipping file {file_path} due to feature extraction failure.")
    else:
        print(f"File not found: {file_path}")

X_train = np.array(X_train)

# Check if X_train is empty
if X_train.size == 0:
    print("No valid training data found. Please check file paths and data.")
else:
    # Proceed with model training
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Step 4: Compute class weights
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y_train_encoded),
        y=y_train_encoded
    )
    class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
    print(f"Class weights: {class_weight_dict}")

    # Step 5: Train MLPClassifier
    clf = MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        max_iter=200,
        random_state=42
    )
    clf.fit(X_train, y_train_encoded)
    print("MLPClassifier trained successfully.")

# Step 6: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is not None:
        predicted_label_encoded = clf.predict([features])
        predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
        test_predictions.append({"File_Name": file_name, "predicted_label": predicted_label})
    else:
        print(f"Skipping file {file_path} due to feature extraction failure.")

# Step 7: Save predictions to a TSV file
test_predictions_df = pd.DataFrame(test_predictions)
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


Class weights: {0: 0.9494623655913978, 1: 2.153658536585366, 2: 0.4349753694581281, 3: 1.4966101694915255, 4: 1.9406593406593406}




MLPClassifier trained successfully.
Predictions saved to MLP_audio_predictions.tsv.


**SVM**

In [20]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Paths (update with actual paths)
train_csv_path = "/content/drive/MyDrive/Project/Malayam/ML-AT-train.csv"
train_audio_dir = "/content/drive/MyDrive/Project/Malayam/audio"  # Corrected path
test_audio_dir = "/content/drive/MyDrive/Project/Test/Malayalam/audio"  # Corrected path
output_tsv_path = "SVM_audio_predictions.tsv"  # Change path as needed

# Step 1: Load train.csv to map class labels
train_df = pd.read_csv(train_csv_path)
train_df['Class Label Short'] = train_df['Class Label Short'].astype(str)

# Step 2: Preprocessing Malayalam audio (MFCC feature extraction)
def preprocess_audio(file_path, sr=16000):
    try:
        y, original_sr = librosa.load(file_path, sr=None)
        # Resample if necessary
        if original_sr != sr:
            y = librosa.resample(y, orig_sr=original_sr, target_sr=sr)
        # Trim silence from audio
        y, _ = librosa.effects.trim(y)
        # Extract MFCC (Mel Frequency Cepstral Coefficients)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        return np.mean(mfccs, axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Step 3: Extract features and labels for training
X_train, y_train = [], []
for _, row in train_df.iterrows():
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav")  # Add .wav extension
    if os.path.exists(file_path):
        features = preprocess_audio(file_path)
        if features is not None:
            X_train.append(features)
            y_train.append(row['Class Label Short'])
        else:
            print(f"Skipping file {file_path} due to feature extraction failure.")
    else:
        print(f"File not found: {file_path}")

X_train = np.array(X_train)

# Check if X_train is empty
if X_train.size == 0:
    print("No valid training data found. Please check file paths and data.")
else:
    # Proceed with model training
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Step 4: Compute class weights
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y_train_encoded),
        y=y_train_encoded
    )
    class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
    print(f"Class weights: {class_weight_dict}")

    # Step 5: Train SVM Classifier
    clf = SVC(
        kernel='rbf',  # Radial Basis Function kernel
        C=1.0,  # Regularization parameter
        class_weight=class_weight_dict,
        probability=True  # Enable probability estimates
    )
    clf.fit(X_train, y_train_encoded)
    print("SVM trained successfully.")

# Step 6: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is not None:
        predicted_label_encoded = clf.predict([features])
        predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
        test_predictions.append({"File_Name": file_name, "predicted_label": predicted_label})
    else:
        print(f"Skipping file {file_path} due to feature extraction failure.")

# Step 7: Save predictions to a TSV file
test_predictions_df = pd.DataFrame(test_predictions)
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


Class weights: {0: 0.9494623655913978, 1: 2.153658536585366, 2: 0.4349753694581281, 3: 1.4966101694915255, 4: 1.9406593406593406}
SVM trained successfully.
Predictions saved to SVM_audio_predictions.tsv.


In [23]:
import pandas as pd

# Load the TSV file
file_path = "/content/Random_audio_predictions.tsv"
df = pd.read_csv(file_path, sep="\t")

# Remove the '.wav' extension from the 'filename' column
df['File_Name'] = df['File_Name'].str.replace('.wav', '', regex=False)

# Save the updated DataFrame back to a new TSV file
output_path = "/content/Random_audio_predictions.tsv"
df.to_csv(output_path, sep="\t", index=False)

print(f"Updated file saved to {output_path}")


Updated file saved to /content/Random_audio_predictions.tsv


**Fusion with weight**

In [34]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
svm_predictions_path = "/content/SVM_audio_predictions.tsv"
mlp_predictions_path = "/content/MLP_audio_predictions.tsv"
rf_predictions_path = "/content/Random_audio_predictions.tsv"  # New file
output_fused_predictions_path ="/content/Audio_fusion_weight.tsv"


# Load predictions
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
rf_predictions = pd.read_csv(rf_predictions_path, sep="\t")  # Load additional predictions

# Ensure all files have the same filenames in the same order
if not (
    all(svm_predictions['File_Name'] == mlp_predictions['File_Name']) and
    all(svm_predictions['File_Name'] == rf_predictions['File_Name'])
):
    raise ValueError("Mismatch in filenames between the prediction files.")

# Majority fusion with different weights
fused_predictions = []
for _, (svm_row, mlp_row, rf_row) in enumerate(zip(
    svm_predictions.itertuples(),
    mlp_predictions.itertuples(),
    rf_predictions.itertuples()
)):
    File_Name = svm_row.File_Name
    svm_label = svm_row.predicted_label
    mlp_label = mlp_row.predicted_label
    rf_label = rf_row.predicted_label  # Additional model prediction

    # Weighted voting
    vote_counter = Counter()
    vote_counter[svm_label] += 1
    vote_counter[mlp_label] += 3
    vote_counter[rf_label] += 2

    # Majority vote
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save fused predictions to a new TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Audio_fusion_weight.tsv.


**Fusion without weight**

In [27]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
svm_predictions_path = "/content/SVM_audio_predictions.tsv"
mlp_predictions_path = "/content/MLP_audio_predictions.tsv"
rf_predictions_path = "/content/Random_audio_predictions.tsv"  # New file
output_fused_predictions_path = "/content/Fused_prediction_Audio.tsv"

# Load predictions
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
rf_predictions = pd.read_csv(rf_predictions_path, sep="\t")  # Load additional predictions

# Ensure all files have the same filenames in the same order
if not (
    all(svm_predictions['File_Name'] == mlp_predictions['File_Name']) and
    all(svm_predictions['File_Name'] == rf_predictions['File_Name'])
):
    raise ValueError("Mismatch in filenames between the prediction files.")

# Majority fusion without weighting
fused_predictions = []
for _, (svm_row, mlp_row, rf_row) in enumerate(zip(
    svm_predictions.itertuples(),
    mlp_predictions.itertuples(),
    rf_predictions.itertuples()
)):
    File_Name = svm_row.File_Name
    svm_label = svm_row.predicted_label
    mlp_label = mlp_row.predicted_label
    rf_label = rf_row.predicted_label

    # Equal voting
    vote_counter = Counter([svm_label, mlp_label, rf_label])

    # Majority vote
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save fused predictions to a new TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Fused_prediction_Audio.tsv.


In [36]:
# Convert filename column to uppercase
fused_predictions_df['File_Name'] = fused_predictions_df['File_Name'].str.upper()

# Save the updated file
output_fused_file = "/content/SSNCSE_Malayalam_Run1.tsv"
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Updated fused predictions saved to {output_fused_file}.")


Updated fused predictions saved to /content/SSNCSE_Malayalam_Run1.tsv.


#Final (Text+Audio)

**Fusion with weight**

In [35]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
text_predictions_path = "/content/Text_fusion_weight.tsv"
audio_predictions_path = "/content/Audio_fusion_weight.tsv" # New file

# Load predictions
text_predictions = pd.read_csv(text_predictions_path, sep="\t")
audio_predictions = pd.read_csv(audio_predictions_path, sep="\t")


# Trim whitespace and ensure lowercase filenames for consistency
text_predictions['File_Name'] = text_predictions['File_Name'].str.strip().str.lower()
audio_predictions['File_Name'] = audio_predictions['File_Name'].str.strip().str.lower()

# Get unique filenames from both files
text_filenames = set(text_predictions['File_Name'])
audio_filenames = set(audio_predictions['File_Name'])

# Find mismatches
only_in_text = text_filenames - audio_filenames
only_in_audio = audio_filenames - text_filenames

# Print differences
print(f"Filenames only in text predictions: {only_in_text}")
print(f"Filenames only in audio predictions: {only_in_audio}")

# Keep only common filenames
common_filenames = text_filenames & audio_filenames

# Filter both DataFrames to keep only common filenames
text_predictions = text_predictions[text_predictions['File_Name'].isin(common_filenames)]
audio_predictions = audio_predictions[audio_predictions['File_Name'].isin(common_filenames)]

# Sort and reset index to align both files properly
text_predictions = text_predictions.sort_values(by='File_Name').reset_index(drop=True)
audio_predictions = audio_predictions.sort_values(by='File_Name').reset_index(drop=True)

# Define weights (adjust as needed)
weight_text = 1
weight_audio = 2

fused_predictions = []
for _, (text_row, audio_row) in enumerate(zip(text_predictions.itertuples(), audio_predictions.itertuples())):
    File_Name = text_row.File_Name
    text_label = text_row.fused_label
    audio_label = audio_row.fused_label

    # Weighted vote counting
    vote_counter = Counter()
    vote_counter[text_label] += weight_text
    vote_counter[audio_label] += weight_audio

    # Select the label with the highest weight
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save to TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
output_fused_file = "/content/SSNCSE_Malayalam_Run1.tsv"
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Final fused predictions saved to {output_fused_file}.")




Filenames only in text predictions: set()
Filenames only in audio predictions: set()
Final fused predictions saved to /content/SSNCSE_Malayalam_Run1.tsv.


In [37]:
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the true labels from the Excel file (Sheet1)
true_labels_file = "/content/ML-AT-test.xlsx - Sheet1.tsv"
true_labels_data = pd.read_csv(true_labels_file, sep='\t')

# Load the predicted labels from the TSV file
predicted_labels_file = "/content/SSNCSE_MAlayalam_Run1.tsv"
predicted_data = pd.read_csv(predicted_labels_file, sep='\t')

# Ensure both datasets have the same 'File Name' column for alignment
# Remove leading/trailing spaces from all column names
true_labels_data.columns = true_labels_data.columns.str.strip()

# Now merge the datasets using the cleaned column names
merged_data = pd.merge(true_labels_data[['File_Name', 'Class Label']],
                       predicted_data[['File_Name', 'fused_label']],
                       on='File_Name')

# Extract true labels and predicted labels
y_true = merged_data['Class Label']
y_pred = merged_data['fused_label']

# Continue with the rest of your code...
# Encode the true and predicted labels (if needed)
label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_true)
y_pred_encoded = label_encoder.transform(y_pred)

# Generate the classification report
print(classification_report(y_true_encoded, y_pred_encoded, target_names=label_encoder.classes_))



              precision    recall  f1-score   support

           C       0.64      0.70      0.67        10
           G       0.55      0.60      0.57        10
           N       0.53      1.00      0.69        10
           P       0.50      0.30      0.38        10
           R       0.67      0.20      0.31        10

    accuracy                           0.56        50
   macro avg       0.57      0.56      0.52        50
weighted avg       0.57      0.56      0.52        50



**Fusion without weight**

In [30]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
text_predictions_path = "/content/Text_fusion.tsv"
audio_predictions_path = "/content/Fused_prediction_Audio.tsv"  # New file

# Load predictions
text_predictions = pd.read_csv(text_predictions_path, sep="\t")
audio_predictions = pd.read_csv(audio_predictions_path, sep="\t")

# Trim whitespace and ensure lowercase filenames for consistency
text_predictions['File_Name'] = text_predictions['File_Name'].str.strip().str.lower()
audio_predictions['File_Name'] = audio_predictions['File_Name'].str.strip().str.lower()

# Get unique filenames from both files
text_filenames = set(text_predictions['File_Name'])
audio_filenames = set(audio_predictions['File_Name'])

# Find mismatches
only_in_text = text_filenames - audio_filenames
only_in_audio = audio_filenames - text_filenames

# Print differences
print(f"Filenames only in text predictions: {only_in_text}")
print(f"Filenames only in audio predictions: {only_in_audio}")

# Keep only common filenames
common_filenames = text_filenames & audio_filenames

# Filter both DataFrames to keep only common filenames
text_predictions = text_predictions[text_predictions['File_Name'].isin(common_filenames)]
audio_predictions = audio_predictions[audio_predictions['File_Name'].isin(common_filenames)]

# Sort and reset index to align both files properly
text_predictions = text_predictions.sort_values(by='File_Name').reset_index(drop=True)
audio_predictions = audio_predictions.sort_values(by='File_Name').reset_index(drop=True)

# Majority fusion without weighting
fused_predictions = []
for _, (text_row, audio_row) in enumerate(zip(text_predictions.itertuples(), audio_predictions.itertuples())):
    File_Name = text_row.File_Name
    text_label = text_row.fused_label
    audio_label = audio_row.fused_label

    # Equal voting
    vote_counter = Counter([text_label, audio_label])

    # Select the label with the highest count
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": File_Name, "fused_label": fused_label})

# Save to TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
output_fused_file = "/content/SSNCSE_MAlayalam_Run2.tsv"
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Final fused predictions saved to {output_fused_file}.")


Filenames only in text predictions: set()
Filenames only in audio predictions: set()
Final fused predictions saved to /content/SSNCSE_Malayalam_Run2.tsv.


In [33]:
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the true labels from the Excel file (Sheet1)
true_labels_file = "/content/ML-AT-test.xlsx - Sheet1.tsv"
true_labels_data = pd.read_csv(true_labels_file, sep='\t')

# Load the predicted labels from the TSV file
predicted_labels_file = "/content/SSNCSE_MAlayalam_Run2.tsv"
predicted_data = pd.read_csv(predicted_labels_file, sep='\t')

# Ensure both datasets have the same 'File Name' column for alignment
# Remove leading/trailing spaces from all column names
true_labels_data.columns = true_labels_data.columns.str.strip()

# Now merge the datasets using the cleaned column names
merged_data = pd.merge(true_labels_data[['File_Name', 'Class Label']],
                       predicted_data[['File_Name', 'fused_label']],
                       on='File_Name')

# Extract true labels and predicted labels
y_true = merged_data['Class Label']
y_pred = merged_data['fused_label']

# Continue with the rest of your code...
# Encode the true and predicted labels (if needed)
label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_true)
y_pred_encoded = label_encoder.transform(y_pred)

# Generate the classification report
print(classification_report(y_true_encoded, y_pred_encoded, target_names=label_encoder.classes_))



              precision    recall  f1-score   support

           C       0.32      1.00      0.49        10
           G       0.67      0.20      0.31        10
           N       0.73      0.80      0.76        10
           P       0.00      0.00      0.00        10
           R       0.25      0.10      0.14        10

    accuracy                           0.42        50
   macro avg       0.39      0.42      0.34        50
weighted avg       0.39      0.42      0.34        50

