In [1]:
from google.colab import drive
# Mount Google Drive so the dataset paths under /content/drive/ that the later
# cells read from are available in this Colab session.
drive.mount('/content/drive/')

Mounted at /content/drive/


# Text

**Logistic Regression**

In [8]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Words dropped from every transcript by preprocess_text_tamil (extend as needed)
tamil_stopwords = {
    "அது", "இந்த", "இது", "உங்கள்", "அவள்",
    "ஆனால்", "என்ற", "இருந்து", "தான்", "எழுதி",
    "ஒரு", "உள்ள", "என்றும்", "உடன்", "அந்த",
    "என்பது", "இயற்கை", "வாங்க", "அல்லது", "இடையே",
}

# Tamil dataset on the mounted Drive (adjust the paths if the layout differs)
# Training split: transcripts together with their class labels
train_data = pd.read_csv("/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv")
# Test split: transcripts only
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Tamil/TA-AT-test.csv")

# Step 1: Preprocess Tamil text
def preprocess_text_tamil(text):
    """Reduce one transcript to Tamil-script words with the stopwords removed.

    Args:
        text: Raw transcript. Any value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell, or ``None``) is
            treated as an empty transcript.

    Returns:
        str: The remaining words joined by single spaces, or ``""`` when
        nothing is left.
    """
    # An empty CSV cell arrives as NaN (a float) and has no .lower(); map it to
    # an empty transcript so Series.apply does not abort on a single bad row.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Keep only characters in the Tamil Unicode block (U+0B80-U+0BFF) and
    # whitespace. Latin letters, ASCII digits and punctuation are dropped;
    # Tamil digits and signs live in the same block and are therefore kept.
    text = ''.join(char for char in text if '\u0B80' <= char <= '\u0BFF' or char.isspace())
    # Remove stopwords; split() also collapses runs of whitespace
    text = ' '.join(word for word in text.split() if word not in tamil_stopwords)
    return text

# Clean the transcripts of both splits with the Tamil preprocessing defined above
train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text_tamil)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text_tamil)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
# These are printed for inspection only; the model below requests the same
# 'balanced' weighting from scikit-learn itself via class_weight='balanced'.
class_labels = np.unique(train_data['Class Label Short'])
class_weights = compute_class_weight('balanced', classes=class_labels, y=train_data['Class Label Short'])
class_weight_dict = dict(zip(class_labels, class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract character 1-/2-gram count features; the vocabulary is learnt
# on the training split only and then reused for the test split
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 6: Train a class-balanced Logistic Regression model
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
model.fit(X_train_vec, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = model.predict(X_test_vec)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'File_Name' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# The fusion cells read the column as 'File_Name', so rename the CSV's
# 'File Name' column here instead of relying on a manual rename afterwards.
predictions = test_data[['File Name', 'Predicted_Label']].rename(columns={'File Name': 'File_Name'})

# Save to TSV file and report the name that was actually written
logistic_output_path = "CV_Logistic_prediction.tsv"
predictions.to_csv(logistic_output_path, sep='\t', index=False)
print(f"Predictions saved to {logistic_output_path}")


Class distribution: Class Label Short
N    287
G     68
C     65
R     61
P     33
Name: count, dtype: int64
Class weights: {'C': 1.5815384615384616, 'G': 1.511764705882353, 'N': 0.3581881533101045, 'P': 3.1151515151515152, 'R': 1.6852459016393442}
Predictions saved to CountVectorizer_Logistic_prediction.tsv


**SVM**

In [13]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Training transcripts with labels, and test transcripts without labels
# (both live on the mounted Drive; adjust the paths if needed)
train_data = pd.read_csv("/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Tamil/TA-AT-test.csv")

# Tamil stopword set consulted by preprocess_text_tamil (expand as necessary)
tamil_stopwords = {
    "அது", "இந்த", "இது", "உங்கள்",
    "அவள்", "ஆனால்", "என்ற", "இருந்து",
    "தான்", "எழுதி", "ஒரு", "உள்ள",
    "என்றும்", "உடன்", "அந்த", "என்பது",
    "இயற்கை", "வாங்க", "அல்லது", "இடையே",
}

# Step 1: Preprocess Tamil text
def preprocess_text_tamil(text):
    """Reduce one transcript to Tamil-script words with the stopwords removed.

    Args:
        text: Raw transcript. Any value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell, or ``None``) is
            treated as an empty transcript.

    Returns:
        str: The remaining words joined by single spaces, or ``""`` when
        nothing is left.
    """
    # An empty CSV cell arrives as NaN (a float) and has no .lower(); map it to
    # an empty transcript so Series.apply does not abort on a single bad row.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Keep only characters in the Tamil Unicode block (U+0B80-U+0BFF) and
    # whitespace. Latin letters, ASCII digits and punctuation are dropped;
    # Tamil digits and signs live in the same block and are therefore kept.
    text = ''.join(char for char in text if '\u0B80' <= char <= '\u0BFF' or char.isspace())
    # Remove stopwords; split() also collapses runs of whitespace
    text = ' '.join(word for word in text.split() if word not in tamil_stopwords)
    return text

# Clean the transcripts of both splits with the Tamil preprocessing defined above
train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text_tamil)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text_tamil)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
# These are printed for inspection only; the SVC below requests the same
# 'balanced' weighting from scikit-learn itself via class_weight='balanced'.
class_labels = np.unique(train_data['Class Label Short'])
class_weights = compute_class_weight('balanced', classes=class_labels, y=train_data['Class Label Short'])
class_weight_dict = dict(zip(class_labels, class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract character 1-/2-gram TF-IDF features; the vocabulary is learnt
# on the training split only and then reused for the test split
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train a class-balanced Support Vector Machine
# NOTE(review): probability=True is only needed for predict_proba, which this
# cell never calls, and it makes fit() slower - drop it if nothing else needs it.
model = SVC(kernel='sigmoid', class_weight='balanced', probability=True, random_state=42)
model.fit(X_train_tfidf, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = model.predict(X_test_tfidf)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'File_Name' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# The fusion cells read the column as 'File_Name', so rename the CSV's
# 'File Name' column here; a later rename of this file then has nothing to do.
predictions = test_data[['File Name', 'Predicted_Label']].rename(columns={'File Name': 'File_Name'})

# Save to TSV file
predictions.to_csv("SVM_prediction.tsv", sep='\t', index=False)
print("Predictions saved to SVM_prediction.tsv")


Class distribution: Class Label Short
N    287
G     68
C     65
R     61
P     33
Name: count, dtype: int64
Class weights: {'C': 1.5815384615384616, 'G': 1.511764705882353, 'N': 0.3581881533101045, 'P': 3.1151515151515152, 'R': 1.6852459016393442}
Predictions saved to SVM_prediction.tsv


**MLP Classifier**

In [9]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier

# Stopwords stripped from the transcripts (example list - expand as necessary)
_STOPWORD_LIST = [
    "அது", "இந்த", "இது", "உங்கள்", "அவள்", "ஆனால்", "என்ற", "இருந்து", "தான்", "எழுதி",
    "ஒரு", "உள்ள", "என்றும்", "உடன்", "அந்த", "என்பது", "இயற்கை", "வாங்க", "அல்லது", "இடையே",
]
tamil_stopwords = set(_STOPWORD_LIST)

# Read the Tamil dataset from the mounted Drive (adjust the file paths as needed)
# - training split: text and labels
# - test split: text only
train_data = pd.read_csv("/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Project/Test/Tamil/TA-AT-test.csv")

# Step 1: Preprocess Tamil text
def preprocess_text_tamil(text):
    """Reduce one transcript to Tamil-script words with the stopwords removed.

    Args:
        text: Raw transcript. Any value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell, or ``None``) is
            treated as an empty transcript.

    Returns:
        str: The remaining words joined by single spaces, or ``""`` when
        nothing is left.
    """
    # An empty CSV cell arrives as NaN (a float) and has no .lower(); map it to
    # an empty transcript so Series.apply does not abort on a single bad row.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Keep only characters in the Tamil Unicode block (U+0B80-U+0BFF) and
    # whitespace. Latin letters, ASCII digits and punctuation are dropped;
    # Tamil digits and signs live in the same block and are therefore kept.
    text = ''.join(char for char in text if '\u0B80' <= char <= '\u0BFF' or char.isspace())
    # Remove stopwords; split() also collapses runs of whitespace
    text = ' '.join(word for word in text.split() if word not in tamil_stopwords)
    return text

# Clean the transcripts of both splits with the Tamil preprocessing defined above
train_data['Transcript'] = train_data['Transcript'].apply(preprocess_text_tamil)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_text_tamil)

# Step 2: Check class distribution in training data
class_counts = train_data['Class Label Short'].value_counts()
print("Class distribution:", class_counts)

# Step 3: Compute class weights for imbalanced dataset
# NOTE(review): these weights are only printed. Nothing below passes them to
# the MLPClassifier, so this model is trained without any class re-weighting.
class_labels = np.unique(train_data['Class Label Short'])
class_weights = compute_class_weight('balanced', classes=class_labels, y=train_data['Class Label Short'])
class_weight_dict = dict(zip(class_labels, class_weights))
print("Class weights:", class_weight_dict)

# Step 4: Prepare the text data (transcripts) and labels
X_train = train_data['Transcript']
y_train = train_data['Class Label Short']
X_test = test_data['Transcript']

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)

# Step 5: Extract character 1-/2-gram TF-IDF features; the vocabulary is learnt
# on the training split only and then reused for the test split
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char', max_df=0.9, min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train an MLPClassifier (two hidden layers, at most 50 training iterations)
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=50, random_state=42)
mlp_model.fit(X_train_tfidf, y_train_numeric)

# Step 7: Predict class labels for test data
y_test_pred = mlp_model.predict(X_test_tfidf)

# Convert numeric predictions back to string labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Step 8: Save predictions in a TSV file with 'File_Name' and 'Predicted_Label' columns
test_data['Predicted_Label'] = y_test_pred_labels

# The fusion cells read the column as 'File_Name', so rename the CSV's
# 'File Name' column here instead of relying on a manual rename afterwards.
predictions = test_data[['File Name', 'Predicted_Label']].rename(columns={'File Name': 'File_Name'})

# Save to TSV file
predictions.to_csv("MLP_Prediction.tsv", sep='\t', index=False)
print("Predictions saved to MLP_Prediction.tsv")


Class distribution: Class Label Short
N    287
G     68
C     65
R     61
P     33
Name: count, dtype: int64
Class weights: {'C': 1.5815384615384616, 'G': 1.511764705882353, 'N': 0.3581881533101045, 'P': 3.1151515151515152, 'R': 1.6852459016393442}
Predictions saved to MLP_Prediction.tsv




In [20]:
import pandas as pd

# The SVM prediction file is read and then overwritten at the same location
file_path = "/content/SVM_prediction.tsv"
output_path = "/content/SVM_prediction.tsv"

# Read the TSV and give the file-name column the spelling the fusion cells use
# ('File Name' -> 'File_Name'); other columns are left as they are
df = pd.read_csv(file_path, sep="\t").rename(columns={"File Name": "File_Name"})

# Write the renamed table back
df.to_csv(output_path, sep="\t", index=False)

print(f"Updated file saved to {output_path}")


Updated file saved to /content/SVM_prediction.tsv


**Fusion with weight**

In [65]:
import pandas as pd
from collections import Counter

# File paths (update if necessary)
logistic_predictions_path = "/content/CV_Logistic_prediction.tsv"
mlp_predictions_path = "/content/MLP_Prediction.tsv"
svm_predictions_path = "/content/SVM_prediction.tsv"
output_fused_predictions_path = "/content/Text_fusion_weight.tsv"

# Vote weight of each text model. These are the values the loop has always
# applied; the previous inline comments described different numbers.
LOGISTIC_VOTE_WEIGHT = 1
MLP_VOTE_WEIGHT = 3
SVM_VOTE_WEIGHT = 2

# Load predictions
logistic_predictions = pd.read_csv(logistic_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")

# The rows are fused position by position, so all three files must list the
# same filenames in the same order
if not (
    all(logistic_predictions['File_Name'] == mlp_predictions['File_Name']) and
    all(logistic_predictions['File_Name'] == svm_predictions['File_Name'])
):
    raise ValueError("Mismatch in filenames between the prediction files.")

# Weighted vote per file
fused_predictions = []
for log_row, mlp_row, svm_row in zip(
    logistic_predictions.itertuples(),
    mlp_predictions.itertuples(),
    svm_predictions.itertuples()
):
    vote_counter = Counter()
    vote_counter[log_row.Predicted_Label] += LOGISTIC_VOTE_WEIGHT
    vote_counter[mlp_row.Predicted_Label] += MLP_VOTE_WEIGHT
    vote_counter[svm_row.Predicted_Label] += SVM_VOTE_WEIGHT

    # The label with the highest total wins. On a tie (Logistic Regression and
    # SVM agreeing, 1 + 2, against the MLP's 3) most_common returns the label
    # that was counted first, i.e. the Logistic Regression / SVM label.
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": log_row.File_Name, "fused_label": fused_label})

# Save fused predictions to a new TSV file
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Text_fusion_weight.tsv.


**Fusion without weight**

In [25]:
import pandas as pd
from collections import Counter

# Inputs: one prediction file per text model. Output: their plain majority vote.
logistic_predictions_path = "/content/CV_Logistic_prediction.tsv"
mlp_predictions_path = "/content/MLP_Prediction.tsv"
svm_predictions_path = "/content/SVM_prediction.tsv"
output_fused_predictions_path = "/content/Text_fusion.tsv"

# Read the three prediction tables
logistic_predictions, mlp_predictions, svm_predictions = (
    pd.read_csv(path, sep="\t")
    for path in (logistic_predictions_path, mlp_predictions_path, svm_predictions_path)
)

# Rows are combined by position, so the filename columns must agree row for row
reference_names = logistic_predictions['File_Name']
names_agree = (
    (reference_names == mlp_predictions['File_Name']).all()
    and (reference_names == svm_predictions['File_Name']).all()
)
if not names_agree:
    raise ValueError("Mismatch in filenames between the prediction files.")

# One vote per model. When all three disagree, most_common returns the label
# counted first, which is the Logistic Regression prediction.
fused_predictions = [
    {"File_Name": name, "fused_label": Counter(labels).most_common(1)[0][0]}
    for name, *labels in zip(
        logistic_predictions['File_Name'],
        logistic_predictions['Predicted_Label'],
        mlp_predictions['Predicted_Label'],
        svm_predictions['Predicted_Label'],
    )
]

# Write the fused labels out as a TSV
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Text_fusion.tsv.


# Audio

**MLP Classifier**

In [28]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Where this cell writes its predictions
output_tsv_path = "MLP_audio_predictions.tsv"

# Where the inputs live on the mounted Drive (update with actual paths)
train_csv_path = "/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv"
train_audio_dir = "/content/drive/MyDrive/Project/Tamil/audio"
test_audio_dir = "/content/drive/MyDrive/Project/Test/Tamil/audio"

# Step 1: Read the training CSV (file names and class labels) and store the
# label column as strings
train_df = pd.read_csv(train_csv_path)
labels_as_text = train_df['Class Label Short'].astype(str)
train_df['Class Label Short'] = labels_as_text

# Step 2: Preprocessing function for audio data
def preprocess_audio(file_path, sr=16000, n_mfcc=13):
    """Turn one audio file into a fixed-length MFCC feature vector.

    The recording is loaded at its own sample rate, resampled to ``sr`` when
    the two differ, trimmed of leading/trailing silence, amplitude-normalised
    and converted to ``n_mfcc`` MFCCs, which are averaged over the time frames.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate the signal is brought to before feature extraction.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The per-coefficient mean of the MFCC matrix, or ``None`` when any step
        raises (the error is printed, not re-raised).
    """
    features = None
    try:
        waveform, native_sr = librosa.load(file_path, sr=None)
        if native_sr != sr:
            waveform = librosa.resample(waveform, orig_sr=native_sr, target_sr=sr)
        # trim() returns (trimmed signal, interval); only the signal is used
        waveform, _ = librosa.effects.trim(waveform)
        waveform = librosa.util.normalize(waveform)
        coefficients = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)
        # Averaging over axis 1 makes the vector length independent of duration
        features = np.mean(coefficients, axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return features

# Step 3: Extract features and labels for training
X_train, y_train = [], []
for _, row in train_df.iterrows():
    # The CSV lists bare file names; the recordings on disk carry a .wav extension
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    X_train.append(features)
    y_train.append(row['Class Label Short'])

X_train = np.array(X_train)

# Stop here when nothing could be loaded: continuing would either fail later
# with a NameError or predict with a `clf` left over from an earlier cell run.
if X_train.size == 0:
    raise ValueError("No valid training data found. Please check file paths and data.")

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Step 4: Compute class weights
# NOTE(review): these weights are only printed. Nothing below passes them to
# the MLPClassifier, so this model is trained without any class re-weighting.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print(f"Class weights: {class_weight_dict}")

# Step 5: Train MLPClassifier
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=42
)
clf.fit(X_train, y_train_encoded)
print("MLPClassifier trained successfully.")

# Step 6: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    predicted_label_encoded = clf.predict([features])
    predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
    # Store the name without its extension so it lines up with the other
    # prediction files (the RF file is stripped of '.wav', and the text
    # predictions use the extension-less names from the CSV).
    test_predictions.append({
        "File_Name": os.path.splitext(file_name)[0],
        "predicted_label": predicted_label,
    })

# Step 7: Save predictions to a TSV file
# Explicit columns keep the header in place even when no file could be predicted
test_predictions_df = pd.DataFrame(test_predictions, columns=["File_Name", "predicted_label"])
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_003.wav
Class weights: {0: 1.5661538461538462, 1: 1.615873015873016, 2: 0.35470383275261325, 3: 3.084848484848485, 4: 1.6688524590163933}




MLPClassifier trained successfully.
Predictions saved to MLP_audio_predictions.tsv.


**SVM**

In [29]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Input locations on the mounted Drive (update with actual paths)
train_audio_dir = "/content/drive/MyDrive/Project/Tamil/audio"
test_audio_dir = "/content/drive/MyDrive/Project/Test/Tamil/audio"
train_csv_path = "/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv"

# Output file for the SVM audio predictions
output_tsv_path = "SVM_audio_predictions.tsv"

# Step 1: Load the training CSV and make sure the class labels are strings
train_df = pd.read_csv(train_csv_path).astype({'Class Label Short': str})

# Step 2: Preprocessing function for audio data
def preprocess_audio(file_path, sr=16000, n_mfcc=13):
    """Extract a time-averaged MFCC vector from one audio file.

    Args:
        file_path: Path of the audio file to read.
        sr: Target sample rate; the signal is resampled when the file's own
            rate differs.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The mean of each MFCC coefficient over all frames, or ``None`` if
        loading or feature extraction raised (the error is printed).
    """
    try:
        # Load at the native rate first, then resample only when necessary
        samples, file_sr = librosa.load(file_path, sr=None)
        if file_sr != sr:
            samples = librosa.resample(samples, orig_sr=file_sr, target_sr=sr)
        # Drop leading/trailing silence, then normalise the amplitude
        samples, _ = librosa.effects.trim(samples)
        samples = librosa.util.normalize(samples)
        mfcc_frames = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=n_mfcc)
        feature_vector = np.mean(mfcc_frames, axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None
    return feature_vector

# Step 3: Extract features and labels for training
X_train, y_train = [], []
for _, row in train_df.iterrows():
    # The CSV lists bare file names; the recordings on disk carry a .wav extension
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    X_train.append(features)
    y_train.append(row['Class Label Short'])

X_train = np.array(X_train)

# Stop here when nothing could be loaded: continuing would either fail later
# with a NameError or predict with a `clf`/`scaler` left over from an earlier
# cell run.
if X_train.size == 0:
    raise ValueError("No valid training data found. Please check file paths and data.")

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Step 4: Compute class weights (passed to the SVC below)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print(f"Class weights: {class_weight_dict}")

# Step 5: Standardize features; the scaler is fitted on the training set only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Train SVM Classifier
# NOTE(review): probability=True is only needed for predict_proba, which this
# cell never calls, and it makes fit() slower - drop it if nothing else needs it.
clf = SVC(
    kernel='rbf',  # Radial Basis Function kernel
    C=1.0,         # Regularization parameter
    class_weight=class_weight_dict,
    probability=True,  # Enable probability estimates
    random_state=42
)
clf.fit(X_train, y_train_encoded)
print("SVM classifier trained successfully.")

# Step 6: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    features = scaler.transform([features])  # Same scaling as the training data
    predicted_label_encoded = clf.predict(features)
    predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
    # Store the name without its extension so it lines up with the other
    # prediction files (the RF file is stripped of '.wav', and the text
    # predictions use the extension-less names from the CSV).
    test_predictions.append({
        "File_Name": os.path.splitext(file_name)[0],
        "predicted_label": predicted_label,
    })

# Step 7: Save predictions to a TSV file
# Explicit columns keep the header in place even when no file could be predicted
test_predictions_df = pd.DataFrame(test_predictions, columns=["File_Name", "predicted_label"])
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_003.wav
Class weights: {0: 1.5661538461538462, 1: 1.615873015873016, 2: 0.35470383275261325, 3: 3.084848484848485, 4: 1.6688524590163933}
SVM classifier trained successfully.
Predictions saved to SVM_audio_predictions.tsv.


**Random Forest**

In [31]:
import pandas as pd
import librosa
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Locations used by this cell (update with actual paths):
# the labelled training CSV, the training/test audio folders on the mounted
# Drive, and the TSV the Random Forest predictions are written to
train_csv_path, train_audio_dir, test_audio_dir, output_tsv_path = (
    "/content/drive/MyDrive/Project/Tamil/Text/TA-AT-train.csv",
    "/content/drive/MyDrive/Project/Tamil/audio",
    "/content/drive/MyDrive/Project/Test/Tamil/audio",
    "RF_audio_predictions.tsv",
)

# Step 1: Load the training CSV; the class-label column is stored as strings
train_df = pd.read_csv(train_csv_path)
train_df = train_df.assign(**{'Class Label Short': train_df['Class Label Short'].astype(str)})

# Step 2: Preprocessing function for audio data
def preprocess_audio(file_path, sr=16000, n_mfcc=13):
    """Compute the mean MFCC vector of an audio file.

    Steps: load at the file's native rate, resample to ``sr`` if the rates
    differ, trim silence at both ends, normalise the amplitude, compute
    ``n_mfcc`` MFCCs and average them over the time axis.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate used for feature extraction.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The averaged MFCC vector, or ``None`` after printing the error when
        any step raises.
    """
    try:
        audio, source_rate = librosa.load(file_path, sr=None)
        audio = (
            audio if source_rate == sr
            else librosa.resample(audio, orig_sr=source_rate, target_sr=sr)
        )
        trimmed, _ = librosa.effects.trim(audio)
        normalised = librosa.util.normalize(trimmed)
        return np.mean(librosa.feature.mfcc(y=normalised, sr=sr, n_mfcc=n_mfcc), axis=1)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Step 3: Extract features and labels for training
X_train, y_train = [], []
for _, row in train_df.iterrows():
    # The CSV lists bare file names; the recordings on disk carry a .wav extension
    file_path = os.path.join(train_audio_dir, row['File Name'] + ".wav")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    X_train.append(features)
    y_train.append(row['Class Label Short'])

X_train = np.array(X_train)

# Stop here when nothing could be loaded: continuing would either fail later
# with a NameError or predict with a `clf`/`scaler` left over from an earlier
# cell run.
if X_train.size == 0:
    raise ValueError("No valid training data found. Please check file paths and data.")

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Step 4: Standardize features; the scaler is fitted on the training set only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Step 5: Train Random Forest Classifier (no class weighting is applied here)
clf = RandomForestClassifier(
    n_estimators=200,  # Number of trees in the forest
    max_depth=None,  # No maximum depth (fully grown trees)
    min_samples_split=2,  # Minimum samples to split a node
    min_samples_leaf=1,  # Minimum samples in a leaf node
    random_state=42,
    n_jobs=-1  # Use all CPU cores for training
)
clf.fit(X_train, y_train_encoded)
print("Random Forest classifier trained successfully.")

# Step 6: Predict for test data
test_predictions = []
test_files = [f for f in os.listdir(test_audio_dir) if f.endswith('.wav')]
for file_name in test_files:
    file_path = os.path.join(test_audio_dir, file_name)
    features = preprocess_audio(file_path)
    if features is None:
        print(f"Skipping file {file_path} due to feature extraction failure.")
        continue
    features = scaler.transform([features])  # Same scaling as the training data
    predicted_label_encoded = clf.predict(features)
    predicted_label = label_encoder.inverse_transform(predicted_label_encoded)[0]
    # Store the name without its extension so it lines up with the other
    # prediction files; the separate '.wav'-stripping step for this file then
    # has nothing left to remove.
    test_predictions.append({
        "File_Name": os.path.splitext(file_name)[0],
        "predicted_label": predicted_label,
    })

# Step 7: Save predictions to a TSV file
# Explicit columns keep the header in place even when no file could be predicted
test_predictions_df = pd.DataFrame(test_predictions, columns=["File_Name", "predicted_label"])
test_predictions_df.to_csv(output_tsv_path, sep="\t", index=False)
print(f"Predictions saved to {output_tsv_path}.")


File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_037_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_001.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_002.wav
File not found: /content/drive/MyDrive/Project/Tamil/audio/H_TA_002_G_M_038_003.wav
Random Forest classifier trained successfully.
Predictions saved to RF_audio_predictions.tsv.


In [35]:
import pandas as pd

# The Random Forest prediction file is read and then overwritten in place
file_path = "/content/RF_audio_predictions.tsv"
output_path = "/content/RF_audio_predictions.tsv"

# Read the table and drop every literal '.wav' from the File_Name values
# (regex=False: the dot is matched as a plain character)
df = pd.read_csv(file_path, sep="\t")
df = df.assign(File_Name=df['File_Name'].str.replace('.wav', '', regex=False))

# Write the cleaned table back
df.to_csv(output_path, sep="\t", index=False)

print(f"Updated file saved to {output_path}")


Updated file saved to /content/RF_audio_predictions.tsv


**Fusion with weight**

In [44]:
import pandas as pd
from collections import Counter

# Inputs: one prediction file per audio model. Output: their weighted vote.
svm_predictions_path = "/content/SVM_audio_predictions.tsv"
mlp_predictions_path = "/content/MLP_audio_predictions.tsv"
rf_predictions_path = "/content/RF_audio_predictions.tsv"
output_fused_predictions_path ="/content/Audio_fusion_weight.tsv"

# Read the three prediction tables
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
rf_predictions = pd.read_csv(rf_predictions_path, sep="\t")

# Rows are combined by position, so the filename columns must agree row for row
svm_names = svm_predictions['File_Name']
names_agree = (
    (svm_names == mlp_predictions['File_Name']).all()
    and (svm_names == rf_predictions['File_Name']).all()
)
if not names_agree:
    raise ValueError("Mismatch in filenames between the prediction files.")

# Weighted vote per file: SVM counts 3, MLP 2, Random Forest 1.
# NOTE(review): with these weights the SVM label always wins. MLP and RF can
# reach at most 2 + 1 = 3 together, and on that tie most_common returns the
# label counted first, which is the SVM's. Confirm this is intended.
fused_predictions = []
per_file_labels = zip(
    svm_predictions['File_Name'],
    svm_predictions['predicted_label'],
    mlp_predictions['predicted_label'],
    rf_predictions['predicted_label'],
)
for name, svm_label, mlp_label, rf_label in per_file_labels:
    tally = Counter()
    for label, weight in ((svm_label, 3), (mlp_label, 2), (rf_label, 1)):
        tally[label] += weight
    fused_predictions.append({"File_Name": name, "fused_label": tally.most_common(1)[0][0]})

# Write the fused labels out as a TSV
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Audio_fusion_weight.tsv.


**Fusion without weight**

In [42]:
import pandas as pd
from collections import Counter

# Inputs: one prediction file per audio model. Output: their plain majority vote.
svm_predictions_path = "/content/SVM_audio_predictions.tsv"
mlp_predictions_path = "/content/MLP_audio_predictions.tsv"
rf_predictions_path = "/content/RF_audio_predictions.tsv"
output_fused_predictions_path = "/content/Fused_prediction_Audio.tsv"

# Read the three prediction tables
svm_predictions = pd.read_csv(svm_predictions_path, sep="\t")
mlp_predictions = pd.read_csv(mlp_predictions_path, sep="\t")
rf_predictions = pd.read_csv(rf_predictions_path, sep="\t")

# Rows are combined by position, so the filename columns must agree row for row
same_as_mlp = (svm_predictions['File_Name'] == mlp_predictions['File_Name']).all()
if not (same_as_mlp and (svm_predictions['File_Name'] == rf_predictions['File_Name']).all()):
    raise ValueError("Mismatch in filenames between the prediction files.")

# One vote per model. When all three disagree, most_common returns the label
# counted first, which is the SVM prediction.
fused_predictions = []
aligned_rows = zip(
    svm_predictions.itertuples(),
    mlp_predictions.itertuples(),
    rf_predictions.itertuples()
)
for svm_row, mlp_row, rf_row in aligned_rows:
    votes = [svm_row.predicted_label, mlp_row.predicted_label, rf_row.predicted_label]
    winner = Counter(votes).most_common(1)[0][0]
    fused_predictions.append({"File_Name": svm_row.File_Name, "fused_label": winner})

# Write the fused labels out as a TSV
fused_predictions_df = pd.DataFrame(fused_predictions)
fused_predictions_df.to_csv(output_fused_predictions_path, sep="\t", index=False)
print(f"Fused predictions saved to {output_fused_predictions_path}.")


Fused predictions saved to /content/Fused_prediction_Audio.tsv.


In [71]:
# Convert filename column to uppercase
# NOTE(review): `fused_predictions_df` is not created in this cell; it is
# whatever the most recently executed fusion cell left in the kernel. Read top
# to bottom, the preceding cell builds it for
# "/content/Fused_prediction_Audio.tsv" (the unweighted audio fusion), while
# the output name below says "Final ... weight". Confirm which fusion result
# is meant and build the frame explicitly before this step.
# The assignment modifies that shared DataFrame in place.
fused_predictions_df['File_Name'] = fused_predictions_df['File_Name'].str.upper()

# Save the updated file
output_fused_file = "/content/Final_Fused_weight_predictions.tsv"
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Updated fused predictions saved to {output_fused_file}.")


Updated fused predictions saved to /content/Final_Fused_weight_predictions.tsv.


# Final (Text+Audio)

**Fusion with weight**

In [69]:
import pandas as pd
from collections import Counter

# Weighted late fusion of the text-side and audio-side predictions.
# Inputs : two TSV files, each with a `File_Name` and a `fused_label` column.
# Output : one TSV with the final `fused_label` per file, sorted by file name.

# File paths (update if necessary)
text_predictions_path = "/content/Text_fusion_weight.tsv"
audio_predictions_path = "/content/Audio_fusion_weight.tsv"
output_fused_file = "/content/SSNCSE_Tamil_Run1.tsv"

# Vote weights (adjust as needed).
# NOTE: there are only two voters, so whenever they disagree the one with the
# larger weight wins. With 1 vs 2 the fused label is always the audio label.
weight_text = 1
weight_audio = 2

# Load predictions
text_predictions = pd.read_csv(text_predictions_path, sep="\t")
audio_predictions = pd.read_csv(audio_predictions_path, sep="\t")

# Trim whitespace and lowercase the file names so both sources use the same key format
text_predictions['File_Name'] = text_predictions['File_Name'].str.strip().str.lower()
audio_predictions['File_Name'] = audio_predictions['File_Name'].str.strip().str.lower()

# Report file names that appear in only one of the two sources
text_filenames = set(text_predictions['File_Name'])
audio_filenames = set(audio_predictions['File_Name'])
only_in_text = text_filenames - audio_filenames
only_in_audio = audio_filenames - text_filenames
print(f"Filenames only in text predictions: {only_in_text}")
print(f"Filenames only in audio predictions: {only_in_audio}")

# Keep only the files present in both sources, sorted by file name
common_filenames = text_filenames & audio_filenames
text_predictions = text_predictions[text_predictions['File_Name'].isin(common_filenames)]
audio_predictions = audio_predictions[audio_predictions['File_Name'].isin(common_filenames)]
text_predictions = text_predictions.sort_values(by='File_Name').reset_index(drop=True)
audio_predictions = audio_predictions.sort_values(by='File_Name').reset_index(drop=True)

# Pair the rows by File_Name instead of by position. Zipping two sorted frames
# silently mis-pairs (or truncates) rows as soon as one file lists a name twice;
# validate="one_to_one" turns that situation into an explicit MergeError.
paired_predictions = text_predictions[['File_Name', 'fused_label']].merge(
    audio_predictions[['File_Name', 'fused_label']],
    on='File_Name',
    how='inner',  # inner join keeps the (already sorted) order of the left keys
    suffixes=('_text', '_audio'),
    validate='one_to_one',
)

fused_predictions = []
for row in paired_predictions.itertuples(index=False):
    # Weighted vote counting
    vote_counter = Counter()
    vote_counter[row.fused_label_text] += weight_text
    vote_counter[row.fused_label_audio] += weight_audio

    # Select the label with the highest total weight
    # (on a tie, most_common returns the label counted first, i.e. the text label)
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": row.File_Name, "fused_label": fused_label})

# Save to TSV file; explicit columns keep the header even when nothing was fused
fused_predictions_df = pd.DataFrame(fused_predictions, columns=["File_Name", "fused_label"])
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Final fused predictions saved to {output_fused_file}.")




Filenames only in text predictions: set()
Filenames only in audio predictions: set()
Final fused predictions saved to /content/SSNCSE_Tamil_Run1.tsv.


In [72]:
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Score the weighted text+audio fusion (Run1) against the gold labels.

# Gold labels: a tab-separated file with `File_Name` and `Class Label` columns
true_labels_file = "/content/TA-AT-test.xlsx - Sheet1.tsv"
true_labels_data = pd.read_csv(true_labels_file, sep='\t')

# Predicted labels: the TSV written by the fusion step
predicted_labels_file = "/content/SSNCSE_Tamil_Run1.tsv"
predicted_data = pd.read_csv(predicted_labels_file, sep='\t')

# Remove leading/trailing spaces from the gold file's column names so that
# 'File_Name' and 'Class Label' can be selected reliably
true_labels_data.columns = true_labels_data.columns.str.strip()

# Align gold and predicted labels on File_Name (inner join)
merged_data = pd.merge(true_labels_data[['File_Name', 'Class Label']],
                       predicted_data[['File_Name', 'fused_label']],
                       on='File_Name')

# The inner join silently drops gold files without a prediction, which would make
# the report cover only a subset of the test set; surface that instead of hiding it.
unscored_files = set(true_labels_data['File_Name']) - set(predicted_data['File_Name'])
if unscored_files:
    print(f"WARNING: {len(unscored_files)} gold file(s) have no prediction and are not scored: "
          f"{sorted(unscored_files, key=str)}")

# Extract true labels and predicted labels
y_true = merged_data['Class Label']
y_pred = merged_data['fused_label']

# classification_report accepts string labels directly. Fitting a LabelEncoder on
# y_true only and then transforming y_pred raises ValueError as soon as a predicted
# label never occurs in the gold labels, so no encoding step is used here.
print(classification_report(y_true, y_pred))



              precision    recall  f1-score   support

           C       0.33      0.30      0.32        10
           G       0.50      0.80      0.62        10
           N       0.80      0.40      0.53        10
           P       0.89      0.80      0.84        10
           R       0.64      0.70      0.67        10

    accuracy                           0.60        50
   macro avg       0.63      0.60      0.59        50
weighted avg       0.63      0.60      0.59        50



**Fusion without weight**

In [54]:
import pandas as pd
from collections import Counter

# Unweighted late fusion of the text-side and audio-side predictions.
# Inputs : two TSV files, each with a `File_Name` and a `fused_label` column.
# Output : one TSV with the final `fused_label` per file, sorted by file name.

# File paths (update if necessary)
text_predictions_path = "/content/Text_fusion.tsv"
audio_predictions_path = "/content/Fused_prediction_Audio.tsv"
output_fused_file = "/content/SSNCSE_Tamil_Run2.tsv"

# Load predictions
text_predictions = pd.read_csv(text_predictions_path, sep="\t")
audio_predictions = pd.read_csv(audio_predictions_path, sep="\t")

# Trim whitespace and lowercase the file names so both sources use the same key format
text_predictions['File_Name'] = text_predictions['File_Name'].str.strip().str.lower()
audio_predictions['File_Name'] = audio_predictions['File_Name'].str.strip().str.lower()

# Report file names that appear in only one of the two sources
text_filenames = set(text_predictions['File_Name'])
audio_filenames = set(audio_predictions['File_Name'])
only_in_text = text_filenames - audio_filenames
only_in_audio = audio_filenames - text_filenames
print(f"Filenames only in text predictions: {only_in_text}")
print(f"Filenames only in audio predictions: {only_in_audio}")

# Keep only the files present in both sources, sorted by file name
common_filenames = text_filenames & audio_filenames
text_predictions = text_predictions[text_predictions['File_Name'].isin(common_filenames)]
audio_predictions = audio_predictions[audio_predictions['File_Name'].isin(common_filenames)]
text_predictions = text_predictions.sort_values(by='File_Name').reset_index(drop=True)
audio_predictions = audio_predictions.sort_values(by='File_Name').reset_index(drop=True)

# Pair the rows by File_Name instead of by position. Zipping two sorted frames
# silently mis-pairs (or truncates) rows as soon as one file lists a name twice;
# validate="one_to_one" turns that situation into an explicit MergeError.
paired_predictions = text_predictions[['File_Name', 'fused_label']].merge(
    audio_predictions[['File_Name', 'fused_label']],
    on='File_Name',
    how='inner',  # inner join keeps the (already sorted) order of the left keys
    suffixes=('_text', '_audio'),
    validate='one_to_one',
)

# Majority fusion without weighting
fused_predictions = []
for row in paired_predictions.itertuples(index=False):
    # Equal voting: one vote per modality, text counted first
    vote_counter = Counter([row.fused_label_text, row.fused_label_audio])

    # Select the label with the highest count.
    # NOTE: with two voters a disagreement is always a 1-1 tie, and most_common
    # returns the label counted first, so the text label wins every disagreement.
    fused_label = vote_counter.most_common(1)[0][0]
    fused_predictions.append({"File_Name": row.File_Name, "fused_label": fused_label})

# Save to TSV file; explicit columns keep the header even when nothing was fused
fused_predictions_df = pd.DataFrame(fused_predictions, columns=["File_Name", "fused_label"])
fused_predictions_df.to_csv(output_fused_file, sep="\t", index=False)

print(f"Final fused predictions saved to {output_fused_file}.")


Filenames only in text predictions: set()
Filenames only in audio predictions: set()
Final fused predictions saved to /content/SSNCSE_Tamil_Run2.tsv.


In [68]:
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Score the unweighted text+audio fusion (Run2) against the gold labels.

# Gold labels: a tab-separated file with `File_Name` and `Class Label` columns
true_labels_file = "/content/TA-AT-test.xlsx - Sheet1.tsv"
true_labels_data = pd.read_csv(true_labels_file, sep='\t')

# Predicted labels: this section ("Fusion without weight") writes
# SSNCSE_Tamil_Run2.tsv, so that is the file to score here. The previous path
# pointed at SSNCSE_Tamil_Run1.tsv, the output of the weighted fusion.
predicted_labels_file = "/content/SSNCSE_Tamil_Run2.tsv"
predicted_data = pd.read_csv(predicted_labels_file, sep='\t')

# Remove leading/trailing spaces from the gold file's column names so that
# 'File_Name' and 'Class Label' can be selected reliably
true_labels_data.columns = true_labels_data.columns.str.strip()

# Align gold and predicted labels on File_Name (inner join)
merged_data = pd.merge(true_labels_data[['File_Name', 'Class Label']],
                       predicted_data[['File_Name', 'fused_label']],
                       on='File_Name')

# The inner join silently drops gold files without a prediction, which would make
# the report cover only a subset of the test set; surface that instead of hiding it.
unscored_files = set(true_labels_data['File_Name']) - set(predicted_data['File_Name'])
if unscored_files:
    print(f"WARNING: {len(unscored_files)} gold file(s) have no prediction and are not scored: "
          f"{sorted(unscored_files, key=str)}")

# Extract true labels and predicted labels
y_true = merged_data['Class Label']
y_pred = merged_data['fused_label']

# classification_report accepts string labels directly. Fitting a LabelEncoder on
# y_true only and then transforming y_pred raises ValueError as soon as a predicted
# label never occurs in the gold labels, so no encoding step is used here.
print(classification_report(y_true, y_pred))



              precision    recall  f1-score   support

           C       0.38      0.60      0.46        10
           G       0.70      0.70      0.70        10
           N       0.40      0.60      0.48        10
           P       0.00      0.00      0.00        10
           R       0.86      0.60      0.71        10

    accuracy                           0.50        50
   macro avg       0.47      0.50      0.47        50
weighted avg       0.47      0.50      0.47        50

