In [None]:
# Change into the directory that holds the .npy files loaded further down
# (they are located with a relative "*.npy" glob). Replace the "-your path-"
# placeholder with that directory before running.
%cd -your path-

# **Importing All Libraries**

In [None]:
import librosa, os, soundfile, numpy as np, pandas as pd, matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
import glob, os
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import OneClassSVM
from collections import Counter
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import scipy.stats as stats

# **Selecting Data with Similar Features**

In [None]:
import glob
import numpy as np

totalLangData = []   # flattened feature vectors of every selected language
totalLabelData = []  # label of every feature vector, aligned with totalLangData

# glob returns matches in arbitrary, filesystem-dependent order. Sort the list
# so the positional indices below select the same files on every machine/run.
# NOTE(review): the indices now refer to the alphabetically sorted file list;
# confirm they still pick the intended languages.
npy_files = sorted(glob.glob("*.npy"))

# Positions (within npy_files) of the files to load
indices = [1,2,6,9,10,12,13,14,15,17]
for index in indices:
    if index >= len(npy_files):
        # Still best-effort (skip), but report it: a silent skip would quietly
        # build the data set from fewer files than requested.
        print(f"Skipping index {index}: only {len(npy_files)} .npy files found")
        continue
    filename = npy_files[index]
    # allow_pickle=True can execute arbitrary code while loading; only use it
    # on .npy files from a trusted source.
    oneLangData = np.load(filename, allow_pickle=True)
    # Column 4 is used as the feature array and column 1 as the label.
    # Rows 0..4998 are taken, i.e. 4999 samples per file.
    # NOTE(review): if 5000 samples per file were intended, slice with 0:5000.
    langTrainData = oneLangData[0:4999, 4]
    totalLangData += [x.flatten().tolist() for x in langTrainData]
    label = oneLangData[0:4999, 1].tolist()
    totalLabelData += label

# Convert both lists to numpy arrays to feed to the classifier
totalLangDataAsArray = np.asarray(totalLangData)
totalLabelDataAsArray = np.asarray(totalLabelData)

# Data scaling. totalLangDataAsArray keeps the unscaled features in case they
# need to be processed differently later; features_scaled is a new array.
# NOTE(review): the scaler and the PCA below are fitted on all samples before
# the train/test split, so test-set statistics influence the transform. Fit
# them on the training portion only if unbiased test results are required.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(totalLangDataAsArray)

# Dimensionality reduction. fit_transform already fits the model, so no
# separate pca.fit call is needed (it would only repeat the decomposition).
pca = PCA(n_components=190)
components = pca.fit_transform(features_scaled)

# Percentage of the total variance retained by the kept components
total_var = pca.explained_variance_ratio_.sum() * 100
print(total_var)


# Data splitting: 30% of the samples are held out, with a fixed seed
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    components,
    totalLabelDataAsArray,
    test_size=0.3,
    random_state=69
)

99.02100749558126


### **Plotting Data**

In [None]:
# Project the standardised features onto two principal components for a 2-D
# scatter plot. fit_transform fits and projects in one pass, so a separate
# pca.fit call is not needed.
pca = PCA(n_components=2)
components = pca.fit_transform(features_scaled)

# Percentage of the total variance captured by the two components
total_var = pca.explained_variance_ratio_.sum() * 100

# One point per sample, coloured by its label
fig = px.scatter(
    components, x=0, y=1,
    title=f'Total Explained Variance: {total_var:.2f}%',
    color=totalLabelDataAsArray,
    labels={'0': 'PCA Component 1', '1': 'PCA Component 2'},
)
fig.update_layout(legend_title_text='Language')
fig.show()

In [None]:
# 3-D plotting: same view as the 2-D scatter, with a third principal component.
# The PCA is fitted on the standardised features (features_scaled), exactly as
# for the 2-D plot; fitting on the raw, unscaled features would let the
# features with the largest numeric range dominate the components and make the
# two plots incomparable.
pca = PCA(n_components=3)
components = pca.fit_transform(features_scaled)

# Percentage of the total variance captured by the three components
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=totalLabelDataAsArray,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PCA Component 1', '1': 'PCA Component 2', '2': 'PCA Component 3'}
)
fig.update_layout(legend_title_text='Language')
fig.show()

## **Training One Class SVM**

In [None]:
# Randomised search over nu for an RBF One-Class SVM
param_dist = {
    'kernel': ['rbf'],
    'nu': stats.uniform(0.01, 0.99),  # uniform on [0.01, 1.0] (loc=0.01, scale=0.99)
    'gamma': ['auto']  # single fixed setting, not searched
}
model = OneClassSVM()
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy', random_state=42)

# OneClassSVM.predict returns +1 (inlier) or -1 (outlier). Scoring 'accuracy'
# against the language-name labels can therefore never produce a match, and
# every candidate would get the same meaningless score. Score against
# all-inlier targets instead, so 'accuracy' is the fraction of held-out
# samples that the model accepts as inliers.
# NOTE(review): this criterion favours small nu by construction; use a labelled
# outlier set or a different criterion if one is available.
inlier_targets = np.ones(len(X_train_scaled), dtype=int)
random_search.fit(X_train_scaled, inlier_targets)
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

best_clf = random_search.best_estimator_

# Predict on the training set with the best estimator found by the search
y_pred = best_clf.predict(X_train_scaled)

# Count outliers (-1 indicates outliers)
outliers = (y_pred == -1).sum()
print(f"Number of outliers detected: {outliers}")

In [None]:
def report_outliers(predictions, labels, title):
    """Print outlier/inlier totals and the per-label outlier counts.

    Parameters
    ----------
    predictions : numpy array of +1 (inlier) / -1 (outlier) values, as
        returned by ``OneClassSVM.predict``.
    labels : numpy array of labels aligned element-wise with ``predictions``.
    title : heading printed above the summary.

    Returns
    -------
    pandas.DataFrame
        One row per label that has at least one outlier, with a single
        ``Count`` column.
    """
    outlier_mask = predictions == -1
    print(title)
    print("Outliers:", np.count_nonzero(outlier_mask))
    print("Inliers:", np.count_nonzero(predictions == 1))
    # Labels of the samples flagged as outliers, counted per label
    counts = Counter(labels[outlier_mask])
    table = pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])
    print(table)
    return table


# Fit the One-Class SVM on the training features only (no labels are used)
X = X_train_scaled
clf = OneClassSVM(nu=0.01, kernel='rbf', gamma='auto').fit(X)

# Predict once per split and reuse the result for both the totals and the
# per-label breakdown; predicting a second time would only repeat the work.
outliers_train = clf.predict(X)
train_outlier_counts = report_outliers(outliers_train, y_train_scaled, "Training Data")

# Testing
outliers_test = clf.predict(X_test_scaled)
print("\n\n")
test_outlier_counts = report_outliers(outliers_test, y_test_scaled, "Testing Data")

Training Data
Outliers: 404
Inliers: 34589
             Count
ur-Urdu         41
sd-Sindhi       42
te-Telugu       52
ta-Tamil        22
pa-Panjabi      59
si-Sinhala      46
bn-Bengali      38
hi-Hindi        43
ne-Nepali       18
as-Assamese     43



Testing Data
Outliers: 258
Inliers: 14739
             Count
bn-Bengali      29
pa-Panjabi      38
sd-Sindhi       29
te-Telugu       32
ta-Tamil        26
ur-Urdu         33
as-Assamese     24
si-Sinhala      11
hi-Hindi        23
ne-Nepali       13


In [None]:
# Reduce the training features to two principal components and fit a
# One-Class SVM on that 2-D representation. fit_transform fits and projects in
# one pass, so a separate pca.fit call is not needed.
pca = PCA(n_components=2)
components = pca.fit_transform(X_train_scaled)

# NOTE(review): X and clf are rebound here, replacing the data/model used for
# the full-dimensional train/test report.
X = components
clf = OneClassSVM(nu=0.01, kernel='rbf', gamma='auto').fit(X)

# +1 = inlier, -1 = outlier for every training sample
# NOTE(review): the recorded run flags 9304 of 34993 training samples (~27%)
# even though nu=0.01. gamma='auto' on unstandardised 2-D components may be a
# poor fit; consider gamma='scale' or rescaling the components. TODO confirm.
outliers_pca = clf.predict(X)
outliers = np.count_nonzero(outliers_pca == -1)
inliers = np.count_nonzero(outliers_pca == 1)
print("PCA Training Data")
print("Outliers:",outliers)
print( "Inliers:",inliers)


PCA Training Data
Outliers: 9304
Inliers: 25689


### **Plotting Outliers**

In [None]:
# Positions of the inliers (+1) and outliers (-1) among the 2-D predictions
inlier_indices = np.where(outliers_pca == 1)[0]
outlier_indices = np.where(outliers_pca == -1)[0]

# Counts shown in the legend title
num_inliers = len(inlier_indices)
num_outliers = len(outlier_indices)

# Frame for plotting: the two PCA components plus the inlier flag
df_plot = pd.DataFrame(components, columns=['PCA1', 'PCA2'])
df_plot['Inliers'] = outliers_pca == 1  # True = inlier, False = outlier
# Readable category used for colouring and for the legend entries
df_plot['Status'] = np.where(df_plot['Inliers'], 'Inlier', 'Outlier')

# The legend title states that outliers are red. Plotly assigns default colours
# by order of first appearance, so the colours are pinned explicitly to make
# that statement hold regardless of which category occurs first.
fig = px.scatter(df_plot, x='PCA1', y='PCA2', color='Status',
                 color_discrete_map={'Inlier': 'blue', 'Outlier': 'red'},
                 title='One-Class SVM Outlier Detection',
                 labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2'})

# Add legend title with counts
fig.update_layout(legend_title_text=f'Inliers: {num_inliers}, Outliers: {num_outliers} (Red)')
fig.show()