# Model built on Colab is trained on only a subset of 100 data samples

In [None]:
import numpy as np
import glob , csv , codecs , copy
from tabulate import tabulate
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Colab-only: mount Google Drive at /content/drive; the dataset paths used by
# the loaders in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Three-letter language codes. Each code is also the name of that language's
# directory under the dataset root, and the list order fixes the index used
# for per-language models and prediction counts.
lang_list = 'asm ben eng guj hin kan mal mar odi pun tam tel'.split()

## Get Feature List

In [None]:
def get_feature(fpath, lang, max_files=100,
                base_dir=r'/content/drive/MyDrive/PRA2_data/extracted/'):
    """Load the feature rows of one language/split into a single float array.

    Reads the UTF-16 encoded ``*.csv`` files found in
    ``<base_dir><lang>/<fpath>`` and stacks their rows (presumably one
    39-dimensional MFCC vector per row -- the width is fixed at 39 below).

    Parameters
    ----------
    fpath : str
        Split directory name inside the language directory (e.g. 'PB_train').
    lang : str
        Language directory name.
    max_files : int, optional
        Upper bound on the number of files read (default 100, i.e. the
        training subset used by this notebook).
    base_dir : str, optional
        Dataset root, including the trailing separator.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_rows, 39); shape (0, 39) when no file matched.

    Raises
    ------
    ValueError
        If a field is not numeric or a file does not have 39 columns.
    """
    path = base_dir + lang + '/' + fpath
    # glob order is OS-dependent; sort so the chosen subset is reproducible.
    all_files = sorted(glob.glob(path + "/*.csv"))

    # The empty (0, 39) seed keeps the return shape stable when nothing is
    # read and makes concatenate reject files with a different column count.
    chunks = [np.empty([0, 39])]
    for filename in all_files[:max_files]:
        # Plain open() with a context manager: the old 'rU' mode flag is
        # rejected by Python 3.11+, and the handle was never closed.
        with open(filename, newline='', encoding='utf-16') as handle:
            rows = [row for row in csv.reader(handle) if row]
        if rows:  # an empty file contributes nothing instead of crashing
            # Parse to float here rather than mixing floats and strings
            # inside np.concatenate.
            chunks.append(np.array(rows, dtype=float))

    # One concatenate at the end instead of re-copying the array per file.
    return np.concatenate(chunks, axis=0)

In [None]:
def get_ubm_features(fpath, langs=None, max_files=100,
                     base_dir=r'/content/drive/MyDrive/PRA2_data/extracted/'):
    """Pool the feature rows of every language into one array for UBM training.

    For each language the UTF-16 ``*.csv`` files in
    ``<base_dir><lang>/<fpath>`` are read (at most ``max_files`` per
    language), the per-language shape is printed, and all languages are
    stacked into a single array.

    The previous version re-created its accumulator inside the language loop,
    so only the last language's rows were returned and the background model
    was fitted on a single language.

    Parameters
    ----------
    fpath : str
        Split directory name inside each language directory.
    langs : sequence of str, optional
        Languages to pool; defaults to the module-level ``lang_list``.
    max_files : int, optional
        Upper bound on the number of files read per language (default 100).
    base_dir : str, optional
        Dataset root, including the trailing separator.

    Returns
    -------
    numpy.ndarray
        Float array of shape (total_rows, 39) covering all languages, in the
        order of ``langs``.

    Raises
    ------
    ValueError
        If a field is not numeric or a file does not have 39 columns.
    """
    if langs is None:
        langs = lang_list

    # The (0, 39) seed keeps the result well-shaped even with no data.
    pooled = [np.empty([0, 39])]
    for lang in langs:
        path = base_dir + lang + '/' + fpath
        # Sorted so the per-language subset is the same on every run.
        all_files = sorted(glob.glob(path + "/*.csv"))

        lang_chunks = [np.empty([0, 39])]
        for filename in all_files[:max_files]:
            # open() + with: 'rU' is rejected by Python 3.11+ and the handle
            # used to be leaked.
            with open(filename, newline='', encoding='utf-16') as handle:
                rows = [row for row in csv.reader(handle) if row]
            if rows:
                lang_chunks.append(np.array(rows, dtype=float))

        lang_features = np.concatenate(lang_chunks, axis=0)
        print("Training Feature Collection for " + lang+ ": " ,lang_features.shape)
        pooled.append(lang_features)

    return np.concatenate(pooled, axis=0)



In [None]:
# get_ubm_features('PB_train')

# MAP Adaptation of Means

In [None]:

def map_adapt(ubm, X, max_iter=100, r=0.7):
    """Return a copy of ``ubm`` whose component means are MAP-adapted to ``X``.

    Every iteration computes the component responsibilities of ``X`` with
    ``predict_proba`` and replaces each mean by

        (sum_i resp_ik * x_i + r * mean_k) / (n_k + r),   n_k = sum_i resp_ik

    which is the usual ``alpha * X_tilde + (1 - alpha) * mean`` update with
    ``alpha = n / (n + r)`` written without the intermediate ``1 / n``.
    That intermediate made the mean of any component with zero total
    responsibility NaN; in this form such a component keeps its mean.

    Only ``means_`` is changed, and only on a deep copy -- ``ubm`` itself is
    not modified.

    Parameters
    ----------
    ubm : object
        Fitted mixture model exposing ``predict_proba(X)`` and ``means_``
        (a scikit-learn ``GaussianMixture`` in this notebook).
    X : array-like, shape (n_samples, n_features)
        Adaptation data.
    max_iter : int, optional
        Number of adaptation passes.
    r : float, optional
        Relevance factor; larger values keep the means closer to the prior.

    Returns
    -------
    object
        The adapted deep copy of ``ubm``.
    """
    gmm = copy.deepcopy(ubm)
    X = np.asarray(X, dtype=float)

    for _ in range(max_iter):
        # Responsibilities are computed once per pass (they were evaluated
        # twice before).
        resp = gmm.predict_proba(X)              # (N, K)
        n = resp.sum(axis=0).reshape(-1, 1)      # (K, 1)
        weighted_sum = resp.T.dot(X)             # (K, F)

        denom = n + r                            # (K, 1)
        usable = denom > 0                       # False only if n == 0 and r == 0
        updated = (weighted_sum + r * gmm.means_) / np.where(usable, denom, 1.0)
        gmm.means_ = np.where(usable, updated, gmm.means_)

    return gmm

In [None]:
def get_models(ubm):
    """Build one adapted model per entry of ``lang_list``.

    For each language the 'PB_train' features are loaded with
    ``get_feature``, converted to a float array and passed to ``map_adapt``
    together with ``ubm``.  A progress line is printed per language.

    Returns the adapted models as a list in ``lang_list`` order.
    """
    adapted = []
    for language in lang_list:
        train_frames = np.array(get_feature('PB_train', language), dtype=float)
        # Derive this language's model from the shared background model.
        language_gmm = map_adapt(ubm, train_frames)
        print("Model Adapted for ",language)
        adapted.append(language_gmm)
    return adapted

## Training Model

In [None]:

# Features used to fit the background model, as returned by get_ubm_features.
features_ubm = np.array(get_ubm_features('PB_train'), dtype=float)

# 256-component diagonal-covariance GMM, k-means initialised; fit() returns
# the estimator itself, so construction and fitting are chained.
ubm = GaussianMixture(
    n_components=256,
    covariance_type='diag',
    max_iter=100,
    init_params='kmeans',
).fit(features_ubm)

# One adapted model per language, derived from the fitted background model.
models = get_models(ubm)

Training Feature Collection for asm:  (11701, 39)
Training Feature Collection for ben:  (23882, 39)
Training Feature Collection for eng:  (35013, 39)
Training Feature Collection for guj:  (25756, 39)
Training Feature Collection for hin:  (23973, 39)
Training Feature Collection for kan:  (21907, 39)
Training Feature Collection for mal:  (19668, 39)
Training Feature Collection for mar:  (33091, 39)
Training Feature Collection for odi:  (22159, 39)
Training Feature Collection for pun:  (37659, 39)
Training Feature Collection for tam:  (32626, 39)
Training Feature Collection for tel:  (24419, 39)
Model Adapted for  asm
Model Adapted for  ben
Model Adapted for  eng
Model Adapted for  guj
Model Adapted for  hin
Model Adapted for  kan
Model Adapted for  mal
Model Adapted for  mar
Model Adapted for  odi
Model Adapted for  pun
Model Adapted for  tam
Model Adapted for  tel


In [None]:
# Show the list of per-language models returned by get_models.
print(models)

[GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256), GaussianMixture(covariance_type='diag', n_components=256)]


## Testing Model

In [None]:

def testmodel(fpath, lang, max_files=100,
              base_dir=r'/content/drive/MyDrive/PRA2_data/extracted/',
              model_list=None, langs=None):
    """Classify the test files of one language and tally the predictions.

    Each UTF-16 ``*.csv`` file in ``<base_dir><lang>/<fpath>`` is scored
    against every model: the per-row ``score_samples`` values are summed and
    the model with the largest total gives the predicted language.

    Parameters
    ----------
    fpath : str
        Split directory name inside the language directory (e.g. 'PB_test').
    lang : str
        True language of the files; also the directory name.
    max_files : int, optional
        Upper bound on the number of files considered (default 100).
    base_dir : str, optional
        Dataset root, including the trailing separator.
    model_list : sequence, optional
        Models exposing ``score_samples``; defaults to the module-level
        ``models``.
    langs : sequence of str, optional
        Language label of each model, index-aligned with ``model_list``;
        defaults to the module-level ``lang_list``.

    Returns
    -------
    tuple
        ``(pred_correct, count, lang_pred)``: number of correctly classified
        files, number of files classified, and a list with the number of
        files assigned to each language.
    """
    if model_list is None:
        model_list = models
    if langs is None:
        langs = lang_list

    path = base_dir + lang + '/' + fpath
    # Sorted so the evaluated subset is the same on every run.
    all_files = sorted(glob.glob(path + "/*.csv"))

    pred_correct = 0
    lang_pred = [0] * len(langs)
    count = 0
    for filename in all_files[:max_files]:
        # open() + with: the 'rU' mode flag is rejected by Python 3.11+ and
        # the handle used to be leaked.
        with open(filename, newline='', encoding='utf-16') as handle:
            rows = [row for row in csv.reader(handle) if row]
        if not rows:
            # Nothing to score in an empty file; do not count it as a sample.
            continue
        mfcc_test_feature = np.array(rows, dtype=float)
        count += 1

        # Total log-likelihood of the file under each language model.
        log_likelihood = np.array(
            [np.sum(gmm.score_samples(mfcc_test_feature)) for gmm in model_list]
        )
        l_index = int(np.argmax(log_likelihood))
        lang_pred[l_index] += 1
        if langs[l_index] == lang:
            pred_correct += 1

    return pred_correct, count, lang_pred


# The name starts with "test": keep pytest-style collectors from treating
# this helper as a test case.
testmodel.__test__ = False

## Result for Prasar Bharti Data


In [None]:
# Run the classifier on each language's 'PB_test' split.  result_pb collects
# [language, correct, total] rows for the summary table; confusion_mat_pb
# collects the per-language prediction counts returned by testmodel.
result_pb = []
confusion_mat_pb = []
for lang_code in lang_list:
    n_correct, n_total, lang_pred = testmodel('PB_test', lang_code)
    confusion_mat_pb.append(lang_pred)
    result_pb.append([lang_code, n_correct, n_total])

print("System 2 GMM_UBM Prasar Bharti Sample test data:")  
head = ["Language","Predicted Correctly","Total_Samples"]
print(tabulate(result_pb, headers=head, tablefmt="grid"))
# confusion_mat_pb
# print(result_pb)

System 2 GMM_UBM Prasar Bharti Sample test data:
+------------+-----------------------+-----------------+
| Language   |   Predicted Correctly |   Total_Samples |
| asm        |                    75 |             100 |
+------------+-----------------------+-----------------+
| ben        |                    97 |             100 |
+------------+-----------------------+-----------------+
| eng        |                    88 |             100 |
+------------+-----------------------+-----------------+
| guj        |                    91 |             100 |
+------------+-----------------------+-----------------+
| hin        |                    75 |             100 |
+------------+-----------------------+-----------------+
| kan        |                    79 |             100 |
+------------+-----------------------+-----------------+
| mal        |                    77 |             100 |
+------------+-----------------------+-----------------+
| mar        |                    65 | 

 ## Accuracy and Confusion Matrix for Prasar Bharti Data

In [None]:
# Overall accuracy: correct predictions (column 1 of result_pb) as a
# percentage of all classified samples (column 2).
pred_correct = sum(row[1] for row in result_pb)
total_samples = sum(row[2] for row in result_pb)

acc_pb = (pred_correct * 100) / total_samples
print("Accuracy for Prasar Bharti Data Samples: ",acc_pb , " %")
print("******************************************************************")
print("Confusion Matrix for Prasar Bharti Data: ")
# Bare expression: shown as the cell's output when run in a notebook.
confusion_mat_pb


Accuracy for Prasar Bharti Data Samples:  80.0  %
******************************************************************
Confusion Matrix for Prasar Bharti Data: 


[[75, 1, 10, 1, 0, 0, 1, 1, 8, 0, 1, 2],
 [1, 97, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [2, 1, 88, 0, 3, 0, 2, 0, 2, 0, 1, 1],
 [1, 0, 0, 91, 0, 0, 0, 1, 0, 1, 3, 3],
 [0, 1, 10, 7, 75, 0, 2, 3, 0, 2, 0, 0],
 [2, 0, 5, 0, 0, 79, 0, 1, 3, 1, 3, 6],
 [3, 2, 1, 2, 1, 0, 77, 1, 10, 0, 2, 1],
 [21, 0, 0, 0, 14, 0, 0, 65, 0, 0, 0, 0],
 [3, 0, 2, 0, 3, 0, 0, 0, 91, 0, 0, 1],
 [0, 0, 0, 46, 0, 0, 0, 0, 0, 54, 0, 0],
 [1, 5, 0, 11, 0, 0, 0, 1, 0, 0, 82, 0],
 [0, 1, 3, 5, 2, 0, 0, 0, 2, 0, 1, 86]]

## Result for Youtube Data

In [None]:
# Run the classifier on each language's 'YT_test' split.  result_yt collects
# [language, correct, total] rows for the summary table; confusion_mat_yt
# collects the per-language prediction counts returned by testmodel.
result_yt = []
confusion_mat_yt = []
for lang_code in lang_list:
    n_correct, n_total, lang_pred = testmodel('YT_test', lang_code)
    confusion_mat_yt.append(lang_pred)
    result_yt.append([lang_code, n_correct, n_total])

print("System 2 GMM_UBM Youtube Sample test data:")
head = ["Language","Predicted Correctly","Total_Samples"]
print(tabulate(result_yt, headers=head, tablefmt="grid"))
# confusion_mat_yt
# print(result_yt)

System 2 GMM_UBM Youtube Sample test data:
+------------+-----------------------+-----------------+
| Language   |   Predicted Correctly |   Total_Samples |
| asm        |                    24 |             100 |
+------------+-----------------------+-----------------+
| ben        |                    10 |             100 |
+------------+-----------------------+-----------------+
| eng        |                    17 |             100 |
+------------+-----------------------+-----------------+
| guj        |                     1 |             100 |
+------------+-----------------------+-----------------+
| hin        |                     1 |             100 |
+------------+-----------------------+-----------------+
| kan        |                     5 |             100 |
+------------+-----------------------+-----------------+
| mal        |                     4 |             100 |
+------------+-----------------------+-----------------+
| mar        |                    25 | 

 ## Accuracy and Confusion Matrix for Youtube Data

In [None]:
# Overall accuracy: correct predictions (column 1 of result_yt) as a
# percentage of all classified samples (column 2).
pred_correct = sum(row[1] for row in result_yt)
total_samples = sum(row[2] for row in result_yt)

acc_yt = (pred_correct * 100) / total_samples
print("Accuracy for Youtube Data Samples: ",acc_yt , " %")
print("******************************************************************")
print("Confusion Matrix for Youtube Data: ")
# Bare expression: shown as the cell's output when run in a notebook.
confusion_mat_yt

Accuracy for Youtube Data Samples:  9.0  %
******************************************************************
Confusion Matrix for Youtube Data: 


[[24, 21, 12, 0, 0, 0, 4, 1, 35, 0, 0, 3],
 [8, 10, 18, 0, 0, 3, 28, 15, 8, 0, 0, 10],
 [1, 7, 17, 0, 0, 9, 27, 4, 13, 0, 20, 2],
 [0, 0, 9, 1, 0, 17, 54, 4, 0, 0, 8, 7],
 [0, 22, 0, 1, 1, 8, 34, 4, 19, 0, 2, 9],
 [13, 15, 13, 0, 0, 5, 14, 22, 12, 0, 5, 1],
 [6, 21, 0, 12, 3, 6, 4, 21, 7, 6, 10, 4],
 [18, 0, 2, 0, 0, 0, 16, 25, 12, 0, 0, 27],
 [22, 12, 16, 0, 0, 0, 5, 36, 9, 0, 0, 0],
 [4, 2, 0, 0, 0, 3, 4, 28, 58, 0, 1, 0],
 [7, 12, 3, 1, 24, 0, 5, 8, 25, 4, 11, 0],
 [1, 7, 0, 16, 2, 2, 22, 16, 18, 0, 15, 1]]