# Loading Models 

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torchaudio
from sklearn.model_selection import train_test_split
import os
import sys

## INFO ABOUT DATASET  
* ``` source of the data ```

In [2]:
info_df = pd.read_csv('data/DementiaNet.csv' , header = 'infer')
info_df

Unnamed: 0,name,dementia type,birth,death,first symptoms,URLs after symptoms,5 years,5 < 10 years,10 < 15 years,gender,ethnicity,datasplit,language,unknown 1,unkown 2,unknown 3
0,Abe Burrows,Alzheimer,1910,1985,1975.0,,https://www.youtube.com/watch?v=VezbsmCriw4,,,male,Caucasian/White,train,,,,
1,Aileen Hernandez,Dementia,1926,2017,2012.0,https://youtu.be/x7hujcEhQuY,https://youtu.be/CshhDl-YwkY \nhttps://youtu.b...,,,female,Black/African American,train,,,,
2,Alan Ramsey,Dementia,1938,2020,2015.0,,https://www.youtube.com/watch?v=CHeXE4c6EDI,,,male,Caucasian/White,train,,,,
3,Allan Burns,Lewy body,1935,2021,,,https://www.youtube.com/watch?v=aD3hL-kWoPc,,,male,Caucasian/White,train,,,,
4,Andrew Sachs,Dementia,1930,2016,2012.0,,,https://youtu.be/FSgKLooW1LM,https://youtu.be/3V1iFmavqG4,male,,train,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Trevor Peacock,Alzheimer,1931,2021,2016.0,,https://www.youtube.com/watch?v=ktgeU7TkltA,,,male,Caucasian/White,train,,,,
80,Unita Blackwell,Dementia,1933,2019,2008.0,,,,https://youtu.be/gkQCvBBYkfY,female,Black/African American,train,,,,
81,Vampiro,Alzheimer,1967,present,2019.0,https://www.youtube.com/watch?v=Z_Mwg_Tw0rQ,"https://www.youtube.com/watch?v=PnjSYL6tihs ,h...",,,male,Caucasian/White,test,,,,
82,Viv Nicholson,Dementia,1936,2015,2010.0,,https://www.youtube.com/watch?v=254FRMcTHyU,,,female,Caucasian/White,valid,,,,


In [3]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 84 non-null     object 
 1   dementia type        84 non-null     object 
 2   birth                84 non-null     int64  
 3   death                82 non-null     object 
 4   first symptoms       79 non-null     float64
 5   URLs after symptoms  29 non-null     object 
 6   5 years              39 non-null     object 
 7   5 < 10 years         36 non-null     object 
 8   10 < 15 years        26 non-null     object 
 9   gender               83 non-null     object 
 10  ethnicity            83 non-null     object 
 11  datasplit            84 non-null     object 
 12  language             0 non-null      float64
 13  unknown 1            2 non-null      object 
 14  unkown 2             0 non-null      float64
 15  unknown 3            0 non-null      float

###  Collected dataset stred with their path 

In [4]:
train_df = pd.read_csv('data/train_dm.csv' , header='infer')
train_df.head(5)

Unnamed: 0,file,label,path
0,daningram_15,dementia,data/dementia/Dan Ingram/daningram_15.wav
1,terryjones_5,dementia,data/dementia/Terry Jones/terryjones_5.wav
2,maureenforrester_5,dementia,data/dementia/Maureen Forrester/maureenforrest...
3,aileenhernandez_0,dementia,data/dementia/Aileen Hernandez/aileenhernandez...
4,aileenhernandez_5_1,dementia,data/dementia/Aileen Hernandez/aileenhernandez...


In [5]:
valid_df = pd.read_csv('data/valid_dm.csv' , header='infer')
valid_df.head(5)

Unnamed: 0,file,label,path
0,JimmyCalderwood_5,dementia,data/dementia/Jimmy Calderwood/JimmyCalderwood...
1,vivnicholson_5,dementia,data/dementia/Viv Nicholson/vivnicholson_5.wav
2,IanHolm_2,dementia,data/dementia/Ian Holm/IanHolm_2.wav
3,CharmianCarr_15,dementia,data/dementia/Charmian Carr/CharmianCarr_15.wav
4,CharmianCarr_5,dementia,data/dementia/Charmian Carr/CharmianCarr_5.wav


# Class distriburion of the dataset 
*  class1 : ```dementia...```
*  class0 : ```no-dementia```

In [6]:
train_df.groupby('label').count()[['path']]

Unnamed: 0_level_0,path
label,Unnamed: 1_level_1
dementia,106
nodementia,121


In [7]:
valid_df.groupby('label').count()[['path']]

Unnamed: 0_level_0,path
label,Unnamed: 1_level_1
dementia,20
nodementia,28


# USING DATASET

In [8]:

file_path = train_df['path'][0]
# the csv is used to extract info and attach label to file
if os.path.isfile(file_path):
    file_info = os.stat(file_path)
    print(f"Size: {file_info.st_size} bytes")
    print(f"Last modified: {file_info.st_mtime}")
    print(f"Last accessed: {file_info.st_atime}")
    print(f"Created: {file_info.st_ctime}")
    print(f"Mode: {file_info.st_mode}")
    print(f"Owner User ID: {file_info.st_uid}")
    print(f"Owner Group ID: {file_info.st_gid}")
    file_name = os.path.basename(file_path)
    print(file_name)  # Output: file.txt

Size: 8114478 bytes
Last modified: 1725041597.110077
Last accessed: 1725048187.5094182
Created: 315513000.0
Mode: 33206
Owner User ID: 0
Owner Group ID: 0
daningram_15.wav


## Py-Torch text extraction from audio 


1. **Loading the Model and Tokenizer**:
   - We load the pre-trained Wav2Vec 2.0 model (`Wav2Vec2ForCTC`) and its corresponding tokenizer (`Wav2Vec2Tokenizer`) from the Hugging Face `transformers` library.
   - The model used in this example is `facebook/wav2vec2-base-960h`, which is trained on 960 hours of English speech.

2. **Loading and Resampling Audio**:
   - We use `torchaudio` to load the audio file. The audio is read into the `speech` variable, and its sample rate is stored in `sample_rate`.
   - If the sample rate of the audio is not 16 kHz, we resample it using `torchaudio.transforms.Resample` to match the model's requirement.

3. **Tokenization**:
   - The audio waveform is tokenized into input values using the `tokenizer`. These input values are in a format that the model can process.
   - The `squeeze()` function is used to remove any extra dimensions from the tensor, and `return_tensors="pt"` ensures that the input is returned as a PyTorch tensor.

4. **Inference**:
   - The input values are passed through the model using `model(input_values)`. The model returns logits, which are raw, unnormalized scores for each possible output class (in this case, each character in the transcription).
   - We use `torch.no_grad()` to disable gradient calculations, which reduces memory usage and speeds up inference since we are not training the model.

5. **Decoding**:
   - The logits are converted into predicted IDs by finding the indices of the highest probability values using `torch.argmax(logits, dim=-1)`.
   - Finally, the predicted IDs are decoded into the transcription (text) using the `tokenizer.decode()` function.

6. **Output**:
   - The transcription is printed out, which is the text representation of the spoken words in the audio file.

In [9]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
torchaudio.set_audio_backend("sox_io")

  torchaudio.set_audio_backend("sox_io")


### loading facebook pre trained models 

In [10]:
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [11]:
print(train_df['path'][0])

data/dementia/Dan Ingram/daningram_15.wav


In [12]:
speech, sample_rate = torchaudio.load(train_df['path'][0])

if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    speech = resampler(speech)

input_values = tokenizer(speech.squeeze().numpy(), return_tensors="pt").input_values

with torch.no_grad():
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.decode(predicted_ids[0])

print(transcription)

LONG BEFORE THE BLACK YET BLACKOUT BLACKOUT WAS SUDDENLY EVERYTHIN INSTORED INTO SLOW DRIN THOSE DAYS THAT TAKE MACHINES WERE HOOKED UP TO SIXTY CYCLE CURRENT AND WHEN THE CURRENT STARTED GOING DOWN THEY STARTE ON DOWN TRUE FIVE TWENTY SEVEN IN THE MIDDLE OF HE NEWSCAST POO AND WERE GONE THAT WAS IT THAT HE GAVE ME AN ARMLOAD OF ELL PEES AND SAID GET IN THE CAR BECAUSE THE ONLY PLACE IN THE YORK CITY THAT HAD POWER WAS LOADI NEW JERSEY WHERE WE HAD OUR TRANSMITTE RIGHT SO WE WERE ON HERE AND I WAS ON ERTO FOUR THIRTY IN THE MORNING DOING MY FIRST TALKSHILL I INVENTED TALKED RADIO AND OTOS TI JUNLY DI HAVE A TURN TAVY I GOT OT ARM LOAD OF RECORDS AND NO TURN TO AND I WAS BROUGHT GASZING OUT OF THE TRAN


In [13]:
def transcribe_audio(path):
   speech, sample_rate = torchaudio.load(path)
   if sample_rate != 16000:
      resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
      speech = resampler(speech)
   input_values = tokenizer(speech.squeeze().numpy(), return_tensors="pt").input_values
   with torch.no_grad():
      logits = model(input_values).logits
   predicted_ids = torch.argmax(logits, dim=-1)
   transcription = tokenizer.decode(predicted_ids[0])
   return transcription

In [14]:
transcribe_audio(train_df['path'][0])

'LONG BEFORE THE BLACK YET BLACKOUT BLACKOUT WAS SUDDENLY EVERYTHIN INSTORED INTO SLOW DRIN THOSE DAYS THAT TAKE MACHINES WERE HOOKED UP TO SIXTY CYCLE CURRENT AND WHEN THE CURRENT STARTED GOING DOWN THEY STARTE ON DOWN TRUE FIVE TWENTY SEVEN IN THE MIDDLE OF HE NEWSCAST POO AND WERE GONE THAT WAS IT THAT HE GAVE ME AN ARMLOAD OF ELL PEES AND SAID GET IN THE CAR BECAUSE THE ONLY PLACE IN THE YORK CITY THAT HAD POWER WAS LOADI NEW JERSEY WHERE WE HAD OUR TRANSMITTE RIGHT SO WE WERE ON HERE AND I WAS ON ERTO FOUR THIRTY IN THE MORNING DOING MY FIRST TALKSHILL I INVENTED TALKED RADIO AND OTOS TI JUNLY DI HAVE A TURN TAVY I GOT OT ARM LOAD OF RECORDS AND NO TURN TO AND I WAS BROUGHT GASZING OUT OF THE TRAN'

In [15]:
# # making csv data of trancritpt and the class dementia 

# train_df['transcription'] = train_df['path'].apply(lambda x: transcribe_audio(x))
# valid_df['transcription'] = valid_df['path'].apply(lambda x: transcribe_audio(x))

# train_df.to_csv('data/train_dm_transcription.csv', index=False)
# valid_df.to_csv('data/valid_dm_transcription.csv', index=False)

### NOTE : due to time intensive work using assembli ai api for conversion 


In [19]:
import assemblyai as aai

# Replace with your API key
aai.settings.api_key = "687063c7417345c4b8de68a676b60714"
transcriber = aai.Transcriber()

In [22]:
def transcribe_audio(FILE_URL):
   transcript = transcriber.transcribe(FILE_URL)
   return transcript.text

In [23]:
transcribe_audio(train_df['path'][0])

"Long before the blackout. All right, yeah. Blackout. Blackout was suddenly everything started to slow down, because in those days, the tape machines were hooked up to 60 cycle current. And when the current started going down, they started going down. True. And at 527, in the middle of a newscast, poof. And we're gone. That was it. They gave me an armload of LP's and said, get in the car. Because the only place in New York City that had power was Lodi, New Jersey, where we had our transmitter, right? So we were on the air, and I was on the air till 430 in the morning doing my first talk show. I invented talk radio and nothing else to do. We didn't have a turntable. I got an armload of records and no turntable, and I was broadcasting out of the transde."

## Meathedology for Audio and Text binary classification <br>
<table border="1">
  <thead>
    <tr>
      <th>Classification</th>
      <th>Extension</th>
      <th>Representation</th>
      <th>Model</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Text</td>
      <td>.txt</td>
      <td>Transcript</td>
      <td>Sequential</td>
    </tr>
    <tr>
      <td>Audio</td>
      <td>.wav</td>
      <td>Dense Vector</td>
      <td>Transformer</td>
    </tr>
  </tbody>
</table>
