In [1]:
!pip install --user openai-whisper
!pip install --user pandas scikit-learn

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
     ---------------------------------------- 0.0/803.2 kB ? eta -:--:--
     -------------------------------------- 803.2/803.2 kB 5.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting more-itertools (from openai-whisper)
  Downloading more_itertools-10.8.0-py3-none-any.whl.metadata (39 kB)
Collecting numba (from openai-whisper)
  Downloading numba-0.63.0-cp312-cp312-win_amd64.whl.metadata (3.0 kB)
Collecting llvmlite<0.47,>=0.46.0dev0 (from numba->openai-whisper)
  Downloading llvmlite-0.46.0-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Downloading more_itertools-10.8.0-py3-none-any.whl (69 kB)
Downloadi


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import numpy as np
import whisper
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Notebook-wide settings: the inference device and the dataset root folder
# (DATA_DIR is later scanned for one sub-folder per intent label).
device, DATA_DIR = "cpu", "data"

print("Using device:", device)
print("Data dir:", DATA_DIR)

# Load the "tiny" Whisper checkpoint onto the configured device.
asr_model = whisper.load_model(name="tiny", device=device)
print("Whisper model loaded.")

Using device: cpu
Data dir: data


100%|█████████████████████████████████████| 72.1M/72.1M [00:04<00:00, 15.7MiB/s]


Whisper model loaded.


In [4]:
def transcribe_file(path, model, device="cpu", language="en"):
    """Transcribe one audio input and return the stripped transcript text.

    Args:
        path: Audio input, forwarded unchanged to ``model.transcribe``.
        model: Object exposing ``transcribe(audio, **options)`` that returns a
            mapping with a ``"text"`` entry (e.g. a loaded Whisper model).
        device: Device the model runs on. Half precision is requested only
            when this names a CUDA device; for ``"cpu"`` (the default)
            ``fp16=False`` is passed, exactly as before.
        language: Language code handed to the decoder. Defaults to ``"en"``.

    Returns:
        The transcript with surrounding whitespace removed, or ``""`` when the
        result carries no text.
    """
    # Previously `device` was accepted but ignored and fp16 was always off.
    use_fp16 = str(device).lower().startswith("cuda")
    result = model.transcribe(path, fp16=use_fp16, language=language)
    # `or ""` also covers a "text" entry that is present but None.
    return (result.get("text") or "").strip()

In [7]:
import os
import shutil

# Whisper's audio loading depends on the ffmpeg executable being discoverable
# on PATH. The folder can be overridden per machine through the FFMPEG_BIN
# environment variable; the fallback is the original local install location.
ffmpeg_bin = os.environ.get("FFMPEG_BIN", r"C:\Users\pranav.vetkar\ffmpeg\bin")

# Prepend only when the folder exists and is not already listed, so re-running
# this cell does not keep growing PATH and machines without that folder are
# left untouched.
current_path = os.environ.get("PATH", "")
if os.path.isdir(ffmpeg_bin) and ffmpeg_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = ffmpeg_bin + os.pathsep + current_path

ffmpeg_path = shutil.which("ffmpeg")
print("ffmpeg path now:", ffmpeg_path)
if ffmpeg_path is None:
    print("WARNING: ffmpeg was not found on PATH; set FFMPEG_BIN to its folder.")

ffmpeg path now: C:\Users\pranav.vetkar\ffmpeg\bin\ffmpeg.EXE


In [8]:
# Intent labels double as the sub-folder names expected under DATA_DIR.
valid_labels = ["fraud", "loan issue", "general query"]
audio_extensions = (".mp3", ".wav", ".m4a", ".flac", ".ogg")

records = []

# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order, and the row order of `df` determines which rows the seeded
# train/validation split selects.
for label in sorted(os.listdir(DATA_DIR)):
    folder = os.path.join(DATA_DIR, label)
    if not os.path.isdir(folder):
        continue

    if label not in valid_labels:
        print("Skipping unknown folder:", label)
        continue

    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(audio_extensions):
            continue

        fpath = os.path.join(folder, fname)
        print(f"Transcribing [{label}] {fpath} ...")

        if not os.path.exists(fpath):
            print("  PATH ERROR: file does not exist on disk!", fpath)
            continue

        # Best effort: one unreadable clip should not abort the whole run.
        try:
            text = transcribe_file(fpath, asr_model, device=device)
        except Exception as e:
            print("  ERROR while transcribing:", repr(e))
            continue

        if not text.strip():
            print("  Empty transcript, skipping.")
            continue

        records.append({
            "file": fpath,
            "intent": label,
            "text": text
        })

# Explicit columns keep the frame's schema intact even when nothing was
# transcribed, so later `df["text"]` / `df["intent"]` lookups do not raise
# KeyError on an empty result.
df = pd.DataFrame(records, columns=["file", "intent", "text"])
print("\nTotal samples:", len(df))
df.head()

Transcribing [fraud] data\fraud\fraud1.mp3 ...
Transcribing [fraud] data\fraud\fraud2.mp3 ...
Transcribing [fraud] data\fraud\fraud3.mp3 ...
Transcribing [fraud] data\fraud\fraud4.mp3 ...
Transcribing [fraud] data\fraud\fraud5.mp3 ...
Transcribing [fraud] data\fraud\fraud6.mp3 ...
Transcribing [fraud] data\fraud\fraud7.mp3 ...
Transcribing [general query] data\general query\genqry1.mp3 ...
Transcribing [general query] data\general query\genqry10.mp3 ...
Transcribing [general query] data\general query\genqry2.mp3 ...
Transcribing [general query] data\general query\genqry3.mp3 ...
Transcribing [general query] data\general query\genqry4.mp3 ...
Transcribing [general query] data\general query\genqry5.mp3 ...
Transcribing [general query] data\general query\genqry6.mp3 ...
Transcribing [general query] data\general query\genqry7.mp3 ...
Transcribing [general query] data\general query\genqry8.mp3 ...
Transcribing [general query] data\general query\genqry9.mp3 ...
Transcribing [loan issue] data

Unnamed: 0,file,intent,text
0,data\fraud\fraud1.mp3,fraud,I did not authorize this large international p...
1,data\fraud\fraud2.mp3,fraud,I noticed a suspicious log in attempt on my mo...
2,data\fraud\fraud3.mp3,fraud,I was scammed and descending a transfer after ...
3,data\fraud\fraud4.mp3,fraud,My entire savings account balance was withdraw...
4,data\fraud\fraud5.mp3,fraud,I received a phone call asking for my pin and ...


In [9]:
# Features are the raw transcripts; targets are the intent labels.
X, y = df["text"].values, df["intent"].values

# Stratified 80/20 hold-out split with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("Train size:", len(X_train))
print("Val size:", len(X_val))

print("\nTrain label distribution:")
train_label_counts = pd.Series(y_train).value_counts()
print(train_label_counts)

Train size: 20
Val size: 5

Train label distribution:
general query    8
loan issue       6
fraud            6
Name: count, dtype: int64


In [10]:
# Text classifier: TF-IDF features feeding a logistic-regression model.
clf = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=1,            # keep terms seen in even a single transcript
        max_df=0.9           # drop terms present in more than 90% of transcripts
    )),
    # `multi_class` is deliberately not passed: "auto" was already the default,
    # and the parameter is deprecated as of scikit-learn 1.5, so naming it only
    # produces a FutureWarning (and an error once it is removed).
    ("logreg", LogisticRegression(
        max_iter=1000
    ))
])

clf.fit(X_train, y_train)
print("Intent classifier trained on transcripts.")

Intent classifier trained on transcripts.




In [11]:
y_pred = clf.predict(X_val)

# One label order shared by the report and the matrix. Without an explicit
# `labels=` the report only lists classes that occur in y_val / y_pred, so on a
# small validation set it could silently omit a class the matrix still shows.
labels_order = sorted(df["intent"].unique())

print("Classification report:")
# zero_division=0 reports 0.0 (without a warning) for a class that has no
# predicted or no true samples.
print(classification_report(y_val, y_pred, labels=labels_order, zero_division=0))

print("Confusion matrix (rows=true, cols=pred):")
print(labels_order)
print(confusion_matrix(y_val, y_pred, labels=labels_order))

Classification report:
               precision    recall  f1-score   support

        fraud       0.00      0.00      0.00         1
general query       0.67      1.00      0.80         2
   loan issue       1.00      0.50      0.67         2

     accuracy                           0.60         5
    macro avg       0.56      0.50      0.49         5
 weighted avg       0.67      0.60      0.59         5

Confusion matrix (rows=true, cols=pred):
['fraud', 'general query', 'loan issue']
[[0 1 0]
 [0 2 0]
 [1 0 1]]


In [13]:
def predict_intent_from_audio(path, asr_model, clf_pipeline, device="cpu"):
    """Transcribe an audio file and classify the transcript's intent.

    Args:
        path: Audio file handed to ``transcribe_file``.
        asr_model: Speech-recognition model used for the transcription.
        clf_pipeline: Fitted classifier exposing ``predict``,
            ``predict_proba`` and ``classes_``.
        device: Forwarded to ``transcribe_file``.

    Returns:
        ``(text, intent, prob_dict)`` where ``prob_dict`` maps every entry of
        ``clf_pipeline.classes_`` to its predicted probability as a float.

    Raises:
        ValueError: If the transcript is empty or whitespace only.
    """
    transcript = transcribe_file(path, asr_model, device=device)
    if not transcript.strip():
        raise ValueError("Empty transcript from ASR.")

    # The classifier works on batches, so wrap the single transcript in a list.
    predicted_label = clf_pipeline.predict([transcript])[0]
    class_scores = clf_pipeline.predict_proba([transcript])[0]
    score_by_label = dict(zip(clf_pipeline.classes_, map(float, class_scores)))

    return transcript, predicted_label, score_by_label

In [14]:
# End-to-end sanity check (audio -> transcript -> intent) on one labelled file.
# NOTE: the file is drawn from `df`, the same frame the train/validation split
# was taken from, so this is a smoke test and not an accuracy estimate.
# random_state pins the choice so re-running the notebook shows the same file.
test_row = df.sample(1, random_state=42).iloc[0]
test_path = test_row["file"]

print("Testing on file:", test_path)

# Pass `device` explicitly, matching the test-set loop further down.
text, intent, probs = predict_intent_from_audio(
    test_path, asr_model, clf, device=device
)

print("\n=== TRANSCRIPT ===")
print(text)

print("\n=== PREDICTED INTENT ===")
print(intent)

print("\n=== CLASS PROBABILITIES ===")
for k, v in probs.items():
    print(f"{k:15s}: {v:.3f}")

Testing on file: data\fraud\fraud1.mp3

=== TRANSCRIPT ===
I did not authorize this large international payment that just posted to my checking account.

=== PREDICTED INTENT ===
fraud

=== CLASS PROBABILITIES ===
fraud          : 0.474
general query  : 0.311
loan issue     : 0.215


In [15]:
TEST_DIR = "data/test"

# Collect the audio clips in TEST_DIR. The listing is sorted because
# os.listdir() returns names in arbitrary order; sorting keeps the order of
# `test_files` (and of everything printed from it) stable across runs.
test_files = [
    os.path.join(TEST_DIR, f)
    for f in sorted(os.listdir(TEST_DIR))
    if f.lower().endswith((".mp3", ".wav", ".m4a", ".ogg", ".flac"))
]

print("Found test files:", len(test_files))
for f in test_files:
    print(" -", f)

Found test files: 5
 - data/test\test1.mp3
 - data/test\test2.mp3
 - data/test\test3.mp3
 - data/test\test4.mp3
 - data/test\test5.mp3


In [16]:
# Run the audio -> intent pipeline over every clip in `test_files` and print
# the transcript, the predicted label and the per-class probabilities.
print("\n========== TEST SET EVALUATION ==========\n")

for fpath in test_files:
    print("\n-----------------------------------------")
    print("File:", fpath)

    try:
        text, intent, probs = predict_intent_from_audio(
            fpath, asr_model, clf, device=device
        )
    except Exception as e:
        # Report the failure and carry on with the next clip.
        print("ERROR:", e)
    else:
        print("\nTranscript:")
        print(text)

        print("\nPredicted Intent:", intent)

        print("\nClass Probabilities:")
        for lbl, p in probs.items():
            print(f"  {lbl:15s} : {p:.3f}")




-----------------------------------------
File: data/test\test1.mp3

Transcript:
I see a transaction for a subscription service I canceled last month.

Predicted Intent: general query

Class Probabilities:
  fraud           : 0.284
  general query   : 0.409
  loan issue      : 0.307

-----------------------------------------
File: data/test\test2.mp3

Transcript:
My business loan requires a collateral valuation, but the bank's assigned firm is non-responsive.

Predicted Intent: loan issue

Class Probabilities:
  fraud           : 0.241
  general query   : 0.340
  loan issue      : 0.419

-----------------------------------------
File: data/test\test3.mp3

Transcript:
Someone used my account information to purchase gift cards online without my permission.

Predicted Intent: general query

Class Probabilities:
  fraud           : 0.335
  general query   : 0.415
  loan issue      : 0.250

-----------------------------------------
File: data/test\test4.mp3

Transcript:
Could you send me