In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pathlib import Path

# Project-level folders, given relative to the notebook's working directory.
data_dir, out_dir = Path("../../dataset/"), Path("../../output/")

# SEP-28k inputs: the CSV of per-clip labels and the folder holding the clips.
sep_labels = "../input/sep28k/SEP-28k_labels.csv"
clips_dir = "../input/sep28k/clips/stuttering-clips/clips"

In [None]:
import pandas as pd

# Load the label table from the CSV file that `sep_labels` points to.
df_labels = pd.read_csv(sep_labels)

In [None]:
# Preview the first ten rows of the raw label table.
df_labels.head(10)

In [None]:
# One boolean per column: True when that column has at least one missing value.
df_labels.isnull().any()

In [None]:
# Summary statistics of the label table.
df_labels.describe()

In [None]:
# Build a per-row clip identifier by joining the first three label columns with
# underscores; missing components are skipped rather than rendered as "nan".
def _join_present(parts):
    return "_".join(parts.dropna().astype(str))


id_columns = df_labels.columns[0:3]
df_labels["filename"] = df_labels[id_columns].apply(_join_present, axis=1)

In [None]:
# Preview the label table again, now including the derived "filename" column.
df_labels.head(10)

In [None]:
# Order the label rows by clip identifier, then show the first ten.
df_labels = df_labels.sort_values("filename")
df_labels.head(n=10)

In [None]:
# File names of clips that must be skipped during feature extraction.
audio_ignore_list = list()

In [None]:
import os
from tqdm.notebook import tqdm as tqdm_notebook

from pathlib import Path


# Skip clips that carry no audio. 44 bytes is the size of a canonical PCM WAV
# header, so a file of exactly that size presumably holds no samples.
WAV_HEADER_ONLY_BYTES = 44

empty_clip_names = [
    f_name
    for f_name in os.listdir(clips_dir)
    if os.stat(Path(clips_dir) / f_name).st_size == WAV_HEADER_ONLY_BYTES
]
audio_ignore_list.extend(empty_clip_names)

# Remove the matching label rows in one pass with a set lookup instead of
# rebuilding the whole frame once per empty clip. Path.stem strips the
# extension without assuming it is exactly four characters long.
empty_clip_stems = {Path(f_name).stem for f_name in empty_clip_names}
df_labels = df_labels[~df_labels["filename"].isin(empty_clip_stems)]

In [None]:
# Exclude every clip whose file name contains "FluencyBank" from feature extraction.
audio_ignore_list.extend(
    clip_name
    for clip_name in tqdm_notebook(os.listdir(clips_dir))
    if "FluencyBank" in clip_name
)

In [None]:
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is the supported
# notebook progress bar and keeps the same local name.
from tqdm.notebook import tqdm as tqdm_notebook
import librosa
import numpy as np

# Per-clip feature vectors, keyed by clip file name.
mfcc_features = {}

# Set membership makes the skip test O(1) per clip; testing against the list
# would rescan the whole ignore list for every file in the directory.
ignored_clips = set(audio_ignore_list)

for f_name in tqdm_notebook(os.listdir(clips_dir)):
    if f_name in ignored_clips:
        continue

    # sr=None asks librosa to keep the clip's native sampling rate.
    audio, sample_rate = librosa.load(
        Path(clips_dir) / f_name,
        res_type="kaiser_fast",
        sr=None,
    )
    # Compute 40 MFCCs per frame and average them over time, so every clip is
    # summarised by a single 40-value vector regardless of its duration.
    mfcc_s = np.mean(
        librosa.feature.mfcc(
            y=audio,
            sr=sample_rate,
            n_mfcc=40,
        ).T,
        axis=0,
    )
    mfcc_features[f_name] = mfcc_s

In [None]:
# The dict maps clip name -> feature vector, so the raw frame has one column
# per clip. Flip it to one row per clip and move the clip name out of the row
# index into a regular column called "index".
features_by_clip = pd.DataFrame.from_dict(mfcc_features)
df = features_by_clip.T.reset_index()

# Persist the feature table next to the notebook (the row index is written too).
df.to_csv("mfcc_features.csv")

In [None]:
# Array with one boolean per column: True when that column contains a missing value.
df.isnull().any().values

In [None]:
# Order the feature rows by clip file name (the "index" column).
# NOTE(review): `df` is reassigned from mfcc_features.csv in the next cell, so
# this ordering does not carry forward.
df = df.sort_values(by='index')

In [None]:
# Reload the feature table from the CSV written earlier in this notebook.
df = pd.read_csv('mfcc_features.csv')

# Preview the first rows of the reloaded table.
df.head()

In [None]:
# List the column labels of the reloaded feature table.
df.columns

In [None]:
# Discard the unnamed leading column (the row index saved by to_csv and read
# back as data by read_csv).
df1 = df.drop(columns=['Unnamed: 0'])
df1.head()

In [None]:
# Arrange the feature rows by clip file name, which lives in the "index" column.
df1 = df1.sort_values('index')
df1

In [None]:
# Distinct values present in the Prolongation label column.
df_labels['Prolongation'].unique()

In [None]:
# Attach each clip's Prolongation label by matching on the clip identifier.
# Assigning df_labels['Prolongation'] directly would align on row index, and
# df1's row index (row numbers from the feature CSV) is unrelated to
# df_labels' row index (row numbers from the label CSV), so features would be
# paired with the labels of other clips.
label_by_clip = (
    df_labels.drop_duplicates(subset="filename")
    .set_index("filename")["Prolongation"]
)

# The "index" column holds clip file names; dropping the extension yields the
# same identifier format as df_labels["filename"].
clip_ids = df1["index"].str.rsplit(".", n=1).str[0]

# Clips without a matching label row get NaN here.
df1["Prolongation"] = clip_ids.map(label_by_clip)
df1

In [None]:
# Binarise the label: values of 1 or less become 0, values of 2 or more become 1.
# Missing values match neither condition and are left untouched.
prolongation = df1["Prolongation"]
at_most_one = prolongation <= 1.0
at_least_two = prolongation >= 2.0

df1.loc[at_most_one, "Prolongation"] = 0
df1.loc[at_least_two, "Prolongation"] = 1

In [None]:
# Keep only the rows in which every column has a value.
df1 = df1.dropna(axis=0, how="any")

In [None]:
# Array with one boolean per column: True when that column still contains a missing value.
df1.isnull().any().values

In [None]:
# Distinct label values remaining in df1.
df1['Prolongation'].unique()

In [None]:
# Display the feature table together with its label column.
df1

In [None]:
# Separate the target from the model inputs. The clip name ("index") is an
# identifier, not a feature, so it is removed along with the label.
y = df1["Prolongation"]
x = df1.drop(columns=["index", "Prolongation"])

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the target column.
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Model making
from keras.layers import Dense, LSTM, Embedding, Reshape
from keras import Input
from tensorflow import keras
from keras.models import Sequential

# Each row of x_train is a vector of real-valued MFCC means, so the network
# must accept that many floats per sample. An Embedding layer is a lookup
# table for integer ids in [0, input_dim) and is not applicable to continuous
# features, and Input(shape=(1,)) does not match the width of the rows.
n_features = x_train.shape[1]

model = Sequential()

model.add(Input(shape=(n_features,)))

# LSTM expects (timesteps, channels); present the coefficients as a sequence
# of n_features steps with one channel each.
model.add(Reshape((n_features, 1)))

model.add(LSTM(64))

model.add(Dense(64, activation="relu"))
# A single sigmoid unit outputs the probability of the positive class.
model.add(Dense(1, activation="sigmoid"))

opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Show the training row at position 240 as a one-row DataFrame.
x_train.iloc[[240]]


In [None]:

# Early stopping regularises training by halting it once the monitored
# validation metric stops improving, instead of always running every epoch.

from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation accuracy has not improved by at least 0.001 for
# 8 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=8,
    min_delta=0.001,
    mode='max'
)

# The callback only takes effect when it is handed to fit(); 25% of the
# training rows are held back to compute the validation metric it monitors.
model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=50,
    validation_split=0.25,
    callbacks=[early_stopping],
)

In [None]:
#model.fit(X_train,y_train,batch_size=256,validation_split=0.25,epochs=10)
#model.fit(x_train,y_train, epochs=10,validation_data=(x_test, y_test), verbose=1)


In [None]:
# Score the model on the held-out test split. Evaluating on the training rows
# only measures how well the model fits data it has already seen.
# With metrics=['accuracy'] at compile time, evaluate() returns [loss, accuracy].
acc = model.evaluate(x_test, y_test, verbose=1)
print(acc)