Import modules

In [62]:
import os
import pandas as pd 
from pathlib import Path
import string
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

find_files is given a directory. It creates a database containing the name and relative path for each file

In [26]:
 def find_files(subfolder_path):
    #This code creates a database with every file and its path
    dir = Path(os.getcwd())
    #Add directory where your files are:
    newdir = dir / subfolder_path


    #subfolders = os.listdir(newdir)
    subfolders = ['Train_data', 'Test_data']
    dense_list = [os.listdir(newdir / subfolder) for subfolder in subfolders]
    paired_list = zip(dense_list, subfolders)

    audio_files = [(item, label) for sublist, label in paired_list for item in sublist]
    audio_file_list, path_list = zip(*audio_files)
    columns = ['file', 'relative_path']

    common_prefix = os.path.commonprefix([dir, newdir])
    relative_path = os.path.relpath(newdir, common_prefix)
    print(relative_path)

    #Using os.join.path and Path() leads to a Windows path, so I had to do it this way
    relative_path = [relative_path + '/' + path for path in path_list]
    print(relative_path)


    df = pd.DataFrame(columns = columns)
    print(audio_file_list)
    print(relative_path)
    df.file = audio_file_list
    df.relative_path = relative_path
    return df

# df.to_csv(dir / 'audio_database.csv', encoding = 'utf-8')

In [27]:
df = find_files('Feature_representations')

Feature_representations
['Feature_representations/Train_data', 'Feature_representations/Test_data']
('Train_1.csv', 'Test_1.csv')
['Feature_representations/Train_data', 'Feature_representations/Test_data']


In [28]:
test_ind = list(map(lambda x: 'test' in x, list(df['file'].apply(lambda x: x.lower()))))
df['test'] = df['file'][[i for i, x in enumerate(test_ind) if x]]
df['train'] = df['file'][[i for i, x in enumerate(test_ind) if not x]]
# df.drop(['file'], axis = 1, inplace = True)
print(df)

          file                       relative_path        test        train
0  Train_1.csv  Feature_representations/Train_data         NaN  Train_1.csv
1   Test_1.csv   Feature_representations/Test_data  Test_1.csv          NaN


loadGloveData takes the dimension of the Glove word vector as input. In creates a numpy version of the word vectors that is saved to the disk. 

In [29]:
def loadGloveData(cur_dim):

    #%% Set path to Glove word vector folder
    dir = Path(os.getcwd())
    wvpack = "glove.6B."+str(cur_dim)+"d.txt"
    file_1 = dir / "glove.6B" / wvpack

    df = pd.read_csv(file_1, sep=" ", quoting=3, header=None, index_col=0)
    WV = {key: val.values for key, val in df.T.items()}
    file_2 = os.path.join(dir,'glove_dic','wv_dic_{}.npy'.format(cur_dim))
    np.save(file_2, WV) 

In [30]:
dim = [50, 100, 200, 300]
dir = Path(os.getcwd())
print(dir)

for example in dim:
    fname = os.path.join(dir,'glove_dic','wv_dic_{}.npy'.format(example))
    if not os.path.isfile(fname):
        print(example)
        loadGloveData(example)

C:\Users\marti\Documents\GitHub\Data Pipeline\Test_data


Create_train_test_w2v_matrices takes the dataframe containing the path to each train and test csv file, which is output by find_files. This function completes the processing required to create the word averaged representation of each piece of input data. The train and test sections coerce the input dataframe into the correct format, and the input is then feed to the docAveraging function. The final matrix representation is saved to the disk.

In [58]:
def create_path(*args):
    cur_path = os.getcwd()
    for value in args:
        cur_path  = os.path.join(cur_path, value)
    return cur_path

class Data:
    
    table = str.maketrans({key: None for key in string.punctuation})
    def __init__(self, file, rel_path):
        self.quest_num = file[-5]
        if 'test' in file.lower():
            self.mat_type = 'test'
            #Test Set
            test = pd.read_csv(create_path(rel_path, file))
            column_names = test.iloc[0,:]
            test.drop([0], inplace=True)
            test.rename(columns = column_names, inplace=True)
            test.drop(columns = ['Section'], inplace=True)
            test.dropna(inplace=True)
            test['labels'] = test[test.columns[1:]].apply(
                lambda x: ''.join(x.astype(str)),axis=1)
            test.drop(column_names[2:], axis=1, inplace=True)
            test.drop(test[test.labels=='eee'].index, inplace=True)
            self.X = test['Dialogue'].apply(lambda x : x.lower().translate(Data.table))
            self.Y = test['labels']
        #Train set
        elif 'train' in file.lower():
            self.mat_type = 'train'
            #Train set
            train = pd.read_csv(create_path(rel_path, file))
            train.drop([train.columns[0]], axis = 1, inplace=True)
            train.dropna(inplace=True)
            train['labels']=train.labels.apply(lambda x: ''.join([(3-len(str(x)))*'0',str(x)]))
            self.X = train['Dialogue'].apply(lambda x : x.lower().translate(Data.table))
            self.Y = train['labels']

In [91]:
def save_labels(Y, out_path):
    if not os.path.isfile(out_path):
        np.save(out_path, Y)

def Create_TFIDF_matrices(df):
    test_df = df[['test', 'relative_path']].dropna()
    train_df = df[['train', 'relative_path']].dropna()
    test_df.reset_index(inplace = True)
    train_df.reset_index(inplace = True)
    if len(test_df) <= len(train_df):
        for i in range(min([len(test_df), len(train_df)])):  
            train_data = Data(train_df.train[i], train_df.relative_path[i])
# Attributes of train_data:   mat_type, quest_num, X_train, Y_train 
            #Sanity Check
            if train_data.mat_type != 'train' or train_data.quest_num != str(i+1):
                print('Expected matrix type train, received type {}'.format(train_data.mat_type))
                print('Expected question # {}, received # {}'.format(i+1, train_data.quest_num))
                print('error')
            test_data = Data(test_df.test[i], test_df.relative_path[i])
            #Sanity Check
            if test_data.mat_type != 'test' or test_data.quest_num != str(i+1):
                print('Expected matrix type test, received type {}'.format(test_data.mat_type))
                print('Expected question # {}, received # {}'.format(i+1, test_data.quest_num))
                print('error')
            Y_train = train_data.Y
            Y_test = test_data.Y
            #Create TF-IDF Matrices (we want to tokenize the text before creating these)
            X_train = train_data.X.apply(lambda x: word_tokenize(x))
            X_test = test_data.X.apply(lambda x: word_tokenize(x))
            #Flatten lists
            X_train = [item for sublist in X_train for item in sublist]
            X_test = [item for sublist in X_test for item in sublist]
            print(X_test)
            for dim in range (100, 200, 100): #max 2100
                tfidf_vectorizer = TfidfVectorizer(max_features = dim)
                tfidf_matrix = tfidf_vectorizer.fit_transform(X_train).toarray()
                tfidf_matrix_Test = tfidf_vectorizer.transform(X_test).toarray()
                file_4 = create_path('tfidf_matrices','TFIDF_train_Question{}_{}dim.npy'.format(train_data.quest_num, dim))
                file_5 = create_path('tfidf_matrices','TFIDF_test_Question{}_{}dim.npy'.format(test_data.quest_num, dim))
                np.save(file_4, tfidf_matrix)
                np.save(file_5, tfidf_matrix_Test)
            save_labels(Y_train, create_path('labels', '{}_labels_question{}'.format(train_data.mat_type, train_data.quest_num)))
            save_labels(Y_test, create_path('labels', '{}_labels_question{}'.format(test_data.mat_type, test_data.quest_num)))

def Create_glove_w2v_matrices(df):
    def docAveraging(sent, WV, dim):
        summ = [0.0] * (dim)
        A = 0.0;
        sent_A = (re.sub(r"[\n(\[\])]", "", sent)).split(" ")
        for word in sent_A:
            if word in WV : #and word not in stop:
                A = A + 1.0
                for i in range(0, dim):
                    summ[i] = summ[i] + float((WV[word])[i])
        if A != 0:
            #A = 1
            for i in range(0, dim):
                summ[i] = summ[i] / A
        return summ;
    
    dim = [50, 100, 200, 300]
    for i in range(len(df)): 
        file = df.file[i]
        relative_path = df.relative_path[i]
        mat_type, quest_num, X, Y =  load_data(file, relative_path)    
        #Create w2v average matrices
        for wvsize in dim:
            file_2 = create_path('glove_dic','wv_dic_{}.npy'.format(wvsize))
            WV = np.load(file_2).item() 
            ttMatrix = np.zeros((0, wvsize))
            print('Current word vector size: {}'.format(wvsize))
            print('Current question: {} {}'.format(mat_type, quest_num))
            for train_doc in X:
                ttMatrix = np.append(ttMatrix, [np.asarray(docAveraging(train_doc, WV, wvsize))], axis=0)#.decode('utf8').strip()), WV, dim))], axis=0)
            file_3 = create_path('w2v_matrices','Question{}{}_{}dimensions.npy'.format(quest_num,mat_type,wvsize))
            np.save(file_3, ttMatrix) 
            save_labels(Y, create_path(df.relative_path[i], '{}labels_question{}'.format(mat_type, quest_num)))    

In [92]:
Create_TFIDF_matrices(df)

['good', 'morning', 'mornin', 'capt', 'good', 'morning', 'capt', 'wang', 'i', 'am', 'here', 'to', 'discuss', 'with', 'you', 'and', 'the', 'platoon', 'about', 'important', 'business', 'hello', 'captain', 'wang', 'it', 'is', 'nice', 'to', 'see', 'you', 'this', 'morning', 'good', 'morning', 'captain', 'wang', 'it', 'is', 'an', 'honor', 'to', 'meet', 'you', 'i', 'would', 'like', 'to', 'discuss', 'some', 'important', 'decisions', 'that', 'need', 'to', 'be', 'made', 'good', 'morning', 'sir', 'i', 'look', 'forward', 'to', 'giving', 'you', 'any', 'assistance', 'you', 'need', 'today', 'captain', 'wang', 'good', 'morning', 'good', 'morning', 'captain', 'wang', 'my', 'rank', 'may', 'be', 'lower', 'than', 'you', 'expected', 'but', 'i', 'speak', 'for', 'those', 'with', 'much', 'higher', 'rank', 'good', 'morning', 'captain', 'wang', 'how', 'are', 'you', 'doing', 'on', 'this', 'beautiful', 'day', 'i', 'am', 'pleased', 'to', 'meet', 'you', 'good', 'morning', 'captain', 'wang', 'good', 'morning', 'capt

In [7]:
import os

cur_path = create_path('hello', 'chicken')
print(cur_path)

C:\Users\marti\Documents\GitHub\Data Pipeline\Test_data\hello\chicken


In [11]:
os.path.isfile(create_path('feature_representations', 'Features', 'alkdsfdsajf.txt'))

True