# Listening to the audio
This code enables you to listen to a sample sine wave audio.

In [7]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
import numpy
import torch
import csv
from sklearn.decomposition import PCA
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.externals import joblib

In [8]:
SAMPLE_RATE = 22050

In [9]:
T = 4.0    # seconds
t = np.linspace(0, T, int(T*SAMPLE_RATE), endpoint=False) # time variable
freq = 440
signal = np.sin(2*np.pi*freq*t)
#print len(signal)
ipd.Audio(signal, rate=SAMPLE_RATE)

# Creating spectrograms
Spectrograms provide a 2D feature space to analyze signals in the joint time-frequency domain.

This code generates the spectrogram files for use with the CNN example. It makes files for non-noised data!

In [10]:
f = open('train.csv', 'r')
reader = csv.reader(f)

# Get desired columns and rows from csv, 
# Each row is a sublist inside of lst
# Right now it is taking rows 0 to 70
# and columns 0:10
train = []

for row in itertools.islice(reader, 0, 6325):
    train.append(map(float, row[0:88201]))

train=np.array(train)
#Just some re-shaping and dimension finding
#N = 1
#print "N:",N
# train = signal[np.newaxis,:]
print "Train shape",train.shape
N_train = train.shape[0]
NUM_SAMPLES = train.shape[1]-1

X_train = train[:,:-1]
y_train = train[:,-1]
y_train = y_train.reshape(N_train,1)

print X_train.shape
print y_train

Train shape (6325, 88201)
(6325, 88200)
[[ 5.]
 [ 0.]
 [ 8.]
 ..., 
 [ 5.]
 [ 0.]
 [ 2.]]


In [None]:
pickle.dump(train, open( "train.p", "wb" ) )

In [7]:
# JUST SOME FOURIER TRANSFORM PARAMETERS
BINS_OCTAVE = 12*2
N_OCTAVES = 7
NUM_BINS = BINS_OCTAVE * N_OCTAVES

In [8]:
# Given a wav time series, makes a mel spectrogram
# which is a short-time fourier transform with
# frequencies on the mel (log) scale.
def mel_spec(y):
    Q = librosa.cqt(y=y, sr=SAMPLE_RATE, bins_per_octave=BINS_OCTAVE,n_bins=NUM_BINS)
    Q_db = librosa.amplitude_to_db(Q,ref=np.max)
    return Q_db

In [10]:
# This means that the spectrograms are 168 rows (frequencies)
# By 173 columns (time frames)
i=19
song = X_train[i]
print y_train[i]


#print len(song)
#ipd.Audio(song, rate=SAMPLE_RATE)
#print(song)
#test_spec = mel_spec(song)
test_spec = librosa.feature.mfcc(song)
print test_spec
FEATS = test_spec.shape[0]
FRAMES = test_spec.shape[1]
print FEATS
print FRAMES

[ 0.]
[[-264.43353185 -262.19031393 -267.36055129 ..., -265.9187     -261.88793023
  -263.07349524]
 [ 136.19503887  135.43527643  129.18106832 ...,  127.70202003
   130.88045522  136.15870314]
 [ -32.94150493  -37.6044777   -34.96458227 ...,  -38.10535231
   -39.66040806  -33.65282637]
 ..., 
 [   9.37160854   11.23073764   10.66960619 ...,   10.54576475
    10.31790495    6.6092075 ]
 [  13.03338839   13.69946622    9.79281914 ...,    8.79209023
     8.81637504    7.88480787]
 [  -2.55063297    1.54369719    3.02408025 ...,   -8.33781475   -3.6540764
    -2.43447167]]
20
173


In [12]:
tmp_train = np.zeros((N_train,FEATS*FRAMES))

for i in tqdm.tqdm(range(N_train)):
    test_spec = librosa.feature.mfcc(X_train[i])
    tmp_train[i,:] = [item for sublist in test_spec for item in sublist] #mel_spec(X_train[i])
#np.save('Data/xtrain_spec', tmp_train)


100%|██████████| 6325/6325 [01:19<00:00, 79.30it/s] 


In [13]:
rf = RandomForestClassifier()
rf.fit(tmp_train,y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
rf.score(tmp_train, y_train)

1.0

In [35]:
f = open('train.csv', 'r')
reader = csv.reader(f)
test = []
for row in itertools.islice(reader, 100, 200):
    test.append(map(float, row[0:88201]))
test=np.array(test)


In [36]:
X_test = test[:,:-1]
y_test = test[:,-1]

In [37]:
N_test = test.shape[0]
tmp_test = np.zeros((N_test,FEATS*FRAMES))

print test.shape

for i in range(N_test):
    test_spec = librosa.feature.mfcc(X_test[i])
    tmp_test[i,:] = [item for sublist in test_spec for item in sublist] #mel_spec(X_train[i])

(100, 88201)


In [38]:
print tmp_test.shape
print y_test.shape
rf.score(tmp_test, y_test)

(100, 3460)
(100,)


0.63

In [18]:
f = open('test.csv', 'r')
reader = csv.reader(f)
test = []
for row in tqdm.tqdm(itertools.islice(reader, 0, 1000)):
    test.append(map(float, row[1:88201]))
test=np.array(test)

#y_test = test[:,-1]

N_test = test.shape[0]
tmp_test = np.zeros((N_test,FEATS*FRAMES))

print test.shape

for i in range(N_test):
    test_spec = librosa.feature.mfcc(test[i])
    tmp_test[i,:] = [item for sublist in test_spec for item in sublist] #mel_spec(X_train[i])


0it [00:00, ?it/s][A
1it [00:00,  6.47it/s][A
3it [00:00, 10.08it/s][A
5it [00:00, 11.96it/s][A
7it [00:00, 12.94it/s][A
9it [00:00, 13.78it/s][A
11it [00:00, 14.41it/s][A
14it [00:00, 15.74it/s][A
16it [00:01, 15.96it/s][A
18it [00:01, 16.08it/s][A
21it [00:01, 16.54it/s][A
23it [00:01, 16.67it/s][A
26it [00:01, 17.06it/s][A
28it [00:01, 17.02it/s][A
31it [00:01, 17.33it/s][A
33it [00:01, 17.31it/s][A
36it [00:02, 17.70it/s][A
39it [00:02, 17.96it/s][A
42it [00:02, 18.00it/s][A
45it [00:02, 18.19it/s][A
48it [00:02, 18.21it/s][A
51it [00:02, 18.48it/s][A
54it [00:02, 18.43it/s][A
57it [00:03, 18.45it/s][A
59it [00:03, 18.32it/s][A
61it [00:03, 18.24it/s][A
64it [00:03, 18.42it/s][A
67it [00:03, 18.47it/s][A
69it [00:03, 18.40it/s][A
71it [00:03, 18.35it/s][A
73it [00:03, 18.32it/s][A
75it [00:04, 18.30it/s][A
78it [00:04, 18.40it/s][A
80it [00:04, 18.36it/s][A
82it [00:04, 18.32it/s][A
84it [00:04, 18.32it/s][A
86it [00:04, 18.29it/s][A
88it [00:0

(1000, 88200)


In [19]:
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for 
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

In [32]:
joblib.dump(rf, 'rf.pickle')

['rf.pickle']

In [20]:
preds = rf.predict(tmp_test)
test_ids = range(len(preds))
write_predictions(preds, test_ids, 'first_submission.csv')

In [22]:
lr = LogisticRegression()
lr.fit(tmp_train, y_train)
preds = lr.predict(tmp_test)
write_predictions(preds, test_ids, 'logreg_submission.csv')

  y = column_or_1d(y, warn=True)


In [33]:
joblib.dump(lr, 'lr.pickle')

['lr.pickle']

In [24]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(tmp_train, y_train)
preds = clf.predict(tmp_test)
write_predictions(preds, test_ids, 'NNsubmission.csv')

  y = column_or_1d(y, warn=True)


In [34]:
joblib.dump(clf, 'nn.pickle')

['nn.pickle']

In [None]:
Xtrain = tmp_train
Ytrain = y_train
best_depth = None
best_num_features = None
best_score = float("-inf")
tot_features = Xtrain.shape[1]
kfold = KFold(n_splits = 5)
scores = []
# def accuracy(preds, actual):
#     diff = preds - actual
#     n = len(actual) * 1.0
#     return 1 - (np.count_nonzero(diff) / n)


for depth in range(10, 200, 10):
        for max_feat in tqdm.tqdm(range(10, 200, 10)):
            kscores = []
            for train_ind, test_ind in kfold.split(Xtrain):
                xtrain_cv = Xtrain[train_ind]
                ytrain_cv = Ytrain[train_ind]

                xtest_cv = Xtrain[test_ind]
                ytest_cv = Ytrain[test_ind]

                rf = RandomForestClassifier(max_depth = depth, max_features = max_feat)
                rf.fit(xtrain_cv, ytrain_cv)
                kscores.append(rf.score(xtest_cv, ytest_cv))
#                 preds = rf.predict(xtest_cv)
#                 kscores.append(accuracy(preds, ytest_cv))

            score = np.mean(kscores)
            scores.append((depth, max_feat, score))
            if score > best_score:
                best_score = score
                best_depth = depth
                best_num_features = max_feat

print("Best depth:", best_depth)
print("Best features", best_num_features)
print("scores", scores)

100%|██████████| 19/19 [04:34<00:00, 14.46s/it]
100%|██████████| 19/19 [05:57<00:00, 18.82s/it]
 74%|███████▎  | 14/19 [03:45<01:20, 16.13s/it]