In [3]:
import pathlib
import timeit
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


# ---------------- DEFINE FUNCTIONS ----------------

# Train the SVM on the training data and predict using the test data
def predict_svm(x_te, x_tr, y_tr):
    """Fit a default-parameter SVC and predict labels for the test features.

    Args:
        x_te: Feature rows to predict on.
        x_tr: Feature rows used to fit the classifier.
        y_tr: Target labels matching ``x_tr``.

    Returns:
        A one-element list whose only item is the prediction array for
        ``x_te``.
    """
    # ``fit`` returns the estimator itself, so the calls can be chained.
    test_labels = svm.SVC().fit(x_tr, y_tr).predict(x_te)
    return [test_labels]


# Check the accuracy of given predictions on the test set y_test
def check_accuracy(y_test, predictions):
    """Compute the share of correct labels for each prediction sequence.

    Args:
        y_test: Object exposing ``to_list()`` (e.g. a pandas Series) that
            yields the true labels.
        predictions: Iterable of indexable prediction sequences; each one is
            compared position by position against the true labels.

    Returns:
        A list with one accuracy value (matches / number of true labels) per
        entry in ``predictions``.
    """
    expected = y_test.to_list()
    total = len(expected)

    def hit_rate(predicted):
        # Index into the prediction by position so that a prediction shorter
        # than the ground truth fails loudly instead of being truncated.
        hits = sum(
            1 for idx, label in enumerate(expected) if predicted[idx] == label
        )
        return hits / total

    return [hit_rate(predicted) for predicted in predictions]


# ---------------- PREPARE DATA ----------------

# Relative path to the dataset CSV. Nothing is read here; the file is loaded
# with pd.read_csv where the timing experiment runs.
dirPath = "../Datasets/purchase600-100cls-15k.lrn.csv"

In [None]:
# PLOT TIME VS DATASET SIZE
#
# For each tenth of the dataset (10%, 20%, ..., 100%), time how long it takes
# to fit an SVC on a 90% training split and predict the remaining 10%, then
# scatter-plot the measured time against the fraction of the dataset used.

selectionSizeRange = list(range(1, 11, 1))  # numerators of size/10
runtime = []

# Load the CSV once: it does not change between iterations, so re-reading it
# for every subset size only added I/O overhead.
full_df = pd.read_csv(dirPath)

# NOTE(review): KList is not used anywhere in this cell; it is kept (hoisted
# out of the loop) only in case other cells read it -- confirm and remove.
KList = [1, 5, 10, 50, 100, 1000]

for size in selectionSizeRange:
    # Draw floor(rows * size / 10) rows; the fixed seed keeps runs repeatable.
    df = full_df.sample(full_df.shape[0] * size // 10, random_state=35)

    # Split into input and target variables
    X = df.iloc[:, 1:-1]  # Remove the ID and Class columns
    Y = df.iloc[:, -1]

    # No scaling is applied; the name is kept for compatibility with other cells.
    df_x_scaled = X

    X_train, X_test, Y_train, Y_test = train_test_split(
        df_x_scaled, Y, test_size=0.1, random_state=35
    )

    # SVM: only the fit + predict call is timed (not loading or splitting).
    start = timeit.default_timer()
    all_predictions = predict_svm(X_test, X_train, Y_train)
    stop = timeit.default_timer()
    time = stop - start
    print('Time: ', time)
    runtime.append(time)

fig = plt.figure()
# Convert the 1..10 numerators into dataset fractions 0.1..1.0 for the x axis.
newList = [x / 10 for x in selectionSizeRange]
plt.scatter(newList, runtime)
fig.suptitle('Time vs Dataset size', fontsize=14)
plt.xlabel('Dataset Size', fontsize=14)
plt.ylabel('Time', fontsize=14)
# Save before show(): show() may clear the figure in some backends.
plt.savefig('Purchase_SVM_plot_timeVSdatasetsize.png')
plt.show()

Time:  0.2862850999999864
Time:  0.9336797000000843
Time:  2.160632600000099
Time:  3.8046401000001424
Time:  5.852266000000327
Time:  8.343380499999967
Time:  10.94445880000012
Time:  13.875937899999826
Time:  16.933915999999954
