# QuickML Documentation

## 1. Installing VMWare

VMWare, or an equivalent (VirtualBox, etc.), needs to be installed to be able to run virtual environments.


## 2. Data Pre-Processing 

The first step in creating a machine learning model is pre-processing the data that will be fed into it. Pre-processing consists of the following steps:

1. Acquire the Dataset 
2. Import Necessary Libraries 
3. Import the Dataset
4. Handling Missing Values
5. Encoding Categorical Data
6. Splitting into Training and Test Set
7. Feature Scaling

In [2]:
# Importing All Libraries
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Example variable mapping handed to the data pre-processing step.
# Each entry lists the dataset columns that play the given role.
var_map = dict(
    independent=["R&D Spend", "Administration", "Marketing Spend", "State"],
    dependent=["Profit"],
    categorical=["State"],
    missing=["Marketing Spend"],
)

In [4]:
# Defining Function
def dataPreProcess(dataSet, varMap):
    """
    Read a CSV dataset and return it pre-processed and split for training.

    Parameters
    ----------
    dataSet : path or file-like object accepted by ``pd.read_csv``.
    varMap : dict mapping a role to a list of column names. The roles
        'independent' and 'dependent' are required; 'categorical' and
        'missing' are optional. Role names are matched case-insensitively,
        so the capitalised keys built by the web front end are accepted.

    Returns
    -------
    dict with the keys 'X_train', 'X_test', 'y_train' and 'y_test', each
    holding a pandas DataFrame (70/30 split, random_state=0).

    Raises
    ------
    KeyError if a required role or a listed column is absent.
    """
    mapping = {str(role).lower(): cols for role, cols in varMap.items()}
    independent = mapping['independent']
    dependent = mapping['dependent']
    categorical = mapping.get('categorical', [])
    missing = mapping.get('missing', [])

    # Obtaining Data Set
    data = pd.read_csv(dataSet)

    # Splitting Dependent & Independent Variables. Copies are taken so the
    # assignments below never write into a view of `data`.
    X = data[independent].copy()
    y = data[dependent].copy()

    # Replacing missing values with the column mean
    if missing:
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        X[missing] = imputer.fit_transform(X[missing])

    # Encoding Categorical Variables as integer codes, one encoder per
    # column. A categorical column may be a feature or the target.
    # NOTE(review): the original also built a one-hot encoding but never
    # used its result, so the returned frames stay label-encoded.
    for column in categorical:
        if column in X.columns:
            X[column] = LabelEncoder().fit_transform(X[column])
        elif column in y.columns:
            y[column] = LabelEncoder().fit_transform(y[column])
        else:
            raise KeyError(column)

    # Splitting Into Train and Test Set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # Feature Scaling: the scaler learns mean/variance from the training
    # set only and the same transformation is applied to the test set.
    scale_X = StandardScaler()
    X_train = pd.DataFrame(scale_X.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scale_X.transform(X_test),
                          columns=X_test.columns, index=X_test.index)

    # Returns a dictionary of pre-processed data
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

The data processing function is responsible for taking a dataset and a mapping of dependent, independent, missing and categorical data. The dataset is split into the dependent and independent data, the missing data is taken care of, and the categorical data is encoded.

Finally, the data is split into the test and train and it is feature scaled. The function returns a dictionary of the train and test matrices and vectors ready for a machine learning model to be fitted on. 

### 2.1. Dynamic Table Creation

Once the Algorithm of choice is selected, an HTML table is dynamically created with the column names:
1. Independent
2. Dependent
3. Categorical 
As well as dynamically created row names which correspond to the attributes in the inputted data set. This was done using Flask and Jinja. 

Additionally, the file the user submits is saved to a specific folder, effectively keeping a reference to this file to be used later on. 

In [None]:
# Invoked when user submits file -
# creates HTML table with attributes of file
@views.route('/', methods=['POST'])
def upload_file():
    """
    Store the uploaded CSV in UPLOAD_FOLDER and render the attribute table.

    Sets the module-level ``filename`` to the stored file's name so later
    requests can locate it, and passes the CSV's column names to
    'home.html' as ``attributes``.
    """
    global filename
    file = request.files['file']

    # Keep only the final path component of the client-supplied name so
    # the upload cannot be written outside UPLOAD_FOLDER.
    stored_name = os.path.basename(file.filename)

    dataSet = pd.read_csv(file)

    # read_csv consumed the upload stream; rewind it, otherwise save()
    # would write an empty file.
    file.stream.seek(0)

    # Saves the file so it can be accessed later on.
    file.save(os.path.join(UPLOAD_FOLDER, stored_name))
    filename = stored_name

    return render_template('home.html', attributes=list(dataSet.columns))

In [None]:
 {# One table row per dataset attribute. loop.index0 gives every row its
    own radio-group name, so Ind/Dep are mutually exclusive within a row. #}
 {% for i in attributes %}
                <tr>
                    <th class="rt" value='{{i}}'>{{i}}</th>
                    <td><input class='rd' type="radio" name='test_{{loop.index0}}'
                               value="Ind"></td>
                    <td><input class='rd dep' type="radio" name='test_{{loop.index0}}'
                               value="Dep"></td>
                    <td><input class='rd' type="checkbox" value="Cat"></td>
                </tr>
{% endfor %}
<!-- Using Jinja python expressions can be written in html. 
    Table created dynamically using for loop. -->

### 2.2. Dynamic Creation of Variable Mapping

The input of the radio buttons and checkboxes on the dynamically created table are used to create the mapping of attributes. Namely, the user selects which attributes are dependent, independent, and which are categorical. 

In [None]:
/**
 * Builds the variable mapping from the attribute table.
 *
 * For every header cell (th.rt) the checked '.rd' inputs of the same
 * table row are inspected; the header text is added to 'Independent',
 * 'Dependent' and/or 'Categorical' according to the input values
 * 'Ind', 'Dep' and 'Cat'.
 *
 * @returns {{Independent: string[], Dependent: string[], Categorical: string[]}}
 */
function makeVarMap() {

  const headers = document.querySelectorAll('th.rt');     // Headers
  // Hard coded keys as they never change regardless of use case.
  const varMap = {
    'Independent': [],
    'Dependent': [],
    'Categorical': []
  };
  // Input value -> key of varMap
  const keyForValue = {
    'Ind': 'Independent',
    'Dep': 'Dependent',
    'Cat': 'Categorical'
  };

  // Each header is paired with the inputs of its own row, so rows with
  // nothing selected (or only the checkbox ticked) cannot shift the
  // attribute names of the rows that follow.
  headers.forEach((header) => {
    const row = header.closest('tr');
    if (!row) {
      return;
    }
    row.querySelectorAll('input.rd:checked').forEach((input) => {
      const key = keyForValue[input.value];
      if (key) {
        varMap[key].push(header.textContent);
      }
    });
  });

  console.log(varMap);
  return varMap;
}

There is also some checkbox logic implemented such that a variable cannot be both independent and dependent and that there can only ever be one dependent variable in any inputted dataset. This was done in jQuery.

In [None]:
// Once the DOM is ready: whenever a "dependent" radio button changes,
// uncheck every other checked "dependent" radio button, so that at most
// one attribute is marked as dependent.
$(function () {
  const depRadios = 'input.dep:radio';
  $(depRadios).on('change', function () {
    $(depRadios + ':checked').not(this).prop('checked', false);
  });
})

### 2.3. Passing Variable Mapping to be Pre Processed

The variable mapping is created dynamically in JavaScript from the user's input. It is then passed to the Python backend using AJAX:

In [None]:
// POST `s` as a JSON body to the pre-processing endpoint and log the
// server's reply once the request succeeds.
$.ajax('/dataPreProcessing', {
  type: "POST",
  contentType: "application/json",
  data: JSON.stringify(s)
}).done((result) => {
  // Logged only so the round trip can be checked in the console.
  console.log(result);
})

### 2.4. Pre Processing the Data 

Once the variable mapping is created in the JavaScript, it is passed into the flask backend which takes the original file the user submitted, as well as the newly created variable mapping, passing both of them as arguments to the data pre-processing function. 

In [None]:
# Invoked when user submits variable mapping
@views.route('/dataPreProcessing', methods=['POST'])
def dataPre():
    """
    Pre-process the previously uploaded dataset with the posted mapping.

    The four resulting components are written to the pre_processed_data
    directory and an HTML preview of them is returned.
    """
    # result is the variable mapping in a JSON format
    result = request.get_json()

    # The body may be a JSON-encoded string (decoded a second time here)
    # or already a JSON object.
    varMap = json.loads(result) if isinstance(result, str) else result
    file = os.path.join(UPLOAD_FOLDER, filename)

    table = DPP.dataPreProcess(file, varMap)

    # Getting the individual components of pre processed data
    # to keep a reference to them for when they need to be passed
    # in to the selected algorithm.
    xTest = pd.DataFrame(table['X_test'])
    xTrain = pd.DataFrame(table['X_train'])
    yTest = pd.DataFrame(table['y_test'])
    yTrain = pd.DataFrame(table['y_train'])

    # Creating variables to store file names and locations for pre
    # processed data locations
    fN_xT = '/home/user/Documents/git/QuickML/pre_processed_data/xTest'
    fN_xTr = '/home/user/Documents/git/QuickML/pre_processed_data/xTrain'
    fN_yT = '/home/user/Documents/git/QuickML/pre_processed_data/yTest'
    fN_yTr = '/home/user/Documents/git/QuickML/pre_processed_data/yTrain'

    # to_csv creates the file if it does not exist, but it does not
    # create any non existent directories, so the pre_processed_data
    # directory has to exist already.
    xTest.to_csv(fN_xT)
    xTrain.to_csv(fN_xTr)
    yTest.to_csv(fN_yT)
    yTrain.to_csv(fN_yTr)

    # headers='keys' labels every table with the columns of the frame
    # being rendered, so no column list or offset is hard coded.
    x_train_html = tabulate(xTrain, tablefmt='html', headers='keys')
    y_train_html = tabulate(yTrain, tablefmt='html', headers='keys')
    x_test_html = tabulate(xTest, tablefmt='html', headers='keys')
    y_test_html = tabulate(yTest, tablefmt='html', headers='keys')

    # return formatted string which contains HTML and HTML tables using
    # the 'tabulate' module
    return (f'''
            <h2 style="text-align:center">Scroll to Preview your Pre-Processed Data!</h2>
            <hr>
            <div>
                <h3 style="text-align:left"> X train </h3> 
                <h3 style="text-align:right; margin-top:-40px"> Y train </h3> <hr><br>
                <div class="container" style="display:flex; width=70%">
                    {x_train_html}
                    {y_train_html}
                </div>
                <hr>
                <h3 style="text-align:left"> X test </h3> 
                <h3 style="text-align:right; margin-top:-40px"> Y test </h3> <hr><br>
                <div class="container" style="display:flex; width=70%">
                    {x_test_html}
                    {y_test_html}
                </div>
            </div>
    ''' )

It also writes 4 CSV files, each containing one of the components of the pre-processed data:
1. X_train
2. y_train 

These are datasets which will be used to train the Machine Learning/Deep Learning model. 

3. X_test
4. y_test

These are the datasets which are given to the ML/DL model to test its accuracy. Based on these results, the confusion matrix is created. 

It is also important to keep a reference to the users choice of algorithm so that the correct one is invoked. This is sent from the JavaScript to the Flask backend, which then writes it to a text file: 

In [None]:
// JavaScript Code
// Every '.choice' element reports its value to the Flask backend when
// it is clicked.
document.querySelectorAll('.choice').forEach(item => {
    item.addEventListener("click", () => {
      // Users choice
      const option = item.value;
      console.log(option);

      // The JSON text is percent-encoded before it is placed in the URL
      // path, so reserved characters such as '?' or '#' cannot cut the
      // path short.
      const encoded = encodeURIComponent(JSON.stringify(option));

      // Open and send information to Flask
      const request = new XMLHttpRequest();
      request.open('POST', `/ProcessOption/${encoded}`);
      request.send();
    })
  })

In [None]:
# Flask Backend Code
@views.route('/ProcessOption/<string:option>', methods=['POST'])
def SaveOption(option):
    """
    Persist the user's choice of algorithm to 'choice.txt'.

    ``option`` is the JSON-encoded choice taken from the URL path.
    Responds with an empty 204 (No Content).
    """
    sel = json.loads(option)

    with open("choice.txt", "w") as fo:
        # str() so a non-string JSON value (e.g. a number) is written too
        fo.write(str(sel))

    # A Flask view may not return a bare int
    return ('', 204)

This marks the end of the data pre-processing section. Now the algorithms can be analyzed.

## 3. Regression 

### 3.1 Simple Linear Regression 

A Simple Linear Regression is a machine learning model used on data sets with 2 columns, one independent and one dependent variable. Below is the code for the algorithm: 

In [None]:
# SIMPLE LINEAR REGRESSION
def simpleLinearRegression(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    """
    Takes the train and test split of the dataset, as well as name
    of the uploaded dataSet. Fits a regressor and plots a simple
    linear regression on the dataset. Saves the figure and returns path
    to saved figure as jpg, which can then be viewed by the user.

    Only column 1 of each array is plotted; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv, yet the regressor is still fitted on the full arrays --
    confirm against the caller.
    """

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    regressor = LinearRegression()
    regressor.fit(Xtrain, Ytrain)

    # Start from a fresh figure so earlier plots are not drawn over
    plt.figure()
    plt.title(f'Linear Regression for Dataset: {dataSet}')

    XTest_Plot = column_one(Xtest)

    plt.scatter(XTest_Plot,
                column_one(Ytest),
                color='blue',
                label='Test Samples')

    plt.scatter(column_one(Xtrain),
                column_one(Ytrain),
                color='red',
                label='Train Samples')

    YTest_Hat_Plot = column_one(regressor.predict(Xtest))

    # x values and their predictions are sorted together, as pairs, so
    # every prediction stays attached to the x it was computed for.
    line_points = sorted(zip(XTest_Plot, YTest_Hat_Plot))
    plt.plot([point[0] for point in line_points],
             [point[1] for point in line_points],
             label='Regression line')

    plt.legend()
    plt.grid()

    filename = f'{random.randint(100,999)}'
    x = f'/home/user/Documents/git/QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 3.2 Multivariate Linear Regression

Multivariate Linear Regression is the same as Simple Linear Regression, except that it accepts multiple independent variables along with one dependent variable. Below is the code for the Multivariate Linear Regression API:

In [None]:
# MULTIPLE LINEAR REGRESSION 
def multipleLinearRegression(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a linear
    regression on the train set and scatters the actual and predicted
    values of both the train and the test set in two side-by-side plots.
    Saves the figure and returns the path of the saved jpg.

    Only column 1 of each array is plotted; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    # Fitting Multiple Linear Regression to Training Set 
    regressor = LinearRegression()
    regressor.fit(Xtrain, Ytrain)

    # Predictions, formatted for plotting
    YTest_Hat_Plot = column_one(regressor.predict(Xtest))
    YTrain_Hat_Plot = column_one(regressor.predict(Xtrain))

    XTrain_Plot = column_one(Xtrain)
    XTest_Plot = column_one(Xtest)

    # The figure size has to be set before the figure is created
    mpl.rcParams['figure.figsize'] = [10, 10]
    fig, (train_plot, test_plot) = plt.subplots(1, 2)

    # A figure-level title, since the figure holds two plots
    fig.suptitle(f'Multivariate Linear Regression for Dataset: {dataSet}')

    # Train set Plotted 
    train_plot.grid(True)
    train_plot.set_title('Train Set')
    train_plot.scatter(XTrain_Plot,
                       column_one(Ytrain),
                       color='Orange',
                       label='Actual Train Set')
    train_plot.scatter(XTrain_Plot,
                       YTrain_Hat_Plot,
                       color='Green',
                       label='Predicted Train Set')
    train_plot.legend()

    # Test set Plotted
    test_plot.grid(True)
    test_plot.set_title('Test Set')
    test_plot.scatter(XTest_Plot,
                      column_one(Ytest),
                      color='Red',
                      label='Actual Test Set')
    test_plot.scatter(XTest_Plot,
                      YTest_Hat_Plot,
                      color='Blue',
                      label='Predicted Test Set')
    test_plot.legend()

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    fig.savefig(x)
    # Release the figure once it is on disk
    plt.close(fig)

    return x

The test and train sets are both scattered on the graph, and their respective predictions are scattered as well.

### 3.3 Polynomial Linear Regression

Polynomial Linear Regression is similar to Multivariate Linear Regression but should be used when the dataset appears to contain non-linear associations. It is still called a 'linear' regression because the model is linear in its coefficients. 

A polynomial linear regression ignores certain parameters, therefore the variable mapping had to be changed to take into account the ignored variables. The user selects which attributes are to be ignored, and these are then dynamically dropped from their dataset. 

Below is the code for the polynomial linear regression:

In [None]:
# POLYNOMIAL LINEAR REGRESSION 
def polynomialLinearRegression(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Re-merges the
    train and test parts, fits a plain linear regression and a degree-3
    polynomial regression on column 1 of X against column 1 of Y, and
    plots both fits over the actual points. Saves the figure and returns
    the path of the saved jpg.

    NOTE(review): column 0 of every array is skipped; presumably it is
    the row index written by to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    # PLR doesn't need a train test split, so the individual components 
    # are combined to form the original dataset (except now its pre-processed).
    # Values are kept as floats: casting to int would truncate them.
    X_combined = np.array(np.r_[column_one(Xtrain), column_one(Xtest)],
                          dtype=float)
    Y_combined = np.array(np.r_[column_one(Ytrain), column_one(Ytest)],
                          dtype=float)
    X_column = X_combined.reshape(-1, 1)

    # Plain linear fit, drawn for comparison
    lin_reg = LinearRegression()
    lin_reg.fit(X_column, Y_combined)

    # Transforming X from just X to X + its polynomial terms
    poly_reg = PolynomialFeatures(degree=3)
    X_poly = poly_reg.fit_transform(X_column)

    # New linear regression fitted on augmented X matrix and 
    # original Y vector. 
    lin_reg2 = LinearRegression()
    lin_reg2.fit(X_poly, Y_combined)

    # Start from a fresh figure so earlier plots are not drawn over
    plt.figure()
    plt.title(f'Polynomial Linear Regression for {dataSet}')

    # Scattering actual results
    plt.scatter(X_combined, Y_combined, color='red', label='Actual')

    # Both curves are evaluated on the sorted x values, so each
    # prediction is drawn at the x it belongs to.
    X_sorted = np.sort(X_combined).reshape(-1, 1)

    # Plotting predicted values via linear regression
    plt.plot(X_sorted,
             lin_reg.predict(X_sorted),
             color='blue',
             label='Linear')

    # Plotting predicted values via polynomial regression
    plt.plot(X_sorted,
             lin_reg2.predict(poly_reg.transform(X_sorted)),
             color='green',
             label='Polynomial')

    plt.legend()

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

Note that the train and test sets have been re-merged so as to properly perform the polynomial linear regression.

### 3.4 Support Vector Regression

Similar to Polynomial Linear Regression. A kernel is chosen and the algorithm is invoked accordingly. Below is the code for the SVR API: 

In [1]:
# SUPPORT VECTOR REGRESSION 
def supportVectorRegression(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Re-merges the
    train and test parts, standardises column 1 of X and of Y, fits an
    SVR with a polynomial kernel and plots its predictions over the
    actual points. Saves the figure and returns the path of the saved jpg.

    NOTE(review): column 0 of every array is skipped; presumably it is
    the row index written by to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    mpl.rcParams['figure.figsize'] = [11, 6]

    # SVR doesn't need a train test split, so the individual components 
    # are combined to form the original dataset (except now its pre-processed)
    X_combined = np.r_[column_one(Xtrain), column_one(Xtest)]
    Y_combined = np.r_[column_one(Ytrain), column_one(Ytest)]

    sc_X = StandardScaler()
    sc_Y = StandardScaler()
    X_combined = sc_X.fit_transform(X_combined.reshape(-1, 1))
    Y_combined = sc_Y.fit_transform(Y_combined.reshape(-1, 1))

    # Creating an SVR regressor; the target is passed as a 1-D vector
    svr_reg = SVR(kernel='poly')
    svr_reg.fit(X_combined, Y_combined.ravel())

    # Start from a fresh figure so earlier plots are not drawn over
    plt.figure()
    plt.title(f'Support Vector Regression for {dataSet}')

    # Scattering actual results
    plt.scatter(X_combined, Y_combined, color='blue', label='Actual')

    # Plotting predicted values via support vector regression. The
    # predictions are computed for the sorted x values, so each one is
    # drawn at the x it belongs to.
    X_sorted = np.sort(X_combined, axis=0)
    plt.plot(X_sorted,
             svr_reg.predict(X_sorted),
             color='orange',
             label='Support Vector')

    plt.legend()

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

## 4. Classification 

### 4.1 K Nearest Neighbours 

A classification algorithm which returns a confusion matrix to illustrate the accuracy and loss of the algorithm:

In [None]:
# KNN
def K_Nearest_Neighbours(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a
    5-nearest-neighbour classifier (Euclidean distance) on the train
    set, plots its confusion matrix on the test set, saves the figure
    and returns the path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Creating KNN Classifier & Fitting on to Train Set 
    classifier_K = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier_K.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix of the KNN classifier itself.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier_K, XTest, YTest)

    plt.title(f'K Nearest Neighbours Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 4.2 Support Vector Machine (SVM)

Given two classes of points, SVM will find a line to separate the two of them and then classify any new data points depending on which side of the line they fall on. It picks the line using the max-margin method: the line which has the maximum equidistant margin from two points is chosen as the final line. The two points from which it is equidistant act as the support points (or support vectors), hence the name of this ML model: Support Vector Machine. 

In [None]:
# Support Vector Machine (SVM)
def Support_Vector_Machine(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a Support
    Vector Machine with a linear kernel on the train set, plots its
    confusion matrix on the test set, saves the figure and returns the
    path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Create SVM Classifier 
    classifier = SVC(kernel='linear', random_state=0)

    # Fit Classifier on to Data 
    classifier.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix of the linear classifier itself.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier, XTest, YTest)

    plt.title(f'Support Vector Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 4.3 Kernel Support Vector Machine

This algorithm is used when SVM won't do. SVM only works when the data is already linearly separable; when that is not the case, K-SVM should be used. Using a Gaussian kernel, K-SVM maps the input dataset to an additional dimension so that the inputted data becomes linearly separable. 

In [None]:
# Kernel-Support Vector Machine (SVM)
def Kernel_Support_Vector_Machine(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a Support
    Vector Machine with a Gaussian (rbf) kernel on the train set, plots
    its confusion matrix on the test set, saves the figure and returns
    the path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Creating K-SVM Classifier with Gaussian Kernel
    classifier = SVC(kernel='rbf', random_state=0)

    # Fit Classifier on to Data 
    classifier.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix of the fitted classifier.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier, XTest, YTest)

    plt.title(f'Kernel-Support Vector Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 4.4 Naive Bayes

Naive Bayes is a probabilistic algorithm that’s typically used for classification problems. Naive Bayes is simple, intuitive, and yet performs surprisingly well in many cases.

In [None]:
# Naive Bayes
def Naive_Bayes(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a Gaussian
    Naive Bayes classifier (based on Bayes' theorem) on the train set,
    plots its confusion matrix on the test set, saves the figure and
    returns the path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Creating Naive Bayes Classifier
    classifier = GaussianNB()

    # Fit classifier to training set
    classifier.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier, XTest, YTest)

    plt.title(f'Naive Bayes Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 4.5 Decision Tree Classification

Decision tree builds classification or regression models in the form of a tree structure. It breaks down a dataset into smaller and smaller subsets while at the same time an associated decision tree is incrementally developed. The final result is a tree with decision nodes and leaf nodes.

In [None]:

# Decision Tree Classification
def Decision_Tree_Classfication(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a Decision
    Tree classifier on the train set, plots its confusion matrix on the
    test set, saves the figure and returns the path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Creating a Decision Tree Classifer with a 0 random state
    classifier = DecisionTreeClassifier(random_state=0)

    # Fit Classifier on to Data 
    classifier.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier, XTest, YTest)

    plt.title(f'Decision Tree Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

### 4.6 Random Forest Classification

An ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees at training time. For classification tasks, the output of the random forest is the class selected by most trees. For regression tasks, the mean or average prediction of the individual trees is returned.

In [None]:
# Random Forest Classification
def Random_Forest_Classfication(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Fits a Random
    Forest classifier on the train set, plots its confusion matrix on
    the test set, saves the figure and returns the path of the saved jpg.

    Only column 1 of each array is used; column 0 is skipped.
    NOTE(review): column 0 is presumably the row index written by
    to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return np.array(arr[:, :].transpose()[1:, :].tolist()[0])

    # Formatting the dataSets for analysis
    XTrain, XTest = column_one(Xtrain), column_one(Xtest)
    YTrain, YTest = column_one(Ytrain), column_one(Ytest)

    # Values are turned into integer codes with LabelEncoder. One encoder
    # per variable is fitted on train and test together, so a value gets
    # the same code in both splits.
    x_enc = preprocessing.LabelEncoder().fit(np.concatenate([XTrain, XTest]))
    y_enc = preprocessing.LabelEncoder().fit(np.concatenate([YTrain, YTest]))
    XTrain = x_enc.transform(XTrain).reshape(-1, 1)
    XTest = x_enc.transform(XTest).reshape(-1, 1)
    YTrain = y_enc.transform(YTrain)
    YTest = y_enc.transform(YTest)

    # Creating a Random Forest Classifier with a 0 random state
    classifier = RandomForestClassifier(random_state=0)

    # Fit Classifier on to Data 
    classifier.fit(XTrain, YTrain)

    # Plotting the Confusion Matrix.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2.
    plot_confusion_matrix(classifier, XTest, YTest)

    plt.title(f'Random Forest Classification for {dataSet}')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x

## 5. Clustering

### 5.1 K-Means Clustering

K-means clustering is a method of vector quantization, originally from signal processing, that aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean.

In [None]:
#K-Means Clustering
def kMeansClustering(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Re-merges
    column 1 of the train and test X arrays, runs K-Means for 1 up to 10
    clusters and plots the within-cluster sum of squares (WCSS) against
    the number of clusters. Saves the figure and returns the path of the
    saved jpg.

    Ytest and Ytrain are accepted for a uniform signature but not used.
    NOTE(review): column 0 of X is skipped; presumably it is the row
    index written by to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    # Values are kept as floats: casting to int would truncate them
    X_combined = np.array(np.r_[column_one(Xtrain), column_one(Xtest)],
                          dtype=float).reshape(-1, 1)

    # K-Means cannot form more clusters than there are samples
    cluster_counts = list(range(1, min(10, len(X_combined)) + 1))

    # Within Cluster sum of Squares
    wcss = []
    for n_clusters in cluster_counts:
        kMeans = KMeans(n_clusters=n_clusters, init='k-means++',
                        max_iter=300, n_init=10, random_state=0)
        kMeans.fit(X_combined)
        wcss.append(kMeans.inertia_)

    # Start from a fresh figure so earlier plots are not drawn over
    plt.figure()
    plt.plot(cluster_counts, wcss)
    plt.title(f'K-Means Clustering for {dataSet}')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x 

### 5.2 Hierarchical Clustering

Hierarchical clustering is an algorithm that groups similar objects into groups called clusters. The endpoint is a set of clusters, where each cluster is distinct from each other cluster, and the objects within each cluster are broadly similar to each other.

In [None]:
#Hierarchical Clustering
def hierarchicalClustering(Xtest, Xtrain, Ytest, Ytrain, dataSet):
    '''
    Takes pre processed data and the name of the dataSet. Re-merges
    column 1 of the train and test X arrays and draws the dendrogram of
    a Ward-linkage hierarchical clustering, from which the number of
    clusters can be read. Saves the figure and returns the path of the
    saved jpg.

    Ytest and Ytrain are accepted for a uniform signature but not used.
    NOTE(review): column 0 of X is skipped; presumably it is the row
    index written by to_csv -- confirm against the caller.
    '''

    def column_one(arr):
        # Row 1 of the transpose == column 1 of the 2-D input
        return arr[:, :].transpose()[1:, :].tolist()[0]

    # Values are kept as floats: casting to int would truncate them
    X_combined = np.array(np.r_[column_one(Xtrain), column_one(Xtest)],
                          dtype=float).reshape(-1, 1)

    # Start from a fresh figure so earlier plots are not drawn over
    plt.figure()

    #Using a Dendrogram to find the optimal number of clusters.
    # The ward method aims to minimize the variance among the clusters
    sch.dendrogram(sch.linkage(X_combined, method='ward'))

    plt.title(f'Hierarchical Clustering for {dataSet}')
    plt.ylabel('Euclidean Distances')

    filename = f'{random.randint(100,999)}'
    x = f'../QuickML/webapp/static/{filename}.jpg'
    plt.savefig(x)
    # Release the figure once it is on disk
    plt.close()

    return x 

## 6. Deep Learning