# Team information
|S. No|Name|email|
|-|-|-|
|1|Mathews Roy|psymr3@nottingham.ac.uk|
|2|Ewan Ross|psyer1@nottingham.ac.uk|
|3|Soham Talukdar|ppxst3@nottingham.ac.uk|
|4|Srushanth Baride|ppxsb5@nottingham.ac.uk|

# Resources used
[pyspark: Extracting, transforming and selecting features](https://spark.apache.org/docs/latest/ml-features)<br>
[sklearn mutual_info_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html)<br>
[sklearn chi2](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html?highlight=chi2#sklearn.feature_selection.chi2)

# Data processing & plot libraries

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn import metrics
from tabulate import tabulate
from datetime import timedelta
import matplotlib.pyplot as plt

# pyspark ML libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import UnivariateFeatureSelector
from pyspark.ml.feature import ChiSqSelector, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier as pyspark_DecisionTreeClassifier

# sklearn ML libraries

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_regression
from sklearn.tree import DecisionTreeClassifier as sklearn_DecisionTreeClassifier

In [None]:
# Start (or reuse) a local Spark session that runs on all available cores
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MLlib lab")
    .getOrCreate()
)

In [None]:
# Path to the Leukemia csv file (read into a spark DataFrame in the next cell)
_leukemia_dataset_file = "/mnt/the-data-transformers/Leukemia_GSE9476.csv"

In [None]:
# Read data from the Leukemia csv file:
# - the reader's "maxColumns" option is set to 22285
# - header=True because the csv file contains a header row
csv_reader = spark.read.option("maxColumns", 22285)
sparkDF = csv_reader.csv(_leukemia_dataset_file, header=True)

In [None]:
# Converting spark DataFrame to pandas DataFrame for easy processing
# (toPandas brings every row of the dataset into local memory)
pandasDF = sparkDF.toPandas()
# Preview the first rows as the cell output
pandasDF.head()

Unnamed: 0,samples,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,1438_at,1487_at,1494_f_at,1598_g_at,160020_at,1729_at,1773_at,177_at,179_at,1861_at,200000_s_at,200001_at,200002_at,200003_s_at,200004_at,200005_at,200006_at,200007_at,200008_s_at,200009_at,200010_at,200011_s_at,200012_x_at,200013_at,200014_s_at,200015_s_at,200016_x_at,200017_at,...,AFFX-LysX-M_at,AFFX-M27830_3_at,AFFX-M27830_5_at,AFFX-M27830_M_at,AFFX-PheX-3_at,AFFX-PheX-5_at,AFFX-PheX-M_at,AFFX-r2-Bs-dap-3_at,AFFX-r2-Bs-dap-5_at,AFFX-r2-Bs-dap-M_at,AFFX-r2-Bs-lys-3_at,AFFX-r2-Bs-lys-5_at,AFFX-r2-Bs-lys-M_at,AFFX-r2-Bs-phe-3_at,AFFX-r2-Bs-phe-5_at,AFFX-r2-Bs-phe-M_at,AFFX-r2-Bs-thr-3_s_at,AFFX-r2-Bs-thr-5_s_at,AFFX-r2-Bs-thr-M_s_at,AFFX-r2-Ec-bioB-3_at,AFFX-r2-Ec-bioB-5_at,AFFX-r2-Ec-bioB-M_at,AFFX-r2-Ec-bioC-3_at,AFFX-r2-Ec-bioC-5_at,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-Hs18SrRNA-3_s_at,AFFX-r2-Hs18SrRNA-5_at,AFFX-r2-Hs18SrRNA-M_x_at,AFFX-r2-Hs28SrRNA-3_at,AFFX-r2-Hs28SrRNA-5_at,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,Bone_Marrow_CD34,7.74524500119505,7.81121007534134,6.47791635228852,8.84150572057594,4.5469405821895,7.95771415034381,5.34499901190722,4.67336385511613,4.66492407696435,4.0696243853938,7.07428341012836,7.43807285615198,6.82183168083122,8.36021094261399,7.58774421042457,7.23850067108989,5.73440122891825,5.86572093709168,9.29510364279376,6.63699583167985,9.75554096112187,10.2125037592264,12.4072510631567,12.9772829451237,10.9494104971535,10.6114859992921,11.3820531662505,11.3428916400888,10.3024054117681,11.28586083841,12.5233250319206,9.14086773983838,12.8443282271805,12.5966271258493,10.3265400494895,9.21371508610917,12.6044684133243,12.3732711676939,...,4.56810259616506,7.61953363806961,8.34338888891647,9.7325175304411,6.09929797188769,4.00051144053717,3.78706799496998,3.62947372237495,3.91346154545267,3.82770159268882,4.02977682303784,3.76819617607289,4.09208790442887,4.3890216416854,3.89877941641535,4.09723211007773,4.51770015184237,5.36489656987136,4.28055238311876,7.46681436568113,7.07133516607174,7.80009077238972,8.66845438091539,8.68213083954868,10.7064933156398,10.8338650824032,7.72691937010064,7.78971294759834,6.32169978256436,9.61423038707479,5.05884906355157,6.81000356185228,12.8000601994153,12.7186115474035,5.39151157002182,4.66616566659989,3.97475949012144,3.65669311996003,4.16062155445183,4.13924884553532
1,12,Bone_Marrow_CD34,8.08725200850986,7.24067331083678,8.58464840530855,8.9835712260236,4.54893416449282,8.01165205655946,5.57964654533655,4.82818367944698,5.17183525709646,4.29987530696569,7.142593637847,7.52439087254722,7.13038907536095,8.41571429416201,7.58387260965911,7.31387005928921,5.97608435809263,6.16470994790605,9.446879444909,6.59987212106388,9.42375402925316,10.010800545985,12.4112763737065,12.7584342618747,10.4515031583186,10.1768351079433,10.6661052363202,10.7482831854979,9.70525247873508,10.8609362415014,11.9002552907414,8.87945392257804,12.775879673495,12.3702245325833,9.62232802727872,8.92203663914082,12.3001249211742,11.9834172623061,...,4.63979155692597,7.43993608320862,7.43925373897001,9.42579421129795,6.10773008838127,4.12010603519148,3.83482621636727,3.79332299583567,3.8056081256028,3.71303638021025,4.63218716052627,3.74274573383988,4.19386422436012,4.73000410653151,3.82446524836006,4.01356134285356,4.58505780521766,5.2825917469801,4.29560470836821,7.12181903219694,6.79467969148148,7.54675447918691,8.61932295025616,8.77950473589525,10.0786165644472,10.064735211129,6.97836035256138,6.60425566516114,5.53143171927693,8.04843784954171,4.43615259371806,6.75147094561877,12.4727057814784,12.3335932023584,5.37973825546594,4.65678572080881,4.18834812377709,3.79253460224534,4.2044138400184,4.12269983763514
2,13,Bone_Marrow_CD34,7.79205557755844,7.54936775520293,11.053504051356,8.90970256236097,4.54932764667886,8.23709920681109,5.40648931670832,4.61557197116622,4.77570863437093,4.14836336994092,7.07847160700501,7.59399992894581,6.9795583755533,8.44339533977746,7.64149559172346,7.17456050336123,5.95576928535164,6.03318837400037,9.34103675360888,6.18127796815632,9.85641410609214,10.4274376931306,12.3573736273559,12.705206179494,11.0325275910147,10.4907207081896,10.8521781256771,10.9689822271543,10.0711288534275,11.1064055589694,12.2283915065571,9.04162096649058,12.7298985273788,12.4833517267962,9.7231350573522,9.44438374568604,12.4382890584936,12.1390899832743,...,4.58792836701933,7.45242089921527,6.06737395040743,9.15706445497547,6.14319053007101,4.02541709295029,3.75813504414899,3.80871656173354,3.83220996422785,3.7383255803542,4.1027772483511,3.60632340547405,4.12952124467474,4.60900740372503,3.69116063180779,4.08574400564718,4.47897451011795,5.13318563170261,4.31681695554338,7.39169236847354,6.86557505511674,7.62299922838811,8.62890522812406,8.69350120754393,10.6237869335947,10.677363223546,7.09425724170035,6.13659565597598,5.09707437473784,6.46766851262663,4.39206061104228,6.08629542334857,12.6373840031888,12.4990375684941,5.31660434964732,4.60056619131529,3.84556141253046,3.63571499403405,4.17419911761064,4.06715189177125
3,14,Bone_Marrow_CD34,7.76726466863097,7.09446004508613,11.8164333464643,8.9946544133444,4.697018225266,8.28341179301214,5.58219459657822,4.90368448782678,4.82984360979774,4.07549383376332,7.19661717318745,7.69327032883008,7.07056516716235,8.60365795819115,7.82405403500637,7.1673708127293,6.06140329303219,5.91030509219223,9.38023742985381,6.01066697944839,9.48719234078634,10.0290033184881,12.2737408257126,12.8139172726164,10.9542040792533,10.7318667767293,11.1614607987322,10.9249302398535,9.8493886878541,11.3034997792357,12.3664271954564,8.81679600959437,12.7480155206638,12.6254642907138,9.60997725678194,9.22651866132997,12.5225404143385,12.1826114286278,...,4.61718820507364,7.85147636604127,6.68279391569737,9.76343051455222,6.45215239938105,4.17468033762452,4.07643514837084,3.88953645269351,3.8040422506116,3.72322852057827,4.04766166047092,3.80241201595626,3.97989036447862,4.56031116222802,3.68298275884523,4.20792591966885,4.64375737301378,5.24903177088255,4.38530448621545,8.11543573369272,7.7537500603029,8.49890819424651,9.77851956264085,9.97058107708337,11.1025642653836,11.3967008935686,7.39236479874729,6.11953447821019,5.45397498906731,7.21890397647587,4.63333402111035,6.37599124562392,12.9036297504518,12.8714539953803,5.17995144927903,4.64195164611807,3.99163406360053,3.70458705244103,4.14993831446596,3.9101497118226
4,15,Bone_Marrow_CD34,8.01011677792405,7.40528111947315,6.65604868129464,9.05068187803522,4.51498576534719,8.37704633235136,5.493713480124,4.86075384755779,5.24504945079212,4.05207692094415,7.1570696899323,7.64061505809919,7.20558285897521,8.60064675115285,7.70150130543936,7.05710318370555,5.87672644837064,5.8452624080297,9.29781003600266,6.16177076070711,9.33631731195455,9.72564077663148,12.3817455031045,12.9033858733425,10.8830259034356,10.5274062240766,10.9485662485616,10.9631367967683,10.1074895861276,11.1182633262887,12.3327550776681,8.90711288887104,12.8057224926877,12.5779595812684,9.66095133142712,9.21927641415546,12.4522489603886,12.1352118839762,...,4.58179008039791,7.68882071904775,8.79232261211742,10.1528370256383,5.9589116223931,3.95731481091698,3.99013769194152,3.65670372757778,3.83930491330172,3.72118065532895,3.97696673208843,3.83930491330172,3.99067819291877,4.34148406503252,3.88832736561286,4.07734874473025,4.39824309009117,5.26793079128395,4.36548989790735,7.54065144779017,7.19180661946103,7.76364558527208,9.16431725920415,9.11063869248467,10.9192206412114,10.7972972509615,8.6813514456684,7.68361487633206,6.34531055632495,8.82719226369142,5.30519187665136,6.70045349076342,12.9493522755966,12.7825147956718,5.34168947217255,4.56031502631325,3.88702024770358,3.62985266486149,4.12751339464474,4.00431605125524


In [None]:
# Features: every column except the "type" label
X = sparkDF.drop('type')
# Labels: the "type" column on its own
y = sparkDF.select('type')

In [None]:
# Collect the spark rows locally and turn them into numpy arrays:
# feature values are converted to float, labels are kept as strings
X_rows = np.array(X.collect()).astype(float)
y_rows = np.array(y.collect()).astype(str)

In [None]:
# Standardise every feature column with sklearn's preprocessing.scale.
# X_rows is already a float array (it was built with .astype(float)), so it
# is passed straight through instead of being copied into an object-dtype
# array first.
X_rows = preprocessing.scale(X_rows)

In [None]:
def isNaN(num):
    '''
    Function: isNaN
    Report whether a value is NaN (not-a-number).

    NaN is the only float value that does not compare equal to itself,
    so the self-inequality test below is true for NaN and false for
    ordinary numbers, strings and None.

    INPUT:
    ------
    num: any type

    OUTPUT:
    -------
    1. boolean value (true if num is NaN)
    '''
    return num != num

In [None]:
valid_X = []
valid_y = []

# rowID is the 1-based position of the row currently being inspected
rowID = 0

for rowID, instance in enumerate(X_rows, start=1):
  # any() stops at the first NaN, so a bad row is reported exactly once
  contains_nan = any(isNaN(value) for value in instance)

  if contains_nan:
    print(f"row {rowID} is invalid")
    continue

  # Keep the feature row together with the label at the same position
  valid_X.append(instance)
  valid_y.append(y_rows[rowID - 1])

In [None]:
# Rebuild pandas DataFrames from the rows that passed the NaN check
preprocessed_X = pd.DataFrame(valid_X)
preprocessed_y = pd.DataFrame(valid_y)
# Attach the labels to the feature frame as an extra 'output' column
preprocessed_X['output'] = preprocessed_y

[Code reference](https://medium.com/analytics-vidhya/removing-outliers-understanding-how-and-what-behind-the-magic-18a78ab480ff)

In [None]:
def check_outliers_std(dataframe, col_name):
    '''
    Function: check_outliers_std
    Tell whether a column holds at least one value outside the bounds
    returned by determine_outlier_thresholds_std.

    INPUT:
    ------
    dataframe: pandas DataFrame (indexed with dataframe[col_name])
    col_name: column name

    OUTPUT:
    -------
    1. boolean value, true if outliers exist
    '''
    lower_boundary, upper_boundary = determine_outlier_thresholds_std(dataframe, col_name)
    column = dataframe[col_name]
    is_outlier = (column > upper_boundary) | (column < lower_boundary)
    # Test the mask itself: checking the truthiness of the selected rows'
    # values would miss outlier rows whose values happen to be zero.
    return bool(is_outlier.any())

In [None]:
def determine_outlier_thresholds_std(dataframe, col_name):
    '''
    Function: determine_outlier_thresholds_std
    Compute the outlier bounds of a column as mean -/+ 2 standard deviations.

    INPUT:
    ------
    dataframe: pandas DataFrame (indexed with dataframe[col_name])
    col_name: column name

    OUTPUT:
    -------
    1. lower and upper bound (float values)
    '''
    column = dataframe[col_name]
    centre = column.mean()
    spread = 2 * column.std()
    return centre - spread, centre + spread

In [None]:
def replace_with_thresholds_std(dataframe, cols, replace=False):
    '''
    Function: replace_with_thresholds_std
    Report, and optionally cap, the values of each column that fall outside
    the bounds returned by determine_outlier_thresholds_std.

    INPUT:
    ------
    dataframe: pandas DataFrame (modified in place when replace is true)
    cols: column names to inspect; the 'output' column is skipped
    replace: boolean flag (replace values if true)

    OUTPUT:
    -------
    1. no return values; replace outliers with lower/upper limit values
       and print the total outlier count followed by a per-column summary table
    '''
    total = 0
    data = []

    for col_name in cols:
        if col_name == 'output':
            continue

        outliers_ = check_outliers_std(dataframe, col_name)
        count = None
        lower_limit, upper_limit = determine_outlier_thresholds_std(dataframe, col_name)

        if outliers_:
            # Build each mask once and reuse it for counting and replacing
            above = dataframe[col_name] > upper_limit
            below = dataframe[col_name] < lower_limit
            count = dataframe.loc[above | below, col_name].count()

            if replace:
                # We don't want to replace with negative values, so the
                # lower side is only capped when the lower limit is >= 0.
                if not lower_limit < 0:
                    dataframe.loc[below, col_name] = lower_limit
                dataframe.loc[above, col_name] = upper_limit

        # Re-check after the (optional) replacement for the summary table
        outliers_status = check_outliers_std(dataframe, col_name)
        data.append([outliers_, outliers_status, count, col_name, lower_limit, upper_limit])

        if count:
            total = total + count

    print(f"Replaced (Total): {total}")

    table = tabulate(data, headers=['Outlier (Previously)','Outliers','Count', 'Column','Lower Limit', 'Upper Limit'], tablefmt='rst', numalign='right')
    print("Removing Outliers using 2 Standard Deviation")
    # The table used to be built and then discarded; show it to the reader
    print(table)

In [None]:
# Column names of the original frame with "type" moved to the end, so the
# order matches preprocessed_X, where the label column was appended last
_columns = list(pandasDF.columns)
_columns.remove("type")
_columns.append("type")

In [None]:
# Restore the original column names on the preprocessed frame.
# Assigning to .columns renames in place on every pandas version;
# set_axis(..., inplace=True) no longer exists in pandas 2.x.
preprocessed_X.columns = _columns

In [None]:
# Encode the string class labels in "type" as integer codes (stored as int32)
LE = LabelEncoder()
preprocessed_X['type'] = LE.fit_transform(preprocessed_X['type'])
preprocessed_X = preprocessed_X.astype({'type': 'int32'})
# From here on pandasDF refers to the preprocessed frame
pandasDF = preprocessed_X

[Curse of Dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)

In [None]:
# Getting the DataFrame shape (rows = samples, columns = features + label)
# The data suffers from the 'curse of dimensionality': the number of features is far greater than the number of samples
pandasDF.shape

In [None]:
# Distinct (already label-encoded) values of the "type" column
bone_marrow_type = pandasDF["type"].unique()

In [None]:
# Converting pandas DataFrame to spark DataFrame
sparkDF = spark.createDataFrame(pandasDF)

# Converting spark DataFrame columns to float (each column keeps its name)
float_columns = [F.col(name).cast("float").alias(name) for name in sparkDF.columns]
sparkDF = sparkDF.select(*float_columns)

In [None]:
# Vectorising the spark DataFrame for easier processing.
# The "type" label is kept out of the assembled vector: including it would
# hand the selectors and classifiers the very value they are asked to predict.
# NOTE(review): "samples" looks like a sample identifier rather than a
# measurement - confirm whether it should be excluded as well.
feature_columns = [name for name in sparkDF.columns if name != "type"]
vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vector_sparkDF = vecAssembler.transform(sparkDF)

In [None]:
# Setting features split: step between successive "number of features to
# select" values tried by the benchmark loops (2, 2 + features_mul, ...)
features_mul = 4000

# spark [ChiSqSelector](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.ChiSqSelector.html)
[ChiSqSelector](https://george-jen.gitbook.io/data-science-and-apache-spark/chisqselector) stands for Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the Chi-Squared test of independence to decide which features to choose.

In [None]:
def spark_ChiSqSelector(i, _vector_sparkDF, seed=42):
  '''
  Function: spark_ChiSqSelector
  Select features with ChiSqSelector, then train and evaluate a pyspark
  DecisionTreeClassifier on a 70/30 split of the selected data.

  INPUT:
  ------
  i: Number of Features to select
  _vector_sparkDF: DataFrame from which the Features will be selected
                   (needs a "features" vector column and a "type" label column)
  seed: seed of the train/test split, so repeated runs give the same split

  OUTPUT:
  -------
  1. Model accuracy
  2. Execution time (seconds) for feature selection, building and evaluating the model
  '''
  # Start time for execution time
  start_time = time.monotonic()

  # Selecting the best i features from the entire dataset
  selector = ChiSqSelector(
    numTopFeatures=i,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="type"
  )
  result = selector.fit(_vector_sparkDF).transform(_vector_sparkDF)
  print(f"Top {selector.getNumTopFeatures()} features selected")

  # Splitting the data into training & testing.
  # The split is seeded: without a seed every call draws a different split
  # and the reported accuracy cannot be reproduced.
  (train, test) = result.randomSplit([0.7, 0.3], seed=seed)

  # Using pyspark DecisionTreeClassifier to define and fit the ML model
  dt = pyspark_DecisionTreeClassifier(labelCol="type", featuresCol="selectedFeatures")
  model = dt.fit(train)

  # Make predictions
  predictions = model.transform(test)

  # Evaluating the predictions
  evaluator = MulticlassClassificationEvaluator(
    labelCol="type",
    predictionCol="prediction",
    metricName="accuracy"
  )
  accuracy = evaluator.evaluate(predictions)
  print(f"Test accuracy = {accuracy}")

  # End time for execution time
  end_time = time.monotonic()

  # Return accuracy and execution time
  return accuracy, timedelta(seconds=end_time - start_time).total_seconds()
  

# spark [UnivariateFeatureSelector](https://spark.apache.org/docs/latest/ml-features#univariatefeatureselector)
[UnivariateFeatureSelector](https://spark.apache.org/docs/latest/ml-features#univariatefeatureselector) operates on categorical/continuous labels with categorical/continuous features. The user sets `featureType` and `labelType`, and Spark picks the score function to use based on the specified `featureType` and `labelType`.

In [None]:
def spark_UnivariateFeatureSelector(i, _vector_sparkDF, seed=42):
  '''
  Function: spark_UnivariateFeatureSelector
  Select features with UnivariateFeatureSelector (continuous features,
  categorical label), then train and evaluate a pyspark
  DecisionTreeClassifier on a 70/30 split of the selected data.

  INPUT:
  ------
  i: Number of Features to select
  _vector_sparkDF: DataFrame from which the Features will be selected
                   (needs a "features" vector column and a "type" label column)
  seed: seed of the train/test split, so repeated runs give the same split

  OUTPUT:
  -------
  1. Model accuracy
  2. Execution time (seconds) for feature selection, building and evaluating the model
  '''
  # Start time for execution time
  start_time = time.monotonic()

  # Selecting the best i features from the entire dataset
  selector = UnivariateFeatureSelector(
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="type",
    selectionMode="numTopFeatures"
  )
  selector.setFeatureType("continuous").setLabelType("categorical").setSelectionThreshold(i)
  result = selector.fit(_vector_sparkDF).transform(_vector_sparkDF)

  print("UnivariateFeatureSelector output with top %d features selected using f_classif" % selector.getSelectionThreshold())

  # Splitting the data into training & testing.
  # The split is seeded: without a seed every call draws a different split
  # and the reported accuracy cannot be reproduced.
  (train, test) = result.randomSplit([0.7, 0.3], seed=seed)

  # Using pyspark DecisionTreeClassifier to define and fit the ML model
  dt = pyspark_DecisionTreeClassifier(labelCol="type", featuresCol="selectedFeatures")
  model = dt.fit(train)

  # Make predictions
  predictions = model.transform(test)

  # Evaluating the predictions
  evaluator = MulticlassClassificationEvaluator(
    labelCol="type",
    predictionCol="prediction",
    metricName="accuracy"
  )
  accuracy = evaluator.evaluate(predictions)
  print(f"Test accuracy = {accuracy}")

  # End time for execution time
  end_time = time.monotonic()

  # Return accuracy and execution time
  return accuracy, timedelta(seconds=end_time - start_time).total_seconds()
  

In [None]:
spark_acc_ChiSqSelector = []
spark_time_ChiSqSelector = []
spark_acc_UnivariateFeatureSelector = []
spark_time_UnivariateFeatureSelector = []

# Number of candidate features: every column of sparkDF except the "type" label.
# (len(vector_sparkDF.columns) would also count the label and the assembled
# "features" column, and would not match the bound used for the plots' x axis.)
n_candidate_features = len(sparkDF.columns) - 1

# Benchmark both spark selectors for each number of features to keep
for i in range(2, n_candidate_features, features_mul):
  acc, exec_time = spark_ChiSqSelector(i, vector_sparkDF)
  spark_acc_ChiSqSelector.append(acc)
  spark_time_ChiSqSelector.append(exec_time)

  acc, exec_time = spark_UnivariateFeatureSelector(i, vector_sparkDF)
  spark_acc_UnivariateFeatureSelector.append(acc)
  spark_time_UnivariateFeatureSelector.append(exec_time)

# sklearn [chi2](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)
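[chi2](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html) computes the Chi-Squared statistic between each non-negative feature and the class label; combined with `SelectKBest` it keeps the highest-scoring features. Like Spark's [ChiSqSelector](https://george-jen.gitbook.io/data-science-and-apache-spark/chisqselector), it operates on labeled data and uses the Chi-Squared test of independence to decide which features to choose.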

In [None]:
# X is features: the preprocessed pandas frame without the "type" column
# (this rebinds X, which previously referred to a spark DataFrame)
feature_names = pandasDF.columns.drop('type')
X = pandasDF[feature_names]

# y is labels: the encoded "type" column
y = pandasDF['type']

In [None]:
def sklearn_chi2(i, _X, _y):
  '''
  Function: sklearn_chi2
  Select features with SelectKBest(chi2), then train and evaluate an
  sklearn DecisionTreeClassifier on a 70/30 split of the selected data.

  INPUT:
  ------
  i: Number of Features to select
  _X: Features
  _y: labels

  OUTPUT:
  -------
  1. Model accuracy
  2. Execution time (seconds) for feature selection, building and evaluating the model
  '''
  # Start time for execution time
  start_time = time.monotonic()

  features = np.asarray(_X, dtype=float)

  # chi2 raises "Input X must be non-negative" on negative values, and the
  # features in this notebook were standardised (centred on zero) earlier.
  # When negatives are present, score a copy rescaled to [0, 1] per feature;
  # input that is already non-negative is scored unchanged.
  scoring_features = features
  if (features < 0).any():
    scoring_features = preprocessing.minmax_scale(features)

  # Selecting the best i features from the entire dataset
  selector = SelectKBest(chi2, k=i).fit(scoring_features, _y)
  # The model is trained on the original (unrescaled) values of the chosen columns
  X_new = features[:, selector.get_support()]

  # Splitting the data into training & testing
  train, test, train_labels, test_labels = train_test_split(X_new, _y, test_size=0.30, random_state=42)

  # Using sklearn DecisionTreeClassifier to define and fit the ML model
  clf = sklearn_DecisionTreeClassifier()
  clf = clf.fit(train, train_labels)

  # Make predictions
  predictions = clf.predict(test)

  # Evaluating the predictions
  accuracy = metrics.accuracy_score(test_labels, predictions)
  print(f"Test accuracy = {accuracy}")

  # End time for execution time
  end_time = time.monotonic()

  # Return accuracy and execution time
  return accuracy, timedelta(seconds=end_time - start_time).total_seconds()

# sklearn [mutual_info_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html?highlight=mutual_info_regression#sklearn.feature_selection.mutual_info_regression)
[Estimate mutual information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html?highlight=mutual_info_regression#sklearn.feature_selection.mutual_info_regression) for a continuous target variable.<br>
Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.<br>
The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances.

In [None]:
def sklearn_mutual_info_regression(i, _X, _y):
  '''
  Function: sklearn_mutual_info_regression
  Select features with SelectKBest(mutual_info_regression), then train and
  evaluate an sklearn DecisionTreeClassifier on a 70/30 split of the
  selected data.

  INPUT:
  ------
  i: Number of Features to select
  _X: Features (pandas DataFrame)
  _y: labels

  OUTPUT:
  -------
  1. Model accuracy
  2. Execution time (seconds) for feature selection, building and evaluating the model
  '''
  # Start time for execution time
  start_time = time.monotonic()

  # Selecting the best i features from the entire dataset
  # NOTE(review): mutual_info_regression targets a continuous label, while
  # the labels here are class codes - consider mutual_info_classif.
  selector = SelectKBest(mutual_info_regression, k=i)
  selector.fit(_X, _y)

  # Keep the selected columns of the frame that was passed in
  # (this used to read the notebook-level X instead of the _X argument)
  X_new = _X.loc[:, selector.get_support()]

  # Splitting the data into training & testing
  train, test, train_labels, test_labels = train_test_split(X_new, _y, test_size=0.30, random_state=42)

  # Using sklearn DecisionTreeClassifier to define and fit the ML model
  clf = sklearn_DecisionTreeClassifier()
  clf = clf.fit(train, train_labels)

  # Make predictions
  predictions = clf.predict(test)

  # Evaluating the predictions
  accuracy = metrics.accuracy_score(test_labels, predictions)
  print(f"Test accuracy = {accuracy}")

  # End time for execution time
  end_time = time.monotonic()

  # Return accuracy and execution time
  return accuracy, timedelta(seconds=end_time - start_time).total_seconds()

In [None]:
sklearn_acc_chi2, sklearn_time_chi2 = [], []
sklearn_acc_mutual_info_regression, sklearn_time_mutual_info_regression = [], []

# Benchmark both sklearn selectors for each number of features to keep
feature_counts = range(2, X.columns.size, features_mul)
for i in feature_counts:
  acc, exec_time = sklearn_chi2(i, X, y)
  sklearn_acc_chi2.append(acc)
  sklearn_time_chi2.append(exec_time)

  acc, exec_time = sklearn_mutual_info_regression(i, X, y)
  sklearn_acc_mutual_info_regression.append(acc)
  sklearn_time_mutual_info_regression.append(exec_time)

# Visualisation

In [None]:
# Numbers of selected features that were benchmarked (shared x axis of the plots)
x_axis = list(range(2, X.columns.size, features_mul))

In [None]:
# Accuracy of all four selectors against the number of selected features
accuracy_curves = [
  (spark_acc_ChiSqSelector, 'spark_ChiSqSelector'),
  (spark_acc_UnivariateFeatureSelector, 'spark_UnivariateFeatureSelector'),
  (sklearn_acc_chi2, 'sklearn_chi2'),
  (sklearn_acc_mutual_info_regression, 'sklearn_mutual_info_regression'),
]
for curve, curve_label in accuracy_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.ylim(0, 1.1)
plt.title("Features vs Accuracy")
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Execution time of all four selectors against the number of selected features
time_curves = [
  (spark_time_ChiSqSelector, 'spark_ChiSqSelector'),
  (sklearn_time_chi2, 'sklearn_chi2'),
  (spark_time_UnivariateFeatureSelector, 'spark_UnivariateFeatureSelector'),
  (sklearn_time_mutual_info_regression, 'sklearn_mutual_info_regression'),
]
for curve, curve_label in time_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.title("Features vs Execution Time")
plt.xlabel('Number of Features')
plt.ylabel('Execution Time in Seconds')
plt.legend()
plt.show()

In [None]:
# Accuracy: spark ChiSqSelector vs sklearn chi2
chi2_accuracy_curves = [
  (spark_acc_ChiSqSelector, 'spark_ChiSqSelector'),
  (sklearn_acc_chi2, 'sklearn_chi2'),
]
for curve, curve_label in chi2_accuracy_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.ylim(0, 1.1)
plt.title("Features vs Accuracy")
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Execution time: spark ChiSqSelector vs sklearn chi2
chi2_time_curves = [
  (spark_time_ChiSqSelector, 'spark_ChiSqSelector'),
  (sklearn_time_chi2, 'sklearn_chi2'),
]
for curve, curve_label in chi2_time_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.title("Features vs Execution Time")
plt.xlabel('Number of Features')
plt.ylabel('Execution Time in Seconds')
plt.legend()
plt.show()

In [None]:
# Accuracy: spark UnivariateFeatureSelector vs sklearn mutual_info_regression
univariate_accuracy_curves = [
  (spark_acc_UnivariateFeatureSelector, 'spark_UnivariateFeatureSelector'),
  (sklearn_acc_mutual_info_regression, 'sklearn_mutual_info_regression'),
]
for curve, curve_label in univariate_accuracy_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.ylim(0, 1.1)
plt.title("Features vs Accuracy")
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Execution time: spark UnivariateFeatureSelector vs sklearn mutual_info_regression
univariate_time_curves = [
  (spark_time_UnivariateFeatureSelector, 'spark_UnivariateFeatureSelector'),
  (sklearn_time_mutual_info_regression, 'sklearn_mutual_info_regression'),
]
for curve, curve_label in univariate_time_curves:
  plt.plot(x_axis, curve, label=curve_label)
plt.title("Features vs Execution Time")
plt.xlabel('Number of Features')
plt.ylabel('Execution Time in Seconds')
plt.legend()
plt.show()