## 2.4.2 Based on the 300 features selected by ANOVA F-test.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("permission-based_malware_2.csv") #  read a CSV file named into a DataFrame called df
target = "CLASS" # target variable or outcome variable, which represents the variable that you're trying to predict or model.

In [3]:
df.shape # This will output a tuple with two elements: the number of rows and the number of columns in the DataFrame.

(1168, 942)

In [4]:
df.info() # info() method in pandas DataFrame provides a concise summary of the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Columns: 942 entries, NAME to CLASS
dtypes: int64(941), object(1)
memory usage: 8.4+ MB


In [5]:
df[target].value_counts() #  print the counts of each unique value in the target column of the DataFrame.

CLASS
1    602
0    566
Name: count, dtype: int64

In [6]:
X = df.loc[: , df.columns!=target] # X will contain all the features (columns) of the DataFrame df except for the target column
X = X.loc[: , X.columns!="NAME"]
y = df[target] # y will contain the target column, which represents the target variable.

# Feature selection

In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def anova_selection(X, y, num_of_feat):
    fs = SelectKBest(score_func=f_classif, k=num_of_feat)
    fit = fs.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Best_columns', 'Score_anova']

    lyst = featureScores.nlargest(num_of_feat, 'Score_anova')

    anova_features = list(lyst['Best_columns'])
    return anova_features

# Load your dataset
df = pd.read_csv("permission-based_malware_2.csv")
target = "CLASS"

X = df.loc[:, df.columns != target]
X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(50, len(X.columns), 50):
    print("Number of columns: ", i)
    anova_list = anova_selection(X, y, i)

    X_ = df.loc[:, anova_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=186, max_depth=6, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")

Number of columns:  50


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9102564102564102
Training Accuracy:  0.9550321199143469
___________________________________________________________________________________________
Number of columns:  100


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.905982905982906
Training Accuracy:  0.9700214132762313
___________________________________________________________________________________________
Number of columns:  150


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9017094017094017
Training Accuracy:  0.9785867237687366
___________________________________________________________________________________________
Number of columns:  200


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.905982905982906
Training Accuracy:  0.9817987152034261
___________________________________________________________________________________________
Number of columns:  250


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9145299145299145
Training Accuracy:  0.9860813704496788
___________________________________________________________________________________________
Number of columns:  300


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9316239316239316
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  350


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  400


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  450


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9316239316239316
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  500


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9316239316239316
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  550


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9230769230769231
Training Accuracy:  0.9882226980728052
___________________________________________________________________________________________
Number of columns:  600


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  650


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  700


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________
Number of columns:  750


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9925053533190579
___________________________________________________________________________________________
Number of columns:  800


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9925053533190579
___________________________________________________________________________________________
Number of columns:  850


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9273504273504274
Training Accuracy:  0.9903640256959315
___________________________________________________________________________________________
Number of columns:  900


  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


Testing Accuracy:  0.9316239316239316
Training Accuracy:  0.9892933618843683
___________________________________________________________________________________________


## Function Call

In [8]:
selected_feature_list = anova_selection(X,y,300) 
selected_feature_list

  29  30  31  33  37  38  39  40  41  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  75  76  77  78  79  80  82  86  88  89  90  92  93  95  96
  97 100 106 108 109 110 112 113 115 116 117 118 119 123 125 126 127 129
 130 132 133 135 137 138 139 140 141 142 143 144 146 147 150 151 152 154
 156 157 159 164 166 167 169 170 172 173 174 175 176 178 179 180 181 182
 183 184 185 186 187 189 190 192 193 194 195 199 200 201 203 204 205 206
 207 209 210 211 212 213 215 216 217 218 219 220 222 225 226 227 230 235
 238 240 241 242 243 244 245 246 247 248 249 250 251 252 256 257 259 260
 261 262 263 265 266 267 268 269 272 273 275 277 278 279 284 285 286 287
 288 290 292 295 297 298 299 301 302 303 305 307 308 309 314 315 316] are constant.
  f = msb / msw


['android.permission.SEND_SMS',
 'android.permission.RECEIVE_SMS',
 'android.permission.READ_SMS',
 'android.permission.RECEIVE_BOOT_COMPLETED',
 'android.permission.READ_PHONE_STATE',
 'android.permission.SYSTEM_ALERT_WINDOW',
 'android.permission.WRITE_SMS',
 'android.permission.CHANGE_NETWORK_STATE',
 'android.permission.GET_TASKS',
 'android.permission.READ_CONTACTS',
 'android.permission.FOREGROUND_SERVICE',
 'android.permission.CHANGE_WIFI_STATE',
 'android.permission.CALL_PHONE',
 'android.permission.DISABLE_KEYGUARD',
 'android.permission.WRITE_SETTINGS',
 'android.permission.WAKE_LOCK',
 'android.permission.REQUEST_IGNORE_BATTERY_OPTIMIZATIONS',
 'android.permission.REQUEST_DELETE_PACKAGES',
 'com.android.vending.BILLING',
 'android.permission.INSTALL_PACKAGES',
 'android.permission.MOUNT_UNMOUNT_FILESYSTEMS',
 'android.permission.ACTION_MANAGE_OVERLAY_PERMISSION',
 'com.google.android.c2dm.permission.RECEIVE',
 'android.permission.ACCESS_WIFI_STATE',
 'com.google.android.fins

In [9]:
# Selecting only the features that were identified as significant through the Univariate Feature Selection
X = df.loc[:,selected_feature_list] # Selects columns from the DataFrame df and stores them in the variable X
y = df[target] # Assigns the target variable, which is typically the variable you are trying to predict

In [10]:
X

Unnamed: 0,android.permission.SEND_SMS,android.permission.RECEIVE_SMS,android.permission.READ_SMS,android.permission.RECEIVE_BOOT_COMPLETED,android.permission.READ_PHONE_STATE,android.permission.SYSTEM_ALERT_WINDOW,android.permission.WRITE_SMS,android.permission.CHANGE_NETWORK_STATE,android.permission.GET_TASKS,android.permission.READ_CONTACTS,...,android.permission.FORCE_STOP_PACKAGES,com.huawei.launcher2.permission.READ_SETTINGS,android.permission.RAISED_THREAD_PRIORITY,com.lenovo.launcher.permission.WRITE_SETTINGS,android.permission.PROCESS_INCOMING_CALLS,com.huawei.launcher3.permission.READ_SETTINGS,android.permission.USE_BIOMETRIC,android.permission.ACCESS_LOCATION_EXTRA_COMMANDS,android.permission.BIND_ACCESSIBILITY_SERVICE,android.permission.READ_SYNC_STATS
0,1,1,1,1,1,0,1,1,0,1,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1164,1,1,1,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1165,1,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1166,1,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split # Imports the train_test_split function from scikit-learn's model_selection module.

#  Splits features (X) and target variable (y) into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [12]:
y_train.value_counts()

CLASS
1    476
0    458
Name: count, dtype: int64

# BruteForce Tuning for RandomForest

## Default Param

In [13]:
from sklearn.ensemble import RandomForestClassifier # Importing RandomForestClassifier from scikit-learn.
rf_default = RandomForestClassifier() # Creating an instance of RandomForestClassifier with default hyperparameters.
rf_default.fit(X_train, y_train) # Fitting the RandomForestClassifier model to the training data (X_train, y_train)
y_pred_test=rf_default.predict(X_test) # Using the trained RandomForestClassifier model (rf_default) to make predictions on the test data (X_test).

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # Importing necessary evaluation metrics from scikit-learn.

# Print confusion matrix, classification report, and accuracy
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
print("\nAccuracy:", accuracy_score(y_test, y_pred_test))

Confusion Matrix:
[[103   5]
 [ 13 113]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       108
           1       0.96      0.90      0.93       126

    accuracy                           0.92       234
   macro avg       0.92      0.93      0.92       234
weighted avg       0.93      0.92      0.92       234


Accuracy: 0.9230769230769231


In [14]:
# Creating an instance of the RandomForestClassifier class
rf_all = RandomForestClassifier(n_estimators=11,max_depth=26,random_state=0) 
rf_all.fit(X_train, y_train) # trains (fits) the RandomForestClassifier model on the training data.
y_pred=rf_all.predict(X_test) # uses the trained RandomForestClassifier model (rf_all) to make predictions on the test data (X_test).
print("Accurecy: ",accuracy_score(y_test,y_pred))

Accurecy:  0.905982905982906


In [15]:
rf_all = RandomForestClassifier(n_estimators=11,max_depth=26,random_state=0)
rf_all.fit(X_train, y_train)
y_pred=rf_all.predict(X_train) # predicts the target labels (y_pred) for the training data (X_train) using the trained rf_all model.
print("Accurecy: ",accuracy_score(y_train,y_pred))

Accurecy:  0.9764453961456103


# BruteForce Tuning for GradientBoosting

## Default Param

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gb_default = GradientBoostingClassifier()
gb_default.fit(X_train, y_train)
y_pred_test=gb_default.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print confusion matrix, classification report, and accuracy
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
print("\nAccuracy:", accuracy_score(y_test, y_pred_test))

Confusion Matrix:
[[105   3]
 [ 17 109]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       108
           1       0.97      0.87      0.92       126

    accuracy                           0.91       234
   macro avg       0.92      0.92      0.91       234
weighted avg       0.92      0.91      0.91       234


Accuracy: 0.9145299145299145


In [17]:
gb = GradientBoostingClassifier(n_estimators=186,max_depth=6,random_state=0)
gb.fit(X_train, y_train)
y_pred=gb.predict(X_test)
print("Accurecy: ",accuracy_score(y_test,y_pred))

Accurecy:  0.9316239316239316


In [18]:
gb = GradientBoostingClassifier(n_estimators=186,max_depth=6,random_state=0)
gb.fit(X_train, y_train)
y_pred =gb.predict(X_train)
print("Accurecy: ",accuracy_score(y_train,y_pred))

Accurecy:  0.9892933618843683


# BruteForce Tuning for XGBoost

## Default Param

In [19]:
import xgboost as xgb
xg_default = xgb.XGBClassifier() # Create an instance of XGBoost classifier
xg_default.fit(X_train, y_train) # Train the classifier
y_pred_test = xg_default.predict(X_test) # Predict on the test data

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print confusion matrix, classification report, and accuracy
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
print("\nAccuracy:", accuracy_score(y_test, y_pred_test))

Confusion Matrix:
[[102   6]
 [ 13 113]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       108
           1       0.95      0.90      0.92       126

    accuracy                           0.92       234
   macro avg       0.92      0.92      0.92       234
weighted avg       0.92      0.92      0.92       234


Accuracy: 0.9188034188034188


In [20]:
xg = xgb.XGBClassifier(n_estimators=41,max_depth=6,random_state=0)
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test)
print("Accurecy: ",accuracy_score(y_test,y_pred))

Accurecy:  0.9273504273504274


In [21]:
xg = xgb.XGBClassifier(n_estimators=41,max_depth=6,random_state=0)
xg.fit(X_train, y_train)
y_pred = xg.predict(X_train)
print("Accurecy: ",accuracy_score(y_train,y_pred))

Accurecy:  0.9668094218415417


# BruteForce Tuning for boost-histogram

## Default Param

In [22]:
from sklearn.ensemble import HistGradientBoostingClassifier
hg_default = HistGradientBoostingClassifier() 
hg_default.fit(X_train, y_train) 
y_pred_test = hg_default.predict(X_test) 

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
print("\nAccuracy:", accuracy_score(y_test, y_pred_test))

Confusion Matrix:
[[101   7]
 [ 12 114]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       108
           1       0.94      0.90      0.92       126

    accuracy                           0.92       234
   macro avg       0.92      0.92      0.92       234
weighted avg       0.92      0.92      0.92       234


Accuracy: 0.9188034188034188


In [23]:
hg = HistGradientBoostingClassifier(max_iter=51,max_depth=16,random_state=0)
hg.fit(X_train, y_train)
y_pred = hg.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,y_pred))

Accuracy:  0.9188034188034188


In [24]:
hg = HistGradientBoostingClassifier(max_iter=51,max_depth=16,random_state=0)
hg.fit(X_train, y_train)
y_pred = hg.predict(X_train)
print("Accuracy: ",accuracy_score(y_train,y_pred))

Accuracy:  0.9518201284796574
