# Logistic Regression (Approach #1) for Microsoft Malware

In [2]:
import pandas as pd, numpy as np, os, gc

FE = ['EngineVersion','AppVersion','AvSigVersion','Census_OSVersion']

OHE = [ 'RtpStateBitfield','IsSxsPassiveMode','DefaultBrowsersIdentifier',
        'AVProductStatesIdentifier','AVProductsInstalled', 'AVProductsEnabled',
        'CountryIdentifier', 'CityIdentifier', 
        'GeoNameIdentifier', 'LocaleEnglishNameIdentifier',
        'Processor', 'OsBuild', 'OsSuite',
        'SmartScreen','Census_MDC2FormFactor',
        'Census_OEMNameIdentifier', 
        'Census_ProcessorCoreCount',
        'Census_ProcessorModelIdentifier', 
        'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName',
        'Census_HasOpticalDiskDrive',
        'Census_TotalPhysicalRAM', 'Census_ChassisTypeName',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_PowerPlatformRoleName', 'Census_InternalBatteryType',
        'Census_InternalBatteryNumberOfCharges',
        'Census_OSEdition', 'Census_OSInstallLanguageIdentifier',
        'Census_GenuineStateName','Census_ActivationChannel',
        'Census_FirmwareManufacturerIdentifier',
        'Census_IsTouchEnabled', 'Census_IsPenCapable',
        'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer',
        'Wdft_RegionIdentifier']


dtypes = {}
for x in FE+OHE: dtypes[x] = 'category'
dtypes['MachineIdentifier'] = 'str'
dtypes['HasDetections'] = 'int8'


df_train = pd.read_csv('train.csv', usecols=dtypes.keys(), dtype=dtypes)
print ('Loaded',len(df_train),'rows of TRAIN.CSV!')


sm = 900000
df_train = df_train.sample(sm)
print ('Only using',sm,'rows to train and validate')
x=gc.collect()

Loaded 8921483 rows of TRAIN.CSV!
Only using 900000 rows to train and validate


In [3]:
import math

def nan_check(x):
    if isinstance(x,float):
        if math.isnan(x):
            return True
    return False

def encode_FE(df,column,verbose=1):
    freq = df[column].value_counts(dropna=False)
    n = column+"_FE"
    df[n] = df[column].map(freq)/freq.max()
    if verbose==1:
        print('FE encoded',column)
    return [n]

def encode_OHE(df, col, filter, zvalue, tar='HasDetections', m=0.5, verbose=1):
    cv = df[col].value_counts(dropna=False)
    cvd = cv.to_dict()
    vals = len(cv)
    th = filter * len(df)
    sd = zvalue * 0.5/ math.sqrt(th)
    #print(sd)
    n = []; ct = 0; d = {}
    for x in cv.index:
        try:
            if cv[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cv[x])
        except:
            if cvd[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cvd[x])
        if nan_check(x): r = df[df[col].isna()][tar].mean()
        else: r = df[df[col]==x][tar].mean()
        if abs(r-m)>sd:
            nm = col+'_BE_'+str(x)
            if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
            else: df[nm] = (df[col]==x).astype('int8')
            n.append(nm)
            d[x] = 1
        ct += 1
        if (ct+1)>=vals: break
    if verbose==1:
        print('OHE encoded',col,'- Created',len(d),'booleans')
    return [n,d]


def encode_OHE_test(df,col,dt):
    n = []
    for x in dt: 
        n += encode_BE(df,col,x)
    return n


def encode_BE(df,col,val):
    n = col+"_BE_"+str(val)
    if nan_check(val):
        df[n] = df[col].isna()
    else:
        df[n] = df[col]==val
    df[n] = df[n].astype('int8')
    return [n]

In [4]:
cols = []; dd = []

# ENCODE NEW
for x in FE:
    cols += encode_FE(df_train,x)
for x in OHE:
    tmp = encode_OHE(df_train,x,0.005,5)
    cols += tmp[0]; dd.append(tmp[1])
print('Encoded',len(cols),'new variables')

# REMOVE OLD
for x in FE+OHE:
    del df_train[x]
print('Removed original',len(FE+OHE),'variables')
x = gc.collect()

FE encoded EngineVersion
FE encoded AppVersion
FE encoded AvSigVersion
FE encoded Census_OSVersion
OHE encoded RtpStateBitfield - Created 1 booleans
OHE encoded IsSxsPassiveMode - Created 0 booleans
OHE encoded DefaultBrowsersIdentifier - Created 1 booleans
OHE encoded AVProductStatesIdentifier - Created 8 booleans
OHE encoded AVProductsInstalled - Created 3 booleans
OHE encoded AVProductsEnabled - Created 2 booleans
OHE encoded CountryIdentifier - Created 34 booleans
OHE encoded CityIdentifier - Created 7 booleans
OHE encoded GeoNameIdentifier - Created 25 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded LocaleEnglishNameIdentifier - Created 20 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Processor - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded OsBuild - Created 6 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded OsSuite - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded SmartScreen - Created 4 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_MDC2FormFactor - Created 4 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_OEMNameIdentifier - Created 19 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_ProcessorCoreCount - Created 6 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_ProcessorModelIdentifier - Created 22 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_PrimaryDiskTotalCapacity - Created 12 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_PrimaryDiskTypeName - Created 3 booleans


  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_HasOpticalDiskDrive - Created 1 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_TotalPhysicalRAM - Created 9 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_ChassisTypeName - Created 7 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_InternalPrimaryDiagonalDisplaySizeInInches - Created 19 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_InternalPrimaryDisplayResolutionHorizontal - Created 3 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_InternalPrimaryDisplayResolutionVertical - Created 5 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_PowerPlatformRoleName - Created 2 booleans


  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_InternalBatteryType - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_InternalBatteryNumberOfCharges - Created 3 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_OSEdition - Created 4 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_OSInstallLanguageIdentifier - Created 16 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_GenuineStateName - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_ActivationChannel - Created 4 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_FirmwareManufacturerIdentifier - Created 13 booleans


  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_IsTouchEnabled - Created 1 booleans
OHE encoded Census_IsPenCapable - Created 0 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Census_IsAlwaysOnAlwaysConnectedCapable - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Wdft_IsGamer - Created 2 booleans


  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')
  else: df[nm] = (df[col]==x).astype('int8')


OHE encoded Wdft_RegionIdentifier - Created 12 booleans
Encoded 292 new variables
Removed original 43 variables


In [5]:
cols

['EngineVersion_FE',
 'AppVersion_FE',
 'AvSigVersion_FE',
 'Census_OSVersion_FE',
 'RtpStateBitfield_BE_0',
 'DefaultBrowsersIdentifier_BE_239',
 'AVProductStatesIdentifier_BE_53447',
 'AVProductStatesIdentifier_BE_7945',
 'AVProductStatesIdentifier_BE_47238',
 'AVProductStatesIdentifier_BE_62773',
 'AVProductStatesIdentifier_BE_23657',
 'AVProductStatesIdentifier_BE_49480',
 'AVProductStatesIdentifier_BE_41571',
 'AVProductStatesIdentifier_BE_22728',
 'AVProductsInstalled_BE_1',
 'AVProductsInstalled_BE_2',
 'AVProductsInstalled_BE_3',
 'AVProductsEnabled_BE_1',
 'AVProductsEnabled_BE_2',
 'CountryIdentifier_BE_141',
 'CountryIdentifier_BE_66',
 'CountryIdentifier_BE_89',
 'CountryIdentifier_BE_214',
 'CountryIdentifier_BE_158',
 'CountryIdentifier_BE_44',
 'CountryIdentifier_BE_107',
 'CountryIdentifier_BE_51',
 'CountryIdentifier_BE_68',
 'CountryIdentifier_BE_35',
 'CountryIdentifier_BE_160',
 'CountryIdentifier_BE_195',
 'CountryIdentifier_BE_155',
 'CountryIdentifier_BE_159',
 '

In [6]:
from sklearn.model_selection import train_test_split
#SPLIT TRAIN AND VALIDATION SET
X_dev, X_test, Y_dev, Y_test = train_test_split(
    df_train[cols], df_train['HasDetections'], test_size = 0.5, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_dev,Y_dev,random_state=42)

In [7]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

ss = StandardScaler()
ss.fit(X_train)

x_train = pd.DataFrame(ss.transform(X_train), columns = X_train.columns)
x_val = pd.DataFrame(ss.transform(X_val), columns = X_val.columns)
x_test = pd.DataFrame(ss.transform(X_test), columns = X_test.columns)

x_train = np.hstack([np.ones((x_train.shape[0], 1)), x_train])
x_val = np.hstack([np.ones((x_val.shape[0], 1)), x_val])
x_test = np.hstack([np.ones((x_test.shape[0], 1)), x_test])


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr_model = LogisticRegression(C=0.1, random_state=0)

# Train the model on the training data
lr_model.fit(x_train, Y_train)

y_train_pred = lr_model.predict(x_train)

# Make predictions on the validation data
y_val_pred = lr_model.predict(x_val)

# Compute training accuracy
train_accuracy = accuracy_score(Y_train, y_train_pred)

# Compute validation accuracy
val_accuracy = accuracy_score(Y_val, y_val_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Validation Accuracy: {val_accuracy}')

Training Accuracy: 0.6300474074074074
Validation Accuracy: 0.6293422222222222


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

param_grid = {'C': np.logspace(-3,3,5),
              'penalty': ['l1', 'l2']}

# Create a GridSearchCV object
lr_model = LogisticRegression(random_state=0)
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model using GridSearchCV
grid_search.fit(x_train, Y_train)

# Get the best parameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_train_pred = best_model.predict(x_train)

# Make predictions on the validation data
y_val_pred = best_model.predict(x_val)

# Compute training accuracy
train_accuracy = accuracy_score(Y_train, y_train_pred)

# Compute validation accuracy
val_accuracy = accuracy_score(Y_val, y_val_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Validation Accuracy: {val_accuracy}')

print(f'Best Parameters: {best_params}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training Accuracy: 0.630077037037037
Validation Accuracy: 0.6293422222222222
Best Parameters: {'C': 1.0, 'penalty': 'l2'}


In [11]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression(penalty='elasticnet', solver='saga', random_state=0)

# Define the parameter grid for grid search
param_grid = {'C': np.logspace(-3,3,5),
              'l1_ratio': [0.3, 0.5, 0.7]}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model using GridSearchCV
grid_search.fit(x_train, Y_train)

# Get the best parameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_train_pred = best_model.predict(x_train)

# Make predictions on the validation data
y_val_pred = best_model.predict(x_val)

# Compute training accuracy
train_accuracy = accuracy_score(Y_train, y_train_pred)

# Compute validation accuracy
val_accuracy = accuracy_score(Y_val, y_val_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Validation Accuracy: {val_accuracy}')

print(f'Best Parameters: {best_params}')



Training Accuracy: 0.6300177777777778
Validation Accuracy: 0.62952
Best Parameters: {'C': 0.03162277660168379, 'l1_ratio': 0.3}
