<a href="https://colab.research.google.com/github/Timoh-Mbata/MACHINE-LEARNING-CHECKPOINT/blob/main/CHECKPOINT_MACHINE_LEARNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install ydata-profiling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (8,6)
import warnings
warnings.filterwarnings(action='ignore')
import re
from ydata_profiling import ProfileReport

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedGroupKFold
from sklearn.linear_model import LogisticRegressionCV,Lasso,Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import tensorflow
from tensorflow import keras

# DATA PREPROCESSING

In [None]:
# Read the malware CSV from the Colab working directory into `dataset`.
dataset = pd.read_csv(r'/content/Microsoft_malware_dataset_min.csv')

# Quick look at the data: three randomly drawn rows, then the last three rows.
random_rows = dataset.sample(3)
last_rows = dataset.tail(3)
print(f'\n {random_rows}')
print('\n the tail')
print(f'\n {last_rows}')

In [None]:
# Print a label, then display the column index of the loaded frame.
print('the dataset columns')
dataset.columns

In [None]:
# Number of missing values in each column of the raw data.
dataset.isna().sum()

In [None]:
# Storage dtype of every column as parsed from the CSV.
dataset.dtypes

In [None]:
# Pairwise correlation of the int/float columns, drawn as an annotated heatmap.
numeric_columns = dataset.select_dtypes(include=['int', 'float'])
corr = numeric_columns.corr()
sns.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes, memory usage).
dataset.info()

In [None]:
# Descriptive statistics for the numeric columns.
dataset.describe()

In [None]:
# Scatter/histogram grid over every pair of numeric columns in the frame.
sns.pairplot(data=dataset)

# Handling missing values (imputation)

In [None]:
# Missing-value count per column, as the starting point for imputation.
dataset.isna().sum()

In [None]:
# Frequency of each distinct IsProtected value (missing entries are not counted).
dataset['IsProtected'].value_counts()

In [None]:
# The distinct values of IsProtected are 0 and 1, so each gap is forward-filled
# with the value of the preceding row and the result is stored back on the frame.
dataset['IsProtected'] = dataset.IsProtected.ffill()

In [None]:
# Display the forward-filled IsProtected column; the result is not assigned.
dataset.IsProtected.ffill()

In [None]:
# Frequency of each distinct SMode value (missing entries are not counted).
dataset['SMode'].value_counts()

In [None]:
# Show the rows whose SMode flag equals 1, to look for patterns in the other columns.
dataset[dataset['SMode'].eq(1)]

In [None]:
# Conclusion from the rows above: when Firewall, Census_HasOpticalDiskDrive and
# IsProtected are all 0, SMode is 1. Apply that rule to the rows where SMode is
# missing and fill them with 1.
smode_missing = dataset['SMode'].isna()
matches_smode_profile = (
    dataset['Firewall'].eq(0)
    & dataset['Census_HasOpticalDiskDrive'].eq(0)
    & dataset['IsProtected'].eq(0)
)
dataset.loc[matches_smode_profile & smode_missing, 'SMode'] = 1

In [None]:
# Every SMode value still missing after the rule-based fill becomes 0.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `dataset['SMode']` acts on a temporary under pandas Copy-on-Write and
# leaves `dataset` untouched (pandas 2.x only warns, and warnings are silenced
# at the top of this notebook).
dataset['SMode'] = dataset['SMode'].fillna(0)

In [None]:
# Frequency of each distinct Census_IsVirtualDevice value (missing entries are not counted).
dataset['Census_IsVirtualDevice'].value_counts()

In [None]:
# Inspect up to 50 random rows flagged as virtual devices.
# The sample size is capped at the number of matching rows, because asking
# for more rows than exist (without replacement) raises ValueError.
virtual_devices = dataset[dataset['Census_IsVirtualDevice'] == 1]
virtual_devices.sample(min(50, len(virtual_devices)))

In [None]:
# Where SMode is 0 and the virtual-device flag is missing, set the flag to 0.
virtual_flag_missing = dataset['Census_IsVirtualDevice'].isna()
smode_off = dataset['SMode'].eq(0)
dataset.loc[smode_off & virtual_flag_missing, 'Census_IsVirtualDevice'] = 0

In [None]:
# Any Census_IsVirtualDevice value still missing after the rule-based fill becomes 1.
# Assign the filled column back instead of using inplace=True on the selected
# column: under pandas Copy-on-Write the in-place call updates a temporary and
# never reaches `dataset`.
dataset['Census_IsVirtualDevice'] = dataset['Census_IsVirtualDevice'].fillna(1)

In [None]:
# Frequency of each distinct Firewall value (missing entries are not counted).
dataset['Firewall'].value_counts()

In [None]:
# Forward-fill missing Firewall values from the preceding row.
# The result is assigned back to the column: ffill(inplace=True) on
# `dataset['Firewall']` modifies a temporary under pandas Copy-on-Write and
# leaves `dataset` unchanged.
dataset['Firewall'] = dataset['Firewall'].ffill()

In [None]:
# Forward-fill missing Wdft_IsGamer values from the preceding row.
# The result is assigned back to the column: ffill(inplace=True) on
# `dataset['Wdft_IsGamer']` modifies a temporary under pandas Copy-on-Write and
# leaves `dataset` unchanged.
dataset['Wdft_IsGamer'] = dataset['Wdft_IsGamer'].ffill()

In [None]:
# Same forward fill of Wdft_IsGamer as the preceding cell (a forward fill is
# idempotent, so repeating it is harmless). The result is assigned back to the
# column because ffill(inplace=True) on a selected column modifies a temporary
# under pandas Copy-on-Write and leaves `dataset` unchanged.
dataset['Wdft_IsGamer'] = dataset['Wdft_IsGamer'].ffill()

In [None]:
# Build the ydata-profiling report for the imputed frame and render it in the cell output.
report = ProfileReport(df=dataset)
report

In [None]:
# (rows, columns) before duplicate rows are removed.
dataset.shape

In [None]:
# Remove fully duplicated rows, mutating `dataset` in place (the first
# occurrence of each duplicated row is the one that is kept).
dataset.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) after duplicate rows were removed.
dataset.shape

In [None]:
# Re-check the missing-value count per column after imputation and de-duplication.
dataset.isna().sum()

In [None]:
# Column index of the cleaned frame.
dataset.columns

In [None]:
# Column dtypes after cleaning; used to spot columns that still need encoding.
dataset.dtypes

In [None]:
# Number of distinct (non-missing) Census_OSEdition values.
dataset['Census_OSEdition'].nunique()

In [None]:
# Distinct OsPlatformSubRelease values, in order of first appearance.
dataset['OsPlatformSubRelease'].unique()

# FEATURE ENCODING AND TRAIN/TEST SPLIT

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the two categorical columns with integer codes.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# code -> category mapping of both columns stays available (refitting a single
# shared encoder would discard the mapping of the first column).
# After the loop `le` is the encoder fitted on 'Census_OSEdition'.
label_encoders = {}
for column in ('OsPlatformSubRelease', 'Census_OSEdition'):
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])
    label_encoders[column] = le

In [None]:
# First four rows, to confirm the encoded columns now hold integer codes.
dataset.head(n=4)

In [None]:
# Target vector: the HasDetections column. Feature matrix: every other column.
y = dataset['HasDetections']
x = dataset.drop('HasDetections', axis=1)

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# MODEL BUILDING

In [None]:
# Search spaces for the three candidate classifiers. Each entry maps a display
# name to an unfitted estimator ('model') and the hyper-parameter grid
# ('params') that GridSearchCV explores for it.
model_params = {
    'LOGISTIC_REGRESSORCV': {
        'model': LogisticRegressionCV(),
        'params': {
            'random_state': [42],
            'solver': ['liblinear', 'lbfgs'],
            # An integer `Cs` is the NUMBER of C values LogisticRegressionCV
            # generates on a log scale between 1e-4 and 1e4, not a value of C.
            # Passing the list as a single grid option gives the estimator the
            # explicit candidates C = 1, 10, 100; the C chosen by its internal
            # cross-validation is available on the fitted estimator as `C_`.
            'Cs': [[1, 10, 100]]
        }
    },
    'RANDOM_FORESTCV': {
        'model': RandomForestClassifier(),
        'params': {
            'random_state': [42],
            'n_estimators': [50, 100, 250, 1000],
            'max_depth': [3, 6, 9]
        }
    },
    'SUPPORT_VECTORSCV': {
        'model': SVC(),
        'params': {
            'C': [1, 5, 10],
            'kernel': ['rbf', 'linear'],
            # NOTE: gamma has no effect on the linear kernel, so the two gamma
            # settings repeat the same fit for kernel='linear'.
            'gamma': ['scale', 'auto']
        }
    }
}

In [None]:
# Run a 5-fold grid search for every configured model on the training split and
# collect the best hyper-parameters and best mean CV score of each one.
score = []
for model_name, model_info in model_params.items():
    estimator, grid = model_info['model'], model_info['params']
    cv = GridSearchCV(estimator, grid, cv=5, return_train_score=False)
    cv.fit(X_train, y_train)
    best_result = dict(
        model=model_name,
        model_params=cv.best_params_,
        model_score=cv.best_score_,
    )
    score.append(best_result)

# One row per model: name, winning parameters, winning score.
pd.DataFrame(score)