## Training a Logistic Regression model on numeric columns only (categorical columns dropped), with hyperparameters selected by grid-search cross-validation

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
# Load the training data from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
df.head()

In [None]:
# Remove the identifier and date columns; errors="ignore" tolerates their absence.
df_clean = df.drop(columns=["MachineID", "DateAS", "DateOS"], errors="ignore")

X = df_clean.drop(columns=["target"])

# Keep only int64/float64 features: this model deliberately ignores categoricals.
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
X = X[numeric_cols]

y = df_clean["target"]

# Median imputation followed by standardisation of every numeric feature.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Preprocessing lives inside the pipeline so it is re-fitted on each CV fold.
pipeline = Pipeline([
    ("preprocessor", num_pipeline),
    ("classifier", LogisticRegression(max_iter=500)),
])

# 80/20 split; the 20% part is scored below as an independent check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["liblinear", "lbfgs"],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)
# The hold-out split was previously created but never evaluated; score it here
# with the refitted best estimator (uses the same "accuracy" scorer).
print("Hold-out Accuracy:", grid_search.score(X_test, y_test))


## Submitting Logistic regression model

In [None]:
X_train.shape

In [None]:
# Load the competition test set. This rebinds X_test, replacing the hold-out
# split produced by train_test_split in the training cell.
X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# Keep only the numeric feature columns that were selected from the training data.
X_test = X_test[numeric_cols]

In [None]:
X_test.shape

In [None]:
# Predict with the best pipeline found by the grid search (preprocessing included).
y_pred = grid_search.best_estimator_.predict(X_test)
y_pred

In [None]:
y_pred.shape

In [None]:
# Build the submission frame: the row position serves as the id.
submission = pd.DataFrame(
    data={
        "id": range(len(X_test)),
        "target": y_pred,
    }
)

In [None]:
submission.shape

In [None]:
submission.head()

In [None]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv('submission.csv',index = False)

## Logistic Regression without categorical data, attempt 2 (grid search fitted on the full training set)

In [None]:
import pandas as pd
# Reload the training data and preview the first rows.
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
df.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

# Remove the identifier and date columns (silently skipped if absent).
df_clean = df.drop(columns=["MachineID", "DateAS", "DateOS"], errors="ignore")

# Target vector and the int64/float64 feature columns (target excluded).
y = df_clean["target"]
num_cols = (
    df_clean.drop(columns=["target"])
    .select_dtypes(include=["int64", "float64"])
    .columns
)
X = df_clean[num_cols]

# Numeric preprocessing: median imputation, then standardisation.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Full model: preprocessing followed by logistic regression.
pipeline = Pipeline(
    steps=[
        ("preprocessor", num_pipeline),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)

# Search over regularisation strength and solver.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__solver=["liblinear", "lbfgs"],
)

# 5-fold grid search fitted on the whole training set (no hold-out in this cell).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

print("Best Cross-Validation Accuracy:", grid_search.best_score_)

In [None]:
# Load the competition test set, restricted to the numeric training columns.
X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")[num_cols]

# Predict with the best pipeline selected by the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)
y_pred

In [None]:
# Build the submission frame (row position as id) and preview it.
submission = pd.DataFrame(
    data={
        "id": range(len(X_test)),
        "target": y_pred,
    }
)
submission.head()

In [None]:
submission.to_csv('submission.csv',index = False)

## Logistic Regression using categorical data with encoding

In [None]:
# import pandas as pd
# df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
# df.head()

In [None]:
# X_train = df.drop(columns = ['target'])
# y_train = df['target']

In [None]:
# X_train.head()

In [None]:
# X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# X_test.head()

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.feature_selection import VarianceThreshold
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.compose import ColumnTransformer

In [None]:
# categorical_columns = X_train.select_dtypes(include=['object']).columns
# numerical_columns = X_train.select_dtypes(exclude=['object']).columns

In [None]:
# preprocessor = ColumnTransformer([
#         ('num', Pipeline([
#             ('imputer', SimpleImputer(strategy='mean')),  
#             ('variance_threshold', VarianceThreshold(threshold=0.01)) 
#         ]), numerical_columns),
#         ('cat', Pipeline([
#             ('imputer', SimpleImputer(strategy='most_frequent')),  
#             ('onehot', OneHotEncoder(handle_unknown='ignore'))  
#         ]), categorical_columns)
# ])

In [None]:
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),  # Apply preprocessor
#     ('log_reg', LogisticRegression(max_iter=1000, random_state=42))  # Logistic Regression model
# ])

In [None]:
# param_grid = {
#     'log_reg__C': [0.01, 0.1, 1, 10],
#     'log_reg__solver': ['lbfgs', 'liblinear'],
#     'log_reg__penalty': ['l2'],
#     'log_reg__max_iter': [100, 200, 300]
# }

# grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [None]:
# y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
# submission = pd.DataFrame({"id":range(0,X_test.shape[0]),"target":y_pred})

In [None]:
# submission.to_csv('submission.csv',index = False)

## Random Forest Classifier with Grid Search

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler , OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV

In [None]:
# X_train = df.drop(columns = ['target'])
# y_train = df['target']

In [None]:
# X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# categorical_cols = X_train.select_dtypes(include=['object']).columns
# numerical_cols = X_train.select_dtypes(exclude=['object']).columns

In [None]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', Pipeline([
#             ('imputer', SimpleImputer(strategy='mean')),
#             ('scaler', StandardScaler())                
#         ]), numerical_cols),
#         ('cat', Pipeline([
#             ('imputer', SimpleImputer(strategy='most_frequent')),
#             ('onehot', OneHotEncoder(handle_unknown='ignore'))   
#         ]), categorical_cols)
# ])

In [None]:
# rf_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
# ])

In [None]:
# param_grid = {
#     'classifier__n_estimators': [100, 200, 300],            
#     'classifier__max_depth': [None, 10, 20, 30],            
#     'classifier__min_samples_split': [2, 5, 10],            
#     'classifier__min_samples_leaf': [1, 2, 4],               
#     'classifier__max_features': ['sqrt', 'log2'],    
#     'classifier__class_weight': [None, 'balanced']           
# }

# grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

In [None]:
# grid_search.fit(X_train,y_train)

In [None]:
# y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
# submission = pd.DataFrame({"id":range(0,X_test.shape[0]),"target":y_pred})

In [None]:
# submission.to_csv('submission.csv',index = False)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
import pandas as pd
# Training data: every column except "target" is a feature.
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
y_train = df["target"]
X_train = df.drop("target", axis=1)

# Competition test set.
X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# Partition the feature columns by dtype: object columns are categorical,
# everything else is treated as numeric.
num_features = X_train.select_dtypes(exclude="object").columns
cat_features = X_train.select_dtypes(include="object").columns

In [None]:
# Categorical branch: fill gaps with the mode, then integer-encode;
# categories unseen during fit are encoded as -1.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

# Numeric branch: mean imputation only (no scaling step).
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

In [None]:
# Route numeric and categorical columns through their respective pipelines.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [None]:
# End-to-end model: preprocessing followed by a 100-tree random forest
# with a fixed seed.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(random_state=42, n_estimators=100),
        ),
    ]
)


In [None]:
# Fit preprocessing and the forest on the full training set (no validation split here).
rf_pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the competition test set with the fitted pipeline.
y_pred = rf_pipeline.predict(X_test)

In [None]:
# Build the submission frame: the row position serves as the id.
submission = pd.DataFrame(
    data={
        "id": range(len(X_test)),
        "target": y_pred,
    }
)

In [None]:
submission.to_csv('submission.csv',index = False)

## Trying a basic XGBoost Model

In [None]:
# import pandas as pd
# import xgboost as xgb

# df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")

# X_train = df.drop(columns = ["target"])
# X_train[X_train.select_dtypes('object').columns] = X_train.select_dtypes('object').astype('category')

# y_train = df["target"]

# X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

# model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42 , enable_categorical = True)
# model.fit(X_train, y_train)
# X_test[X_test.select_dtypes('object').columns] = X_test.select_dtypes('object').astype('category')
# y_pred = model.predict(X_test)

In [None]:
# submission = pd.DataFrame({"id":range(0,X_test.shape[0]),"target":y_pred})
# submission.to_csv("/kaggle/working/submission.csv", index=False)

In [None]:
# submission.head()

## XGBoost: final model

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Training data: every column except "target" is a feature.
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
y_train = df["target"]
X_train = df.drop("target", axis=1)

# Competition test set.
X_test = pd.read_csv("/kaggle/input/System-Threat-Forecaster/test.csv")

In [None]:
# Identifier and date columns are removed from both feature frames;
# errors='ignore' skips any that are not present.
columns_to_drop = ['MachineID', 'DateAS', 'DateOS']
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Explicit feature lists used by the ColumnTransformer below.
# num_cols: columns sent through the numeric pipeline (impute, cap, scale).
# NOTE(review): several of these are ID-like or flag-like codes handled as
# plain numbers — confirm that is intended.
num_cols = [
    'IsBetaUser', 'RealTimeProtectionState', 'IsPassiveModeEnabled', 'AntivirusConfigID',
    'NumAntivirusProductsInstalled', 'NumAntivirusProductsEnabled', 'HasTpm', 'CountryID', 'CityID', 'GeoRegionID',
    'LocaleEnglishNameID', 'OSBuildNumber', 'OSProductSuite', 'IsSystemProtected', 'AutoSampleSubmissionEnabled',
    'SMode', 'IEVersionID', 'FirewallEnabled', 'EnableLUA', 'OEMNameID', 'OEMModelID', 'ProcessorCoreCount',
    'ProcessorManufacturerID', 'ProcessorModelID', 'PrimaryDiskCapacityMB', 'SystemVolumeCapacityMB',
    'HasOpticalDiskDrive', 'TotalPhysicalRAMMB', 'PrimaryDisplayDiagonalInches', 'PrimaryDisplayResolutionHorizontal',
    'PrimaryDisplayResolutionVertical', 'InternalBatteryNumberOfCharges', 'OSBuildNumberOnly', 'OSBuildRevisionOnly',
    'OSInstallLanguageID', 'OSUILocaleID', 'IsPortableOS', 'IsFlightsDisabled', 'FirmwareManufacturerID',
    'FirmwareVersionID', 'IsSecureBootEnabled', 'IsVirtualDevice', 'IsTouchEnabled', 'IsPenCapable',
    'IsAlwaysOnAlwaysConnectedCapable', 'IsGamer', 'RegionIdentifier'
]
# cat_cols: columns sent through the categorical pipeline (impute, ordinal-encode).
cat_cols = [
    'ProductName', 'EngineVersion', 'AppVersion', 'SignatureVersion', 'PlatformType', 'Processor', 'OSVersion',
    'OsPlatformSubRelease', 'OSBuildLab', 'SKUEditionName', 'MDC2FormFactor', 'DeviceFamily', 'PrimaryDiskType',
    'ChassisType', 'PowerPlatformRole', 'NumericOSVersion', 'OSArchitecture', 'OSBranch', 'OSEdition',
    'OSSkuFriendlyName', 'OSInstallType', 'AutoUpdateOptionsName', 'OSGenuineState', 'LicenseActivationChannel',
    'FlightRing'
]

In [None]:
def cap_outliers(X, factor=1.5):
    """Clip each column of a 2-D array to its IQR-based fences.

    For every column the bounds ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``
    are computed from the values of ``X`` itself, and values outside them are
    replaced by the nearest bound.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data. It is converted to a float array, so integer input is
        not truncated when a fractional bound is written back, and
        DataFrames or nested lists are accepted as well as ndarrays.
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    numpy.ndarray
        A new float array with the same shape as ``X``; the input is never
        modified.

    Notes
    -----
    NaNs are ignored when computing the quartiles and pass through unchanged,
    instead of turning the whole column into NaN. The bounds always come from
    the array passed in, so calling this on different data sets yields
    different caps.
    """
    # np.array(..., dtype=float) always copies, keeping the caller's data intact.
    data = np.array(X, dtype=float)
    if data.size == 0:
        # Quantiles of an empty array are undefined; nothing to clip.
        return data

    # Column-wise quartiles in one vectorised call.
    q1, q3 = np.nanquantile(data, [0.25, 0.75], axis=0)
    iqr = q3 - q1
    return np.clip(data, q1 - factor * iqr, q3 + factor * iqr)

In [None]:
# Numeric branch: median imputation -> IQR outlier capping -> scaling to [0, 1].
# NOTE(review): FunctionTransformer is stateless, so cap_outliers derives its
# bounds from whichever array it receives (train or test), not from the
# training data alone.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        (
            "outlier_capper",
            FunctionTransformer(func=cap_outliers, kw_args={"factor": 1.5}),
        ),
        ("scaler", MinMaxScaler()),
    ]
)

In [None]:
# Categorical branch: fill gaps with the mode, then integer-encode;
# categories unseen during fit are encoded as -1.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

In [None]:
# Apply each branch to its explicitly listed columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

In [None]:
# Fit the preprocessing on the training features only, then apply the fitted
# transformers to the test features.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Ratio of negative to positive samples, passed to XGBoost as the positive
# class weight.
class_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# `use_label_encoder` was dropped from the call: it is deprecated in XGBoost
# and no longer has any effect, so passing it only produces a warning.
model = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,          # row subsampling per tree
    colsample_bytree=0.8,   # column subsampling per tree
    scale_pos_weight=class_ratio,
    random_state=42,
    eval_metric='logloss',
)


In [None]:
model.fit(X_train_processed, y_train)

In [None]:
y_pred = model.predict(X_test_processed)

In [None]:
# Assemble the submission (row position as id) and write it to the Kaggle
# working directory without the index column.
submission = pd.DataFrame(
    data={
        "id": range(len(y_pred)),
        "target": y_pred,
    }
)
submission.to_csv(path_or_buf="/kaggle/working/submission.csv", index=False)

In [None]:
y_pred

In [None]:
# NOTE(review): this rebuilds the same submission frame that an earlier cell
# already created and wrote to /kaggle/working/submission.csv; it looks
# redundant — confirm before removing.
submission = pd.DataFrame({"id":range(0,X_test.shape[0]),"target":y_pred})
submission.head()

In [None]:
submission.to_csv('submission.csv',index = False)

## Milestone 3

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler ,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
# Load the training data and separate the target from the features.
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
y = df["target"]
X = df.drop("target", axis=1)

In [21]:
# Column groups by dtype: int64/float64 are numeric, object is categorical.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

In [22]:
# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [23]:
# Numeric branch: mean imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [24]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [25]:
# Reproducible 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [26]:
# Decision tree with the preprocessing embedded in the same pipeline.
dt_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)

In [27]:
# Hyperparameter grid for the "model" step of dt_pipeline.
# NOTE(review): no search using this grid is visible in this section.
dt_param_grid = dict(
    model__max_depth=[20, 30],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)

## Milestone 5

In [2]:
import pandas as pd
df = pd.read_csv("/kaggle/input/System-Threat-Forecaster/train.csv")
df.head()

Unnamed: 0,MachineID,ProductName,EngineVersion,AppVersion,SignatureVersion,IsBetaUser,RealTimeProtectionState,IsPassiveModeEnabled,AntivirusConfigID,NumAntivirusProductsInstalled,...,IsSecureBootEnabled,IsVirtualDevice,IsTouchEnabled,IsPenCapable,IsAlwaysOnAlwaysConnectedCapable,IsGamer,RegionIdentifier,DateAS,DateOS,target
0,f541bae429089117c4aac39c90dd3416,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1003.0,0,7.0,0,53447.0,1.0,...,0,0.0,1,0,1.0,0.0,6.0,2018-09-10 10:11:00,2018-04-17,0
1,dc2b14d9ce3a0ce4050bb640190f2ca5,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1465.0,0,7.0,0,53447.0,1.0,...,1,0.0,0,0,0.0,0.0,10.0,2018-08-16 00:01:00,2018-08-14,1
2,fd20c5f010e9c5f91ad1c6b3e0da68a0,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1546.0,0,7.0,0,53447.0,1.0,...,0,0.0,0,0,0.0,1.0,6.0,2018-09-20 23:20:00,2018-09-11,1
3,38711eae85eb77a72ec5dfdf27eb2a76,win8defender,1.1.15200.1,4.12.17007.18011,1.275.1141.0,0,7.0,0,46413.0,2.0,...,1,0.0,0,0,0.0,0.0,12.0,2018-09-14 00:32:00,2018-01-03,1
4,32607c9a543a9214e2c7e45800ed4849,win8defender,1.1.15200.1,4.13.17134.228,1.275.1283.0,0,7.0,0,40466.0,2.0,...,0,0.0,0,0,0.0,1.0,7.0,2018-09-15 19:34:00,2018-09-11,0


In [13]:
# Separate the target from the feature columns.
y = df['target']
X = df.drop('target', axis=1)

In [14]:
from sklearn.impute import SimpleImputer

# Group the feature columns by dtype.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Fill missing categorical values with each column's most frequent value.
# NOTE(review): the imputers are fitted on the full data set before the
# train/test split performed in a later cell, so test rows influence the
# fill values — confirm this is acceptable for the exercise.
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Fill missing numeric values with each column's mean.
num_imputer = SimpleImputer(strategy='mean')
X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns in place.
# NOTE(review): fitted on all rows, i.e. before the train/test split.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Replace every categorical column with integer codes.
# NOTE(review): fitted on all rows before the split; no handle_unknown is set,
# so transforming data with unseen categories would presumably fail — confirm.
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])


In [17]:
from sklearn.model_selection import train_test_split

# Reproducible 80/20 split of the already imputed, scaled and encoded features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score

# Default decision tree with a fixed seed, evaluated on the held-out split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_dt = confusion_matrix(y_test, y_pred)
# Diagonal sum = number of correctly classified samples.
correctly_classified = cm_dt.trace() 
# Row 1, column 0: samples of the second class predicted as the first
# (true 1 predicted 0, assuming the labels are 0/1).
incorrect = cm_dt[1, 0]  

# Precision computed with class 0 treated as the positive label.
precision = precision_score(y_test, y_pred, pos_label=0)

print(correctly_classified)
print(incorrect)
print(precision)

10881
4500
0.5388871810636335


In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import recall_score

# AdaBoost with 10 estimators, evaluated on the held-out split.
# NOTE(review): learning_rate=10 is unusually large; the recorded output below
# (recall 1.0, zero entries in cm_ada[1, 0]) is consistent with the model
# predicting class 1 for almost every row — confirm the setting is intended.
adaboost_model = AdaBoostClassifier(n_estimators=10, learning_rate=10, random_state=42)
adaboost_model.fit(X_train, y_train)

y_pred_ada = adaboost_model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_ada = confusion_matrix(y_test, y_pred_ada)
# Diagonal sum = number of correctly classified samples.
correctly_classified = cm_ada.trace()
# Row 1, column 0: true 1 predicted 0 (assuming the labels are 0/1).
incorrect = cm_ada[1, 0]

# Recall with the default positive label.
recall = recall_score(y_test, y_pred_ada)

print(correctly_classified)
print(incorrect)
print(recall)

10122
0
1.0


In [24]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, evaluated on the held-out split.
# confusion_matrix and recall_score are imported in earlier cells.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_lr = confusion_matrix(y_test, y_pred_lr)
# Diagonal sum = number of correctly classified samples.
correctly_classified = cm_lr.trace()
# Row 0, column 1: true 0 predicted 1 (assuming the labels are 0/1).
# NOTE(review): the decision-tree and AdaBoost cells read [1, 0] instead —
# confirm the different cell is intentional.
incorrect = cm_lr[0, 1]

# Recall with the default positive label.
recall = recall_score(y_test, y_pred_lr)

print(correctly_classified)
print(incorrect)
print(recall)

10705
7022
0.7754396364354871
