In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the training CSV into `data`, then separate the label column from the predictors.
data = pd.read_csv('train.csv')
y = data['target']
X = data.drop('target', axis=1)

In [None]:
# Milestone 1

# Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# Structural overview: dtypes and non-null counts first (info() prints on its own).
print("Dataset Info:")
data.info()

# Preview rows, per-column null tallies and descriptive statistics, each under its own heading.
for heading, summary in (
    ("\nFirst Few Rows:", data.head()),
    ("\nMissing Values:", data.isnull().sum()),
    ("\nSummary Statistics:", data.describe()),
):
    print(heading)
    print(summary)

# Basic dimensions of the frame.
num_rows, num_columns = data.shape
column_names = data.columns.tolist()

print(f"Number of columns: {num_columns}")
print(f"Column names: {column_names}")
print(f"Number of rows: {num_rows}")

# Column-specific questions; each check is skipped when the columns it needs are missing.

# Histogram (with KDE overlay) of installed RAM.
if 'TotalPhysicalRAMMB' in data.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data['TotalPhysicalRAMMB'], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title('Distribution of Total Physical RAM (MB)')
    ax.set_xlabel('Total Physical RAM (MB)')
    ax.set_ylabel('Frequency')
    plt.show()

# Bar chart of the IsGamer flag.
if 'IsGamer' in data.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='IsGamer', data=data, palette='viridis', ax=ax)
    ax.set_title('Count of Gamers vs Non-Gamers')
    ax.set_xlabel('Is Gamer')
    ax.set_ylabel('Count')
    plt.show()

# How many rows report a 1366 x 768 primary display?
if {'PrimaryDisplayResolutionHorizontal', 'PrimaryDisplayResolutionVertical'}.issubset(data.columns):
    is_1366_wide = data['PrimaryDisplayResolutionHorizontal'] == 1366
    is_768_high = data['PrimaryDisplayResolutionVertical'] == 768
    resolution_count = len(data[is_1366_wide & is_768_high])
    print(f"\nNumber of systems with resolution 1366 x 768: {resolution_count}")

# Box plot of the number of installed antivirus products.
if 'NumAntivirusProductsInstalled' in data.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(y='NumAntivirusProductsInstalled', data=data, palette='Set2', ax=ax)
    ax.set_title('Boxplot of NumAntivirusProductsInstalled')
    ax.set_ylabel('Number of Antivirus Products Installed')
    plt.show()

# Modal RealTimeProtectionState among rows where passive mode is switched on.
if {'IsPassiveModeEnabled', 'RealTimeProtectionState'}.issubset(data.columns):
    passive_mode_data = data.loc[data['IsPassiveModeEnabled'] == 1]
    rtp_modes = passive_mode_data['RealTimeProtectionState'].mode()
    most_frequent_rtp_state = rtp_modes[0]
    print(f"\nMost frequent value of RealTimeProtectionState when IsPassiveModeEnabled = 1: {most_frequent_rtp_state}")

# Columns with more than 50 distinct values are left out of the generic plots below.
distinct_counts = data.nunique()
high_cardinality_columns = distinct_counts[distinct_counts > 50].index.tolist()
filtered_data = data.drop(columns=high_cardinality_columns)

# Plot from a reproducible sample of at most 10,000 rows.
sample_size = min(10000, len(filtered_data))
sampled_data = filtered_data.sample(n=sample_size, random_state=42)

print(f"Excluded high cardinality columns: {high_cardinality_columns}")

# One histogram per int64/float64 column that takes more than one value in the sample.
numerical_columns = sampled_data.select_dtypes(include=['int64', 'float64']).columns
for column in numerical_columns:
    if sampled_data[column].nunique() <= 1:
        continue
    plt.figure(figsize=(8, 6))
    sns.histplot(sampled_data[column], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

# One horizontal count plot per object column with at most 10 levels, bars ordered by frequency.
categorical_columns = sampled_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    if sampled_data[column].nunique() > 10:
        continue
    levels_by_frequency = sampled_data[column].value_counts().index
    plt.figure(figsize=(10, 6))
    sns.countplot(y=column, data=sampled_data, order=levels_by_frequency, palette='viridis')
    plt.title(f'Distribution of {column}')
    plt.xlabel('Count')
    plt.ylabel(column)
    plt.show()

# Annotated correlation heatmap over the sampled numeric columns.
plt.figure(figsize=(12, 10))
correlation_matrix = sampled_data[numerical_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
#Milestone 2

import numpy as np
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the label encoding below never touches `data`.
df_copy = data.copy()

# A column that takes a single distinct (non-null) value carries no signal.
columns_to_check = ['IsBetaUser', 'IsPassiveModeEnabled', 'AntivirusConfigID', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled']

redundant_columns = [col for col in columns_to_check if df_copy[col].nunique() == 1]
print("Redundant Columns:", redundant_columns)

# Label-encode every object column so that DataFrame.corr() can include it.
categorical_cols = df_copy.select_dtypes(include=['object']).columns

encoder = LabelEncoder()
for col in categorical_cols:
    df_copy[col] = encoder.fit_transform(df_copy[col])

corr_matrix = df_copy.corr()

# Keep only the strict upper triangle of the matrix. This removes the diagonal
# (self-correlation) and the mirrored (B, A) duplicate of each (A, B) pair, so
# head(15) shows 15 distinct pairs. Unlike filtering on `corr < 1`, it also
# keeps pairs of different columns that are perfectly correlated, which are
# exactly the redundant features this analysis is looking for.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_pairs = (
    corr_matrix.where(upper_triangle)
    .unstack()
    .dropna()  # drops the masked cells and any undefined (NaN) correlations
    .sort_values(ascending=False)
)
print(corr_pairs.head(15))

In [None]:
# Feature pruning. X is rebuilt from `data` in a single drop so that this cell
# can be re-run safely: the previous `X = X.drop(...)` chain raised KeyError on
# a second run and depended on whichever cell had last rebound X.

# Columns judged redundant in the Milestone 2 analysis.
redundant_feature_columns = ['IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled']

# Columns dropped after analysing the correlation between the features.
correlated_feature_columns = ['OSSkuFriendlyName', 'PlatformType', 'OSArchitecture', 'OSBuildNumberOnly',
                              'NumericOSVersion', 'OSInstallLanguageID', 'OSProductSuite']

# MachineID doesn't make any significant contribution to the models.
identifier_columns = ['MachineID']

X = data.drop(
    columns=['target'] + redundant_feature_columns + correlated_feature_columns + identifier_columns
)

In [None]:
# Pre-processing: impute missing values and encode the categorical features.

from sklearn.experimental import enable_iterative_imputer  # enables the IterativeImputer import below
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Only float64/int64 and object columns are routed to a transformer; any column
# of another dtype is not listed below and is therefore left out of the output.
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# We will use different encoding for different cardinality features so that dimensionality is under control
# (one-hot for <= 10 levels, a single ordinal column otherwise).
low_cardinality_features = [col for col in categorical_features if X[col].nunique() <= 10]
high_cardinality_features = [col for col in categorical_features if X[col].nunique() > 10]

numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=20, random_state=42))
])

low_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Categories unseen at fit time are encoded as -1 at transform time.
high_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# sparse_threshold=0 forces a dense ndarray. With the default threshold the
# one-hot block can make the stacked output a scipy sparse matrix, which the
# pd.DataFrame(..., columns=...) calls on the train and test output cannot wrap.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('low_card', low_card_transformer, low_cardinality_features),
        ('high_card', high_card_transformer, high_cardinality_features)
    ],
    sparse_threshold=0
)

# NOTE(review): the preprocessor is fitted on every row of X, i.e. before any
# train/validation split is made, so validation rows influence the imputers
# and encoders. Confirm this is acceptable for the reported validation scores.
X_imputed_encoded = preprocessor.fit_transform(X)

# Rebuild column names in the same order as the transformers are listed above.
numeric_feature_names = numeric_features.tolist()
low_card_feature_names = preprocessor.named_transformers_['low_card']['onehot'].get_feature_names_out(low_cardinality_features).tolist()
high_card_feature_names = list(high_cardinality_features)

all_feature_names = numeric_feature_names + low_card_feature_names + high_card_feature_names

X_imputed_encoded_df = pd.DataFrame(X_imputed_encoded, columns=all_feature_names)

In [None]:
# Short aliases for the preprocessed feature frame and the target used by the model cells.
X1, y1 = X_imputed_encoded_df, y

In [None]:
# Final Model: Random Forest tuned with a randomized search followed by a
# narrower grid search around the best randomized candidate.

from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Stratified 80/20 hold-out; the validation part is only used for the final report.
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

# Stage 1 search space (keys are prefixed with the make_pipeline step name).
param_dist = {
    'randomforestclassifier__n_estimators': [50, 100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__class_weight': ['balanced', None]
}

pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=30,
    cv=10,
    scoring='f1',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_params_random = random_search.best_params_
best_model_random = random_search.best_estimator_

print("Best Parameters from RandomizedSearchCV:", best_params_random)

# Stage 2: refine n_estimators (+/- 50) and max_depth (+/- 5) around the stage 1
# optimum while pinning every other hyper-parameter to its stage 1 value.
best_n_estimators = best_params_random['randomforestclassifier__n_estimators']
best_max_depth = best_params_random['randomforestclassifier__max_depth']

if best_max_depth is None:
    # An unlimited depth has no neighbours. Listing None three times (as the
    # previous grid did) only made GridSearchCV fit identical candidates
    # three times over, so a single entry is used instead.
    max_depth_candidates = [None]
else:
    max_depth_candidates = [max(1, best_max_depth - 5), best_max_depth, best_max_depth + 5]

param_grid = {
    'randomforestclassifier__n_estimators': [
        max(10, best_n_estimators - 50),
        best_n_estimators,
        best_n_estimators + 50],
    'randomforestclassifier__max_depth': max_depth_candidates,
    'randomforestclassifier__min_samples_split': [best_params_random['randomforestclassifier__min_samples_split']],
    'randomforestclassifier__min_samples_leaf': [best_params_random['randomforestclassifier__min_samples_leaf']],
    'randomforestclassifier__bootstrap': [best_params_random['randomforestclassifier__bootstrap']],
    'randomforestclassifier__class_weight': [best_params_random.get('randomforestclassifier__class_weight', None)]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='f1',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params_grid = grid_search.best_params_
# best_model_grid is the model later used to predict on the test set.
best_model_grid = grid_search.best_estimator_

print("Best Parameters from GridSearchCV:", best_params_grid)

# Score the tuned Random Forest on the held-out validation split.
y_pred = best_model_grid.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Validation Accuracy after GridSearchCV: {accuracy:.4f}")
print("\nClassification Report:\n", report)
# "\C" is not a valid escape sequence (it printed a literal backslash and
# triggers a SyntaxWarning on recent Python); a newline was intended.
print("\nConfusion Matrix:\n", confusion_matrix(y_val,y_pred))

In [None]:
# SGD Model: logistic-loss linear classifier tuned with a randomized search
# followed by a grid search over alpha around the best randomized candidate.

from sklearn.linear_model import SGDClassifier

# Stage 1 search space (keys are prefixed with the make_pipeline step name).
param_dist = {
    'sgdclassifier__alpha': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
    'sgdclassifier__penalty': ['l2', 'l1', 'elasticnet'],
    'sgdclassifier__max_iter': [1000, 3000, 5000, 10000, 20000],
    'sgdclassifier__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'sgdclassifier__eta0': [0.0001, 0.001, 0.01],
    'sgdclassifier__early_stopping': [True],
    'sgdclassifier__validation_fraction': [0.1, 0.2],
    'sgdclassifier__class_weight': ['balanced', None],
    'sgdclassifier__tol': [1e-4, 1e-3, 1e-2]
}

pipeline = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss', random_state=42))

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=30,
    cv=10,
    scoring='f1',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_params_random = random_search.best_params_
best_model_random = random_search.best_estimator_

print("Best Parameters from RandomizedSearchCV:", best_params_random)

# Use the best parameters from RandomizedSearchCV to define a narrower grid for GridSearchCV:
# alpha is varied one decade either side, everything else is pinned.
best_alpha = best_params_random['sgdclassifier__alpha']

param_grid = {
    'sgdclassifier__alpha': [best_alpha * 0.1,
                             best_alpha,
                             best_alpha * 10],
    'sgdclassifier__penalty': [best_params_random['sgdclassifier__penalty']],
    'sgdclassifier__max_iter': [best_params_random['sgdclassifier__max_iter']],
    'sgdclassifier__learning_rate': [best_params_random['sgdclassifier__learning_rate']],
    'sgdclassifier__eta0': [best_params_random['sgdclassifier__eta0']],
    'sgdclassifier__early_stopping': [True],
    'sgdclassifier__validation_fraction': [best_params_random['sgdclassifier__validation_fraction']],
    'sgdclassifier__class_weight': [best_params_random.get('sgdclassifier__class_weight', None)],
    # tol was tuned in stage 1 but missing from this grid, so the refinement
    # silently fell back to the estimator's default tolerance. Pin it as well
    # so that stage 2 really refines the stage 1 winner.
    'sgdclassifier__tol': [best_params_random['sgdclassifier__tol']]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='f1',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params_grid = grid_search.best_params_
# Stored under its own name so that best_model_grid keeps pointing at the Random Forest.
best_model_grid_sgd = grid_search.best_estimator_

print("Best Parameters from GridSearchCV:", best_params_grid)

# Score the tuned SGD classifier on the held-out validation split.
y_pred = best_model_grid_sgd.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Validation Accuracy after GridSearchCV: {accuracy:.4f}")
print("\nClassification Report:\n", report)
# "\C" is not a valid escape sequence (it printed a literal backslash and
# triggers a SyntaxWarning on recent Python); a newline was intended.
print("\nConfusion Matrix:\n", confusion_matrix(y_val,y_pred))

In [None]:
# Logistic Regression Model
#
# Builds its own feature matrix from `data` (it does not reuse X1): impute,
# label-encode, scale, add degree-2 polynomial terms of the numeric columns,
# keep the 50 best-scoring columns, then fit an L1-penalised logistic
# regression and report on a stratified 20% validation split.
#
# NOTE(review): every preprocessing step and the SelectKBest selection below
# are fitted on all rows, and only afterwards is the train/validation split
# made, so validation rows (including their labels, via f_classif) influence
# the features. Confirm the reported validation score is meant to be read
# with that caveat.

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif

# Start again from the raw frame; all columns except the target are kept.
X_log = data.drop(columns=['target'])
y_log = data['target']

# These rebind the notebook-level numeric_features / categorical_features names.
numeric_features = X_log.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_log.select_dtypes(include=['object']).columns

# Median for numeric gaps, most frequent level for categorical gaps.
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

X_log[numeric_features] = numeric_imputer.fit_transform(X_log[numeric_features])
X_log[categorical_features] = categorical_imputer.fit_transform(X_log[categorical_features])

# One LabelEncoder per categorical column, kept in a dict keyed by column name.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X_log[col] = le.fit_transform(X_log[col].astype(str))
    label_encoders[col] = le

# Only the numeric columns are standardised; the label-encoded ones are left as integer codes.
scaler = StandardScaler()
X_log[numeric_features] = scaler.fit_transform(X_log[numeric_features])

# Degree-2 polynomial expansion of the scaled numeric columns, given generic poly_<i> names.
# NOTE(review): the expansion presumably also contains the degree-1 terms,
# which would duplicate the numeric columns already present in X_log — confirm.
poly = PolynomialFeatures(degree=2, include_bias=False)
numeric_poly = poly.fit_transform(X_log[numeric_features])
poly_feature_names = [f'poly_{i}' for i in range(numeric_poly.shape[1])]
X_poly = pd.DataFrame(numeric_poly, columns=poly_feature_names)

# Column-wise concat aligns on the index; X_poly carries a fresh default index.
# NOTE(review): assumes X_log (i.e. `data`) also has a default 0..n-1 index — confirm.
X_log = pd.concat([X_log, X_poly], axis=1)

selector = SelectKBest(score_func=f_classif, k=50)  # keep the 50 columns with the highest f_classif score
X_selected = selector.fit_transform(X_log, y_log)
selected_features_mask = selector.get_support()

# Translate the boolean support mask back into column names.
all_features = list(X_log.columns)
selected_features = [feature for feature, selected in zip(all_features, selected_features_mask) if selected]

X_log = X_log[selected_features]

# Rebinds X_train / X_val / y_train / y_val, replacing the split made from X1 by the earlier model cells.
X_train, X_val, y_train, y_val = train_test_split(X_log, y_log, test_size=0.2, random_state=42, stratify=y_log)

# 'balanced' class weights, computed from the full label vector y_log (not only y_train).
class_weights = compute_class_weight('balanced', classes=np.unique(y_log), y=y_log)
class_weight_dict = dict(zip(np.unique(y_log), class_weights))

lr_model = LogisticRegression(
    max_iter=1000,
    class_weight=class_weight_dict,
    C=0.1,
    solver='saga',
    penalty='l1',
    random_state=42
)

lr_model.fit(X_train, y_train)

# Validation report.
y_pred = lr_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n",confusion_matrix(y_val,y_pred))

In [None]:
# Other Models: untuned Gradient Boosting and MLP baselines on the preprocessed features.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Recreate the stratified 80/20 split from X1 / y1.
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

classifiers = [
    ('GB', GradientBoostingClassifier(random_state=42)),
    ('MLP', MLPClassifier(random_state=42)),
]

# Fit each model behind a StandardScaler and print the same three-part report.
for name, clf in classifiers:
    pipeline = make_pipeline(StandardScaler(), clf).fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    score = pipeline.score(X_val, y_val)
    print(f"{name} Accuracy: {score:.4f}")
    print("\nClassification Report:\n", classification_report(y_val, y_pred))
    print("\nConfusion Matrix:\n",confusion_matrix(y_val,y_pred))

In [None]:
# I also tried hyperparameter tuning for the Gradient Boosting model, but it took far too long to run, so I decided to use only the Random Forest as my final model.
"""
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))

param_dist = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200, 300],
    'gradientboostingclassifier__max_depth': [3, 5, 7, 10],
    'gradientboostingclassifier__min_samples_split': [2, 5, 10],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'gradientboostingclassifier__learning_rate': [0.01, 0.05, 0.1],
    'gradientboostingclassifier__subsample': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='f1',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_params_random = random_search.best_params_

param_grid = {
    'gradientboostingclassifier__n_estimators': [
        max(10, best_params_random['gradientboostingclassifier__n_estimators'] - 50),
        best_params_random['gradientboostingclassifier__n_estimators'],
        best_params_random['gradientboostingclassifier__n_estimators'] + 50],
    
    'gradientboostingclassifier__max_depth': [
        max(1, best_params_random['gradientboostingclassifier__max_depth'] - 2),
        best_params_random['gradientboostingclassifier__max_depth'],
        best_params_random['gradientboostingclassifier__max_depth'] + 2],

    'gradientboostingclassifier__min_samples_split': [best_params_random['gradientboostingclassifier__min_samples_split']],
    'gradientboostingclassifier__min_samples_leaf': [best_params_random['gradientboostingclassifier__min_samples_leaf']],
    'gradientboostingclassifier__learning_rate': [best_params_random['gradientboostingclassifier__learning_rate']],
    'gradientboostingclassifier__subsample': [best_params_random['gradientboostingclassifier__subsample']]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model_grid = grid_search.best_estimator_

y_pred = best_model_grid.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Validation Accuracy after GridSearchCV: {accuracy:.4f}")
print("\nClassification Report:\n", report)
"""

In [None]:
# Load test data
test_data = pd.read_csv('test.csv')

# Remove the columns that were also removed from the training features
# (plus 'target', should it be present); names missing from the file are ignored.
columns_to_discard = [
    'target', 'MachineID',
    'OSSkuFriendlyName', 'PlatformType', 'OSArchitecture', 'OSBuildNumberOnly',
    'NumericOSVersion', 'OSInstallLanguageID', 'OSProductSuite',
    'IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled',
]
test_data = test_data.drop(columns=columns_to_discard, errors='ignore')

In [None]:
# Run the test rows through the already-fitted preprocessor
# (transform only, never fit_transform, so nothing is re-learned from test data).
X_test_imputed_encoded = preprocessor.transform(test_data)

# Wrap the transformed array in a DataFrame carrying the training feature names.
X_test_imputed_encoded_df = pd.DataFrame(
    data=X_test_imputed_encoded,
    columns=all_feature_names,
)

# Predict with the tuned Random Forest pipeline.
y_pred = best_model_grid.predict(X_test_imputed_encoded_df)

In [None]:
# Compare the final model's test predictions with the published ground truth.
from sklearn.metrics import accuracy_score

true_labels = pd.read_csv("true_labels.csv")
y_test = true_labels['target']
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

In [None]:
# Milestone 1

# Every figure below is computed on rows without a missing value in any column.
data_cleaned = data.dropna()

# Distinct OS versions.
unique_os_versions = data_cleaned['OSVersion'].nunique()
print(f"Number of unique OS versions: {unique_os_versions}")

# Largest number of antivirus products on one machine.
max_antivirus_installed = data_cleaned['NumAntivirusProductsInstalled'].max()
print(f"Maximum value of NumAntivirusProductsInstalled: {max_antivirus_installed}")

# Gamer machines on which malware was detected.
is_gamer = data_cleaned['IsGamer'] == 1
has_malware = data_cleaned['target'] == 1
gamers_with_malware = len(data_cleaned[is_gamer & has_malware])
print(f"Number of systems owned by gamers where malware was detected: {gamers_with_malware}")

# Modal RealTimeProtectionState among machines with passive mode enabled.
passive_rows = data_cleaned[data_cleaned['IsPassiveModeEnabled'] == 1]
most_frequent_rtp_state = passive_rows['RealTimeProtectionState'].mode()[0]
print(f"Most frequent value of RealTimeProtectionState: {most_frequent_rtp_state}")

# Machines whose primary display is 1366 x 768.
is_1366_wide = data_cleaned['PrimaryDisplayResolutionHorizontal'] == 1366
is_768_high = data_cleaned['PrimaryDisplayResolutionVertical'] == 768
resolution_count = len(data_cleaned[is_1366_wide & is_768_high])

print(f"Number of systems with resolution 1366 x 768: {resolution_count}")

# Median installed RAM.
percentile_50_ram = data_cleaned['TotalPhysicalRAMMB'].quantile(0.5)
print(f"50th percentile value of TotalPhysicalRAMMB: {percentile_50_ram}")

In [None]:
# Milestone 2

df = data.copy()

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Object columns only; those with at most 10 distinct values get one-hot encoded.
cat_df = df.select_dtypes(include=['object'])
cat_level_counts = cat_df.nunique()
low_cardinality_cols = cat_level_counts[cat_level_counts <= 10].index

# drop='first' removes one indicator per column; the output is a dense array.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_cols = encoder.fit_transform(cat_df[low_cardinality_cols])

encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(low_cardinality_cols))

# Replace the low-cardinality originals with their indicator columns.
cat_df = cat_df.drop(columns=low_cardinality_cols)
cat_df = pd.concat([cat_df, encoded_df], axis=1)

# A bare `cat_df.shape[1]` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is displayed), so print it explicitly.
print("Number of columns in cat_df after one-hot encoding:", cat_df.shape[1])

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every int64/float64 column of `data`, then add up all scaled values.
num_df = data.select_dtypes(include=['int64', 'float64'])

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(num_df)
num_df_scaled = pd.DataFrame(scaled_values, columns=num_df.columns)

column_totals = num_df_scaled.sum()
total_sum = column_totals.sum()

print("The sum of all values in num_df after scaling is:", total_sum)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Baseline: most-frequent imputation, ordinal encoding, default SGDClassifier.
y = df['target']
X = df.drop(columns=['target'])

# Fill every column (numeric and categorical alike) with its most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(X)
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

# Ordinal-encode the columns that were object-typed in X.
object_columns = X.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder()
X_imputed[object_columns] = encoder.fit_transform(X_imputed[object_columns])

# Plain (non-stratified) 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

clf = SGDClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

In [None]:
# Milestone 3

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Fresh feature matrix straight from the raw frame.
y = data['target']
X = data.drop(columns=['target'])

categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Impute: most frequent level for categoricals, mean for numerics.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

# Integer-code each categorical column (the single encoder is refitted per column).
label_encoder = LabelEncoder()
for col in categorical_cols:
    encoded_column = label_encoder.fit_transform(X[col])
    X[col] = encoded_column

# Standardise the columns that were numeric before encoding.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Decision Tree: 3-fold grid search scored on accuracy ---
dt_model = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': [20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search_dt = GridSearchCV(dt_model, param_grid_dt, scoring='accuracy', cv=3)
grid_search_dt.fit(X_train, y_train)

best_params_dt = grid_search_dt.best_params_
best_dt_model = grid_search_dt.best_estimator_
val_predictions_dt = best_dt_model.predict(X_val)
val_accuracy_dt = accuracy_score(y_val, val_predictions_dt)

print("Best Parameters for Decision Tree:", best_params_dt)
print("Validation Accuracy (Decision Tree): {:.5f}".format(val_accuracy_dt))
print(round(val_accuracy_dt,2))

# --- AdaBoost: 3-fold grid search scored on accuracy ---
adaboost_model = AdaBoostClassifier(random_state=42)
param_grid_ada = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [5, 10],
    'algorithm': ['SAMME']
}

grid_search_ada = GridSearchCV(adaboost_model, param_grid_ada, scoring='accuracy', cv=3)
grid_search_ada.fit(X_train, y_train)

best_params_ada = grid_search_ada.best_params_
best_ada_model = grid_search_ada.best_estimator_
val_predictions_ada = best_ada_model.predict(X_val)
val_accuracy_ada = accuracy_score(y_val, val_predictions_ada)

print("Best Parameters for AdaBoost:", best_params_ada)
print("Validation Accuracy (AdaBoost): {:.5f}".format(val_accuracy_ada))
print(round(val_accuracy_ada,2))

In [None]:
# Milestone 4

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import mean_squared_error

# Complete rows only.
data_clean = data.dropna()

y = data_clean['target']
X = data_clean.drop(columns=['target'])

# Ordinal-encode the object/category columns, then standardise every column.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full PCA: find the first component count whose cumulative variance ratio reaches 70%.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

cumulative_variance = pca.explained_variance_ratio_.cumsum()
reaches_70_percent = cumulative_variance >= 0.70
components_70 = np.argmax(reaches_70_percent) + 1
print(f"Number of components to explain 70% variance: {components_70}")

# Project onto that many components, map back, and measure the reconstruction error.
pca_70 = PCA(n_components=components_70, svd_solver='full')
X_reduced_70 = pca_70.fit_transform(X_scaled)
X_reconstructed = pca_70.inverse_transform(X_reduced_70)
mse_70 = mean_squared_error(X_scaled, X_reconstructed)
print(f"MSE after reconstruction with {components_70} components: {mse_70}")

# Share of variance captured by a 40-component PCA.
pca_40 = PCA(n_components=40, svd_solver='full')
pca_40.fit(X_scaled)
variance_40 = pca_40.explained_variance_ratio_.sum()
print(f"Variance explained by 40 components: {variance_40}")

# Univariate selection: the 15 columns with the highest f_classif score.
selector = SelectKBest(score_func=f_classif, k=15)
X_selected = selector.fit_transform(X_scaled, y)
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print(f"Top 15 selected features: {list(selected_features)}")

# Which of these candidate columns did not make the top 15?
to_check = [
    'AntivirusConfigID',
    'RegionIdentifier',
    'OSBuildNumber',
    'DateAS',
    'SignatureVersion',
    'PowerPlatformRole',
    'IsPenCapable'
]
not_selected = [candidate for candidate in to_check if candidate not in selected_features]
print(f"Columns NOT in top 15 features: {not_selected}")

# Highest-scoring feature among the selected ones.
scores = selector.scores_[selected_indices]
best_score_idx = scores.argmax()
best_feature = selected_features[best_score_idx]
best_score = scores[best_score_idx]
print(f"Best feature: {best_feature} with a score of {best_score}")

from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

# Compare Lasso and Ridge coefficients on the k best features of the
# preprocessed matrix. This section relies on X1 / y1 built by the
# preprocessing cells earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# The selector is fitted on the training part only and then applied to the test part.
k = 10
selector = SelectKBest(score_func=f_regression, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Report the real column names of the selected features. X_train is a
# DataFrame, so the positional "Feature_<i>" placeholders used before are
# unnecessary and made the coefficient listing hard to interpret.
selected_feature_indices = selector.get_support(indices=True)
selected_features = [str(name) for name in X_train.columns[selected_feature_indices]]
print(f"Top {k} selected features: {selected_features}")

lasso = Lasso(alpha=1.0)  # Regularization strength (tune alpha as needed)
lasso.fit(X_train_selected, y_train)
lasso_coefficients = lasso.coef_

print("\nLasso Coefficients:")
for feature, coef in zip(selected_features, lasso_coefficients):
    print(f"{feature}: {coef}")

ridge = Ridge(alpha=1.0)
ridge.fit(X_train_selected, y_train)
ridge_coefficients = ridge.coef_

print("\nRidge Coefficients:")
for feature, coef in zip(selected_features, ridge_coefficients):
    print(f"{feature}: {coef}")

print("\nInsights:")
print("Lasso tends to shrink some coefficients to exactly zero, effectively performing feature selection.")
print("Ridge shrinks coefficients but does not set them to zero, which can be useful when all features are informative.")

In [None]:
# Milestone 5

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Work on a copy of the raw frame.
df = data.copy()

y = df['target']
X = df.drop(columns=['target'])

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Impute: most frequent level for categoricals, mean for numerics.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

# Ordinal codes for the categoricals, standard scaling for the numerics.
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Plain (non-stratified) 80/20 split shared by the three models below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model - 1: default Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_dt = confusion_matrix(y_test, y_pred_dt)

precision_class0_dt = precision_score(y_test, y_pred_dt, pos_label=0)
incorrect_class1_as_0_dt = cm_dt[1, 0]   # true 1 predicted as 0
correct_classifications_dt = np.trace(cm_dt)   # the diagonal holds the correct predictions

print("Decision Tree Results:")
print(f"Correctly classified samples: {correct_classifications_dt}")
print(f"Class 1 incorrectly classified as 0: {incorrect_class1_as_0_dt}")
print(f"Precision for class 0: {precision_class0_dt:.3f}")

# Model - 2: AdaBoost with 10 estimators and learning_rate=10
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, recall_score

ada_model = AdaBoostClassifier(n_estimators=10, learning_rate=10, random_state=42)
ada_model.fit(X_train, y_train)

y_pred_ada = ada_model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_ada = confusion_matrix(y_test, y_pred_ada)

recall_class1_ada = recall_score(y_test, y_pred_ada, pos_label=1)
incorrect_class1_as_0_ada = cm_ada[1, 0]   # true 1 predicted as 0
correct_classifications_ada = np.trace(cm_ada)   # the diagonal holds the correct predictions

print("\nAdaBoost Results:")
print(f"Correctly classified samples: {correct_classifications_ada}")
print(f"Class 1 incorrectly classified as 0: {incorrect_class1_as_0_ada}")
print(f"Recall for class 1: {recall_class1_ada:.2f}")

# Model - 3: Logistic Regression with default settings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score

logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

y_pred_logreg = logreg_model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_logreg = confusion_matrix(y_test, y_pred_logreg)

recall_class1_logreg = recall_score(y_test, y_pred_logreg, pos_label=1)
incorrect_class0_as_1_logreg = cm_logreg[0, 1]   # true 0 predicted as 1
correct_classifications_logreg = np.trace(cm_logreg)   # the diagonal holds the correct predictions

print("\nLogistic Regression Results:")
print(f"Correctly classified samples: {correct_classifications_logreg}")
print(f"Class 0 incorrectly classified as 1: {incorrect_class0_as_1_logreg}")
print(f"Recall for class 1: {recall_class1_logreg:.3f}")