In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# --- Load the benchmark feature table ---
df = pd.read_csv('/kaggle/input/dataw2/benchmark_ml_features.csv')

# --- Separate the target from the predictors ---
# 'App' is dropped together with 'Label', so it is not used as a feature.
y = df['Label']
X = df.drop(['App', 'Label'], axis=1)

# --- Hold out 30% of the rows for evaluation (fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- Fit a 100-tree random forest on the training portion ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Report per-class precision / recall / F1 on the held-out rows ---
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Persisting the trained model is left disabled:
# joblib.dump(model, 'malware_detector_model.pkl')


In [8]:
# Report the library versions this notebook was executed with.
# joblib is imported here as well: the cell prints joblib.__version__, and
# without a local import it only works when an earlier cell has already run.
import sklearn
import xgboost
import numpy
import cython
import scipy
import joblib

print(sklearn.__version__)
print(xgboost.__version__)
print(numpy.__version__)
print(joblib.__version__)


0.21.3
0.90
1.16.4
0.13.2


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib
# 1. Load the benchmark feature table. Every column except 'App' and 'Label'
#    is used as a model input; 'Label' is the target.
df = pd.read_csv('/kaggle/input/dataw2/benchmark_ml_features.csv')
X = df.drop(['App', 'Label'], axis=1)
y = df['Label']

# 2. Split: 25% of the rows are held out, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# 3. Scale: the scaler is fitted on the training split only and then applied
#    unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 4. SMOTE: oversample the scaled training split; the test split is untouched.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train_scaled, y_train)

# 5. XGBoost + GridSearch: 3 x 2 x 2 = 12 parameter combinations, each scored
#    with F1 over 5 shuffled stratified folds.
# NOTE(review): the folds are cut from the already-resampled data, so a
# validation fold can contain SMOTE-generated rows; the CV scores may be
# optimistic. Confirm whether resampling should happen inside each fold.
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 300]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1
    ),
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
    n_jobs=-1
)
grid.fit(X_resampled, y_resampled)
best_model = grid.best_estimator_

# 6. Evaluate the selected model on the scaled test split (default decision rule).
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Threshold tuning: sweep cut-offs 0.25 .. 0.75 in steps of 0.01 over the
#    positive-class probability and keep the first threshold with the highest F1.
#    If no threshold scores above 0, the defaults (F1 0, threshold 0.5) remain.
# NOTE(review): the threshold is chosen against y_test, so the printed
# "Best F1" is selected and measured on the same rows.
y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
best_f1, best_thr = 0, 0.5
for thr in [i / 100 for i in range(25, 76)]:
    preds = (y_proba >= thr).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1:
        best_f1, best_thr = score, thr
print(f'Best F1 {best_f1:.3f} at threshold {best_thr}')

# Persisting the model and scaler is currently disabled:
# joblib.dump(best_model, 'detect_malware_x.pkl')
# joblib.dump(scaler, 'scaler_x.pkl')


Using TensorFlow backend.


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 15.4min finished


              precision    recall  f1-score   support

           0      0.737     0.750     0.743        56
           1      0.745     0.732     0.739        56

    accuracy                          0.741       112
   macro avg      0.741     0.741     0.741       112
weighted avg      0.741     0.741     0.741       112

Confusion matrix:
 [[42 14]
 [15 41]]
Best F1 0.820 at threshold 0.33


TypeError: save_model() takes 2 positional arguments but 3 were given

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
import joblib

# 1. Load the benchmark table and separate predictors from the target
df = pd.read_csv('/kaggle/input/dataw2/benchmark_ml_features.csv')
y = df['Label']
X = df.drop(columns=['App', 'Label'])

# 2. Stratified 75/25 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# 3. Standardise: learn the statistics on the training split only,
#    then apply them to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Oversample the scaled training split with SMOTE
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# 5. Grid search over LightGBM hyper-parameters, ranked by F1 (5 stratified folds)
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 300],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base_estimator = LGBMClassifier(random_state=42, n_jobs=-1)
grid = GridSearchCV(
    base_estimator,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
    n_jobs=-1,
)
grid.fit(X_resampled, y_resampled)
best_model = grid.best_estimator_

# 6. Report on the test split using the model's default decision rule
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Sweep thresholds 0.25 .. 0.75 (step 0.01) on the positive-class
#    probability; the first threshold reaching the highest F1 is kept
y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
best_f1, best_thr = 0, 0.5
for pct in range(25, 76):
    thr = pct / 100
    preds = (y_proba >= thr).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1:
        best_f1 = score
        best_thr = thr
print(f'Best F1 {best_f1:.3f} at threshold {best_thr}')

# 8. Saving is currently switched off
# joblib.dump(best_model, 'detect_malware_lightgbm.pkl')
# joblib.dump(scaler, 'scaler_lightgbm.pkl')

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.2min


              precision    recall  f1-score   support

           0      0.732     0.732     0.732        56
           1      0.732     0.732     0.732        56

    accuracy                          0.732       112
   macro avg      0.732     0.732     0.732       112
weighted avg      0.732     0.732     0.732       112

Confusion matrix:
 [[41 15]
 [15 41]]
Best F1 0.803 at threshold 0.33


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 11.4min finished


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

# 1. Load the Excel feature table; every column except 'App' and 'Label' is a
#    model input and 'Label' is the target.
df = pd.read_excel('/kaggle/input/final-dataset/final_apis.xlsx')
X = df.drop(['App', 'Label'], axis=1)
y = df['Label']

# Check initial data: shapes, per-column NaN counts, presence of +/-inf and
# the distinct label values.
print("Original X shape:", X.shape)
print("Original y shape:", y.shape)
print("NaN values in X:\n", X.isna().sum())
print("Infinite values in X:", np.any(np.isinf(X)))
print("Unique values in y:", y.unique())

# Handle NaN and infinite values: +/-inf is first turned into NaN, then every
# NaN is replaced by the median of its column.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split below, so test rows influence the imputed values.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())  # Fill NaN with column median
print("NaN values after filling:\n", X.isna().sum())
print("Infinite values after filling:", np.any(np.isinf(X)))
print("X shape after cleaning:", X.shape)

# Remove (near-)constant columns: anything whose variance is below 1e-10.
# The variances are likewise computed over all rows, before the split.
variances = X.var()
low_variance_cols = variances[variances < 1e-10].index
X = X.drop(columns=low_variance_cols)
print("Dropped low-variance columns:", low_variance_cols)

# Ensure dataset is not empty. None of the cleaning steps above removes rows,
# so this only triggers when the loaded table itself has no rows.
if X.shape[0] == 0:
    raise ValueError("Dataset is empty after cleaning. Check data for excessive NaN/infinite values.")

# 2. Split: half of the rows are held out, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=42
)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# 3. Scale: fitted on the training split only, then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check scaled data for NaN / inf before it is handed to SMOTE.
print("NaN in X_train_scaled:", np.any(np.isnan(X_train_scaled)))
print("Infinite in X_train_scaled:", np.any(np.isinf(X_train_scaled)))

# 4. SMOTE: oversample the scaled training split; the test split is untouched.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train_scaled, y_train)

# 5. XGBoost + GridSearch: 3 x 2 x 2 = 12 parameter combinations, each scored
#    with F1 over 5 shuffled stratified folds.
# NOTE(review): the folds are cut from the already-resampled data, so a
# validation fold can contain SMOTE-generated rows; CV scores may be optimistic.
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 300]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    XGBClassifier(
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ),
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
    n_jobs=-1
)
grid.fit(X_resampled, y_resampled)
best_model = grid.best_estimator_

# 6. Evaluate the selected model on the scaled test split (default decision rule).
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Threshold tuning: sweep cut-offs 0.25 .. 0.75 in steps of 0.01 and keep
#    the first threshold with the highest F1.
# NOTE(review): the threshold is chosen against y_test, so the printed
# "Best F1" is selected and measured on the same rows.
y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
best_f1, best_thr = 0, 0.5
for thr in [i / 100 for i in range(25, 76)]:
    preds = (y_proba >= thr).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1:
        best_f1, best_thr = score, thr
print(f'Best F1 {best_f1:.3f} at threshold {best_thr}')

# Save models (written to the current working directory).
# NOTE(review): the list of columns kept after the low-variance filter is not
# saved here; anything that reuses the scaler must rebuild the same columns in
# the same order.
joblib.dump(best_model, 'detect_malware_x.pkl')
joblib.dump(scaler, 'scaler_x.pkl')

Using TensorFlow backend.


Original X shape: (224, 474)
Original y shape: (224,)
NaN values in X:
 Callback_onCreate                 0
Callback_onDestroy                0
Callback_onReceive                0
Callback_onLowMemory              0
Callback_onCreateContextMenu      0
                               ... 
Callback_onDraw_ViewPager       179
Callback_onLowMemory_2          179
Callback_onMeasure_2            179
Callback_onTouchEvent_2         179
Callback_onDraw_2               179
Length: 474, dtype: int64
Infinite values in X: False
Unique values in y: [0 1]
NaN values after filling:
 Callback_onCreate               0
Callback_onDestroy              0
Callback_onReceive              0
Callback_onLowMemory            0
Callback_onCreateContextMenu    0
                               ..
Callback_onDraw_ViewPager       0
Callback_onLowMemory_2          0
Callback_onMeasure_2            0
Callback_onTouchEvent_2         0
Callback_onDraw_2               0
Length: 474, dtype: int64
Infinite values after fil

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [2]:
# Write the fitted booster in XGBoost's native format.
# get_booster() is the public accessor for the underlying Booster; the
# underscore-prefixed `_Booster` attribute is an implementation detail.
# Requires `best_model` from a completed training cell.
best_model.get_booster().save_model('detect_malware_1_new.xgb')

In [1]:
import pandas as pd
import joblib
import numpy as np

# Feature columns expected in src.txt. The parsed feature frame is built from
# this list and handed to the scaler and model further down in this cell.
# presumably these are the same names, in the same order, as the columns the
# scaler/model were fitted on — verify against the training table.
FEATURE_NAMES = [
    "VIDEO", "BLUETOOTH_INFORMATION", "CALENDAR_INFORMATION", "SMS_MMS", "ACCOUNT_INFORMATION",
    "EMAIL_INFORMATION", "FILE_INFORMATION", "SYNCHRONIZATION_DATA", "PHONE_CONNECTION", "NETWORK",
    "AUDIO", "IMAGE", "ACCOUNT_SETTINGS", "VOIP", "FILE", "DATABASE_INFORMATION",
    "NETWORK_INFORMATION", "HARDWARE_INFO", "NFC", "SYSTEM_SETTINGS", "CONTACT_INFORMATION",
    "VOIP_INFORMATION", "PHONE_INFORMATION", "EMAIL_SETTINGS", "PHONE_STATE", "BROWSER_INFORMATION",
    "NO_CATEGORY", "LOG", "BLUETOOTH", "UNIQUE_IDENTIFIER", "INTER_APP_COMMUNICATION",
    "EMAIL", "ALL", "LOCATION_INFORMATION"
]

def parse_inconsistent_src_txt(src_txt_path, feature_names):
    """
    Build a one-row feature frame from a `src.txt` file.

    Each useful line has the form `FEATURE_NAME val1 val2 val3 ...`; the values
    on a line are summed into a single number for that feature.

    Parsing rules:
      * Lines with fewer than two whitespace-separated tokens are skipped.
      * Lines whose first token is not in `feature_names` are reported and skipped.
      * If a feature appears on several lines, the last line wins (sums are
        not accumulated across lines).
      * A line with an unparsable value, or whose sum is NaN/infinite, sets the
        feature to 0.0 and prints a warning, so the result is always finite.
      * Features that never appear in the file stay at 0.0.

    Args:
        src_txt_path (str): Path to your src.txt
        feature_names (list): List of features to consider.

    Returns:
        pd.DataFrame: One row of floats whose columns are exactly
        `feature_names`, in that order; None if the file could not be read.
    """
    feature_names = list(feature_names)
    # Float defaults keep every column the same dtype as the parsed sums.
    feature_dict = {feat: 0.0 for feat in feature_names}

    try:
        with open(src_txt_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # split() with no argument never yields empty tokens.
                parts = line.split()
                if len(parts) < 2:
                    continue
                feat_name = parts[0]
                if feat_name not in feature_dict:
                    print(f"Warning: Unknown feature '{feat_name}' ignored.")
                    continue
                # Convert the rest to numbers safely
                try:
                    total = float(np.sum([float(x) for x in parts[1:]]))
                except ValueError as e:
                    print(f"Warning: Could not parse values for '{feat_name}': {e}. Setting to 0.")
                    total = 0.0
                # "nan"/"inf" parse as floats; neither is a usable model input.
                if not np.isfinite(total):
                    print(f"Warning: Non-finite value for '{feat_name}'. Setting to 0.")
                    total = 0.0
                feature_dict[feat_name] = total
    except Exception as e:
        print(f"Error parsing src.txt at '{src_txt_path}': {e}")
        return None

    # Passing `columns` pins the column order to feature_names instead of
    # relying on dict ordering inside the DataFrame constructor.
    return pd.DataFrame([feature_dict], columns=feature_names)

# --- SET THESE PATHS ---
MODEL_PATH = '/kaggle/input/maal/scikitlearn/default/1/detect_malware_x.pkl'  # Path to your trained model
SCALER_PATH = '/kaggle/input/scal/scikitlearn/default/1/scaler_x.pkl'         # Path to your scaler
SRC_TXT_PATH = '/kaggle/input/malicious80/src.txt'                            # Path to your src.txt

# --- PREDICTION CODE ---
# Fatal problems stop the cell with `raise SystemExit(1)` rather than `exit()`:
# `exit()` is an interactive-shell helper and, inside a Jupyter kernel, shuts
# the whole kernel down instead of just ending this cell.
#
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load artifacts that come from a source you trust.
try:
    model = joblib.load(MODEL_PATH)
    scaler = joblib.load(SCALER_PATH)
except Exception as e:
    print(f"Error loading model or scaler: {e}")
    raise SystemExit(1)

# Parse features
fv = parse_inconsistent_src_txt(SRC_TXT_PATH, FEATURE_NAMES)

# Check if parsing was successful (None means the file could not be read)
if fv is None or fv.empty:
    print("Error: Feature vector is None or empty. Check src.txt file and parsing logic.")
    raise SystemExit(1)

# Verify the DataFrame
print("Parsed Features:", fv)

# Ensure the DataFrame has one column per expected feature
if fv.shape[1] != len(FEATURE_NAMES):
    print(f"Error: Expected {len(FEATURE_NAMES)} features, got {fv.shape[1]}.")
    raise SystemExit(1)

# Scale and predict. Errors here are reported but do not abort the cell.
try:
    fv_scaled = scaler.transform(fv)
    pred = model.predict(fv_scaled)[0]
    proba = model.predict_proba(fv_scaled)[0][1]  # Probability of malware
    print(f"Prediction: {'Malicious' if pred == 1 else 'Benign'}")
    print(f"Malicious probability: {proba:.4f}")
except Exception as e:
    print(f"Error during scaling or prediction: {e}")

Parsed Features:    VIDEO  BLUETOOTH_INFORMATION  CALENDAR_INFORMATION  SMS_MMS  \
0    9.0                    9.0                   9.0      9.0   

   ACCOUNT_INFORMATION  EMAIL_INFORMATION  FILE_INFORMATION  \
0                  9.0                9.0               9.0   

   SYNCHRONIZATION_DATA  PHONE_CONNECTION  NETWORK  ...  PHONE_STATE  \
0                   9.0               9.0      9.0  ...          9.0   

   BROWSER_INFORMATION  NO_CATEGORY  LOG  BLUETOOTH  UNIQUE_IDENTIFIER  \
0                  9.0          9.0  9.0        9.0                9.0   

   INTER_APP_COMMUNICATION  EMAIL  ALL  LOCATION_INFORMATION  
0                      9.0    9.0  9.0                  17.0  

[1 rows x 34 columns]
Prediction: Malicious
Malicious probability: 0.9720


In [2]:
import pandas as pd
import joblib
import numpy as np

# Feature columns expected in src.txt. The parsed feature frame is built from
# this list and handed to the scaler and model further down in this cell.
# presumably these are the same names, in the same order, as the columns the
# scaler/model were fitted on — verify against the training table.
FEATURE_NAMES = [
    "VIDEO", "BLUETOOTH_INFORMATION", "CALENDAR_INFORMATION", "SMS_MMS", "ACCOUNT_INFORMATION",
    "EMAIL_INFORMATION", "FILE_INFORMATION", "SYNCHRONIZATION_DATA", "PHONE_CONNECTION", "NETWORK",
    "AUDIO", "IMAGE", "ACCOUNT_SETTINGS", "VOIP", "FILE", "DATABASE_INFORMATION",
    "NETWORK_INFORMATION", "HARDWARE_INFO", "NFC", "SYSTEM_SETTINGS", "CONTACT_INFORMATION",
    "VOIP_INFORMATION", "PHONE_INFORMATION", "EMAIL_SETTINGS", "PHONE_STATE", "BROWSER_INFORMATION",
    "NO_CATEGORY", "LOG", "BLUETOOTH", "UNIQUE_IDENTIFIER", "INTER_APP_COMMUNICATION",
    "EMAIL", "ALL", "LOCATION_INFORMATION"
]

def parse_inconsistent_src_txt(src_txt_path, feature_names):
    """
    Build a one-row feature frame from a `src.txt` file.

    Each useful line has the form `FEATURE_NAME val1 val2 val3 ...`; the values
    on a line are summed into a single number for that feature.

    Parsing rules:
      * Lines with fewer than two whitespace-separated tokens are skipped.
      * Lines whose first token is not in `feature_names` are reported and skipped.
      * If a feature appears on several lines, the last line wins (sums are
        not accumulated across lines).
      * A line with an unparsable value, or whose sum is NaN/infinite, sets the
        feature to 0.0 and prints a warning, so the result is always finite.
      * Features that never appear in the file stay at 0.0.

    Args:
        src_txt_path (str): Path to your src.txt
        feature_names (list): List of features to consider.

    Returns:
        pd.DataFrame: One row of floats whose columns are exactly
        `feature_names`, in that order; None if the file could not be read.
    """
    feature_names = list(feature_names)
    # Float defaults keep every column the same dtype as the parsed sums.
    feature_dict = {feat: 0.0 for feat in feature_names}

    try:
        with open(src_txt_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # split() with no argument never yields empty tokens.
                parts = line.split()
                if len(parts) < 2:
                    continue
                feat_name = parts[0]
                if feat_name not in feature_dict:
                    print(f"Warning: Unknown feature '{feat_name}' ignored.")
                    continue
                # Convert the rest to numbers safely
                try:
                    total = float(np.sum([float(x) for x in parts[1:]]))
                except ValueError as e:
                    print(f"Warning: Could not parse values for '{feat_name}': {e}. Setting to 0.")
                    total = 0.0
                # "nan"/"inf" parse as floats; neither is a usable model input.
                if not np.isfinite(total):
                    print(f"Warning: Non-finite value for '{feat_name}'. Setting to 0.")
                    total = 0.0
                feature_dict[feat_name] = total
    except Exception as e:
        print(f"Error parsing src.txt at '{src_txt_path}': {e}")
        return None

    # Passing `columns` pins the column order to feature_names instead of
    # relying on dict ordering inside the DataFrame constructor.
    return pd.DataFrame([feature_dict], columns=feature_names)

# --- SET THESE PATHS ---
MODEL_PATH = '/kaggle/input/maal/scikitlearn/default/1/detect_malware_x.pkl'  # Path to your trained model
SCALER_PATH = '/kaggle/input/scal/scikitlearn/default/1/scaler_x.pkl'         # Path to your scaler
SRC_TXT_PATH = '/kaggle/input/source/src.txt'                            # Path to your src.txt

# --- PREDICTION CODE ---
# Fatal problems stop the cell with `raise SystemExit(1)` rather than `exit()`:
# `exit()` is an interactive-shell helper and, inside a Jupyter kernel, shuts
# the whole kernel down instead of just ending this cell.
#
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load artifacts that come from a source you trust.
try:
    model = joblib.load(MODEL_PATH)
    scaler = joblib.load(SCALER_PATH)
except Exception as e:
    print(f"Error loading model or scaler: {e}")
    raise SystemExit(1)

# Parse features
fv = parse_inconsistent_src_txt(SRC_TXT_PATH, FEATURE_NAMES)

# Check if parsing was successful (None means the file could not be read)
if fv is None or fv.empty:
    print("Error: Feature vector is None or empty. Check src.txt file and parsing logic.")
    raise SystemExit(1)

# Verify the DataFrame
print("Parsed Features:", fv)

# Ensure the DataFrame has one column per expected feature
if fv.shape[1] != len(FEATURE_NAMES):
    print(f"Error: Expected {len(FEATURE_NAMES)} features, got {fv.shape[1]}.")
    raise SystemExit(1)

# Scale and predict. Errors here are reported but do not abort the cell.
try:
    fv_scaled = scaler.transform(fv)
    pred = model.predict(fv_scaled)[0]
    proba = model.predict_proba(fv_scaled)[0][1]  # Probability of malware
    print(f"Prediction: {'Malicious' if pred == 1 else 'Benign'}")
    print(f"Malicious probability: {proba:.4f}")
except Exception as e:
    print(f"Error during scaling or prediction: {e}")

Parsed Features:    VIDEO  BLUETOOTH_INFORMATION  CALENDAR_INFORMATION  SMS_MMS  \
0    6.0                    6.0                   6.0      6.0   

   ACCOUNT_INFORMATION  EMAIL_INFORMATION  FILE_INFORMATION  \
0                  6.0                6.0               6.0   

   SYNCHRONIZATION_DATA  PHONE_CONNECTION  NETWORK  ...  PHONE_STATE  \
0                   6.0               6.0      6.0  ...          6.0   

   BROWSER_INFORMATION  NO_CATEGORY  LOG  BLUETOOTH  UNIQUE_IDENTIFIER  \
0                  6.0          6.0  6.0        6.0                6.0   

   INTER_APP_COMMUNICATION  EMAIL  ALL  LOCATION_INFORMATION  
0                      6.0    6.0  6.0                   6.0  

[1 rows x 34 columns]
Prediction: Benign
Malicious probability: 0.0315


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

# 1. Load data
df = pd.read_excel('/kaggle/input/dataaaa/5144fb53-6896-41f6-89fe-d02170b14103.xlsx')

# The app identifier column is named either 'App Name' or 'App'
if 'App Name' in df.columns:
    app_col = 'App Name'
else:
    app_col = 'App'
y = df['Label']
X = df.drop(columns=[app_col, 'Label'])

# 2. Stratified 75/25 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# 3. Standardise: statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Oversample the scaled training split with SMOTE
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# 5. Grid search over XGBoost hyper-parameters, ranked by F1 (5 stratified folds)
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 300],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base_estimator = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
)
grid = GridSearchCV(
    base_estimator,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
    n_jobs=-1,
)
grid.fit(X_resampled, y_resampled)
best_model = grid.best_estimator_

# 6. Report on the test split using the model's default decision rule
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Sweep thresholds 0.25 .. 0.75 (step 0.01) on the positive-class
#    probability; the first threshold reaching the highest F1 is kept
y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
best_f1, best_thr = 0, 0.5
for pct in range(25, 76):
    thr = pct / 100
    preds = (y_proba >= thr).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1:
        best_f1 = score
        best_thr = thr
print(f'Best F1 {best_f1:.3f} at threshold {best_thr}')

# 8. Persist the fitted model and the scaler next to the notebook
joblib.dump(best_model, 'detect_malware.pkl')
joblib.dump(scaler, 'scaler.pkl')

# 9. Persist the feature column names, one per line, in training order
feature_names = list(X.columns)
with open('feature_names.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in feature_names)

print("✅ Model saved as detect_malware.pkl")
print("✅ Scaler saved as scaler.pkl")
print("✅ Feature names saved as feature_names.txt")
print(f"✅ Total features: {len(feature_names)}")


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

def train_malware_detection_model(
    data_path='/kaggle/input/dataw2/benchmark_ml_features.csv',
    save_dir='/kaggle/working/',
):
    """Train, evaluate and persist an XGBoost malware classifier.

    Stages: load the table -> locate the app/label columns -> coerce text
    feature columns to numbers -> stratified 75/25 split -> standard scaling
    (fitted on the training split) -> SMOTE on the scaled training split ->
    grid search (F1, 3 stratified folds) -> report on the test split ->
    save the artifacts.

    Each stage prints its own status. A failing stage aborts the run by
    returning False; the only exception is the evaluation stage, whose failure
    is reported as a warning and does not stop the artifacts from being saved.

    Args:
        data_path (str): Feature table to train on. Files ending in
            '.xlsx'/'.xls' (any case) are read as Excel, anything else as CSV.
            The table needs an app column ('app', 'app name' or 'app_name')
            and a label column ('label', 'class' or 'target'), matched
            case-insensitively; all remaining columns are features.
        save_dir (str): Directory that receives 'detect_malware_small.json',
            'scaler_small.pkl', 'feature_names_small.txt' and
            'model_metadata.txt'. Created if missing.

    Returns:
        bool: True when every artifact was written, False otherwise.
    """
    print("🚀 Starting malware detection model training...")

    # 1. Load and validate data
    try:
        if str(data_path).lower().endswith(('.xlsx', '.xls')):
            df = pd.read_excel(data_path)
        else:
            df = pd.read_csv(data_path)
        print(f"✅ Data loaded successfully: {df.shape}")
    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return False

    # Handle different possible column names. When several columns match the
    # same role, the last one wins. str() keeps non-string headers (possible
    # with Excel input) from raising here.
    app_column = None
    label_column = None

    for col in df.columns:
        col_key = str(col).lower()
        if col_key in ('app', 'app name', 'app_name'):
            app_column = col
        elif col_key in ('label', 'class', 'target'):
            label_column = col

    if app_column is None or label_column is None:
        print(f"❌ Required columns not found. Available columns: {list(df.columns)}")
        return False

    print(f"📊 Using columns - App: '{app_column}', Label: '{label_column}'")

    # Prepare features and target
    X = df.drop([app_column, label_column], axis=1)
    y = df[label_column]

    # Convert non-numeric columns to numeric; values that cannot be parsed
    # become 0. A column that cannot be converted at all is zeroed out.
    for col in X.columns:
        if X[col].dtype == 'object':
            try:
                X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)
            except (TypeError, ValueError):
                X[col] = 0

    print(f"📈 Feature matrix shape: {X.shape}")
    print(f"🎯 Class distribution: {y.value_counts().to_dict()}")

    # 2. Split data (25% held out, stratified on the label)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, stratify=y, random_state=42
        )
        print("✅ Data split completed")
    except Exception as e:
        print(f"❌ Data splitting failed: {e}")
        return False

    # 3. Scale features (statistics come from the training split only)
    try:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("✅ Feature scaling completed")
    except Exception as e:
        print(f"❌ Feature scaling failed: {e}")
        return False

    # 4. Handle class imbalance with SMOTE (training split only)
    try:
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
        print(f"✅ SMOTE applied - New shape: {X_resampled.shape}")
    except Exception as e:
        print(f"❌ SMOTE failed: {e}")
        return False

    # 5. Train model with grid search (2 x 2 x 2 candidates, 3 folds, F1)
    try:
        param_grid = {
            'max_depth': [3, 6],
            'learning_rate': [0.1, 0.2],
            'n_estimators': [100, 200]
        }

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

        xgb_model = XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            n_jobs=1
        )

        grid = GridSearchCV(
            xgb_model,
            param_grid,
            scoring='f1',
            cv=cv,
            verbose=1,
            n_jobs=1
        )

        print("🔧 Training model with grid search...")
        grid.fit(X_resampled, y_resampled)
        best_model = grid.best_estimator_
        print(f"✅ Model training completed. Best params: {grid.best_params_}")

    except Exception as e:
        print(f"❌ Model training failed: {e}")
        return False

    # 6. Evaluate model. A failure here is reported but deliberately does not
    # abort the run, so the trained model is still saved.
    try:
        y_pred = best_model.predict(X_test_scaled)
        print("\n📊 Model Evaluation:")
        print(classification_report(y_test, y_pred, digits=3))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
    except Exception as e:
        print(f"⚠️ Evaluation failed: {e}")

    # 7. Save all components
    try:
        os.makedirs(save_dir, exist_ok=True)

        # Save model using XGBoost native format (most reliable)
        model_path = os.path.join(save_dir, 'detect_malware_small.json')
        best_model.save_model(model_path)
        print(f"✅ Model saved: {model_path}")

        # Save scaler
        scaler_path = os.path.join(save_dir, 'scaler_small.pkl')
        joblib.dump(scaler, scaler_path)
        print(f"✅ Scaler saved: {scaler_path}")

        # Save feature names, one per line, in training column order
        feature_names_path = os.path.join(save_dir, 'feature_names_small.txt')
        with open(feature_names_path, 'w') as f:
            for feature in X.columns:
                f.write(f"{feature}\n")
        print(f"✅ Feature names saved: {feature_names_path}")

        # Save metadata
        metadata_path = os.path.join(save_dir, 'model_metadata.txt')
        with open(metadata_path, 'w') as f:
            f.write("Model Type: XGBoost\n")
            f.write(f"Features: {len(X.columns)}\n")
            f.write(f"Training Samples: {len(X_train)}\n")
            f.write(f"Test Samples: {len(X_test)}\n")
            f.write(f"Best Parameters: {grid.best_params_}\n")

        print("🎉 All components saved successfully!")
        return True

    except Exception as e:
        print(f"❌ Failed to save components: {e}")
        return False

if __name__ == "__main__":
    # Run the whole pipeline once and report the overall outcome.
    success = train_malware_detection_model()
    outcome = (
        "\n✅ Training pipeline completed successfully!"
        if success
        else "\n❌ Training pipeline failed!"
    )
    print(outcome)


ImportError: cannot import name '_to_object_array'

In [None]:
!pip install --upgrade numpy scikit-learn


In [11]:
import pandas as pd
import joblib
import xgboost as xgb
import re
import os

def load_feature_names(path):
    """Read feature names from a text file, one name per line.

    Surrounding whitespace is stripped and blank lines are skipped. Skipping
    matters: an empty name would become an empty-string feature, which shifts
    the feature count and, when turned into a whole-word regex, matches almost
    every log line.

    Args:
        path (str): Path to the feature-name file.

    Returns:
        list of str: The non-empty names in file order.
    """
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]

def extract_features_from_log(log_file_path, feature_names):
    """
    Turn a raw log file into a one-row table of per-feature line counts.

    For every feature name, the result holds the number of log lines on which
    that name appears as a whole word (regex word boundaries on both sides).
    A line is counted at most once per feature, however often the name occurs
    on it. The file is read as UTF-8 with undecodable bytes ignored.

    Args:
        log_file_path (str): Path to the raw log file.
        feature_names (list of str): Names to look for (e.g. 'onCreate', 'onPause').

    Returns:
        pd.DataFrame: One row; columns are `feature_names`, values are the line counts.
    """
    # Start every tally at zero; the dict also fixes the set of names to scan for.
    line_hits = dict.fromkeys(feature_names, 0)

    # One pre-compiled whole-word matcher per distinct name.
    matchers = [
        (name, re.compile(r'\b{}\b'.format(re.escape(name))))
        for name in line_hits
    ]

    with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as log:
        for entry in log:
            for name, matcher in matchers:
                if matcher.search(entry) is not None:
                    line_hits[name] += 1

    return pd.DataFrame([line_hits], columns=feature_names)

def test_model_with_log(log_file_path):
    # Paths to saved artifacts - update paths accordingly
    model_path = '/kaggle/input/detect/scikitlearn/default/1/detect_malware_1.pkl'
    scaler_path = '/kaggle/input/scaler/scikitlearn/default/1/scaler_1.pkl'
    feature_names_path = '/kaggle/input/feature-names/feature_names.txt'
    
    # Check files exist
    for path in [model_path, scaler_path, feature_names_path]:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Required file not found: {path}")
    
    # Load feature names
    feature_names = load_feature_names(feature_names_path)
    print(f"Loaded {len(feature_names)} feature names.")
    
    # Extract features from log file
    X_new = extract_features_from_log(log_file_path, feature_names)
    print(f"Extracted features from log file with shape: {X_new.shape}")
    
    # Handle any non-numeric columns (unlikely but safe)
    for col in X_new.columns:
        if X_new[col].dtype == 'object':
            X_new[col] = pd.to_numeric(X_new[col], errors='coerce').fillna(0)
    
    # Load scaler and transform
    scaler = joblib.load(scaler_path)
    X_new_scaled = scaler.transform(X_new)
    
    # Load trained XGBoost model and predict
    model = xgb.XGBClassifier()
    model.load_model(model_path)
    
    prediction = model.predict(X_new_scaled)
    prediction_proba = model.predict_proba(X_new_scaled)
    
    return prediction[0], prediction_proba[0]

if __name__ == "__main__":
    # Example usage - point this at the log file to classify
    log_path = '/kaggle/input/security-report/security_report.log'

    # Any failure is reported as a one-line message instead of a traceback.
    try:
        pred, proba = test_model_with_log(log_path)
        for report_line in (f"Prediction: {pred}", f"Class probabilities: {proba}"):
            print(report_line)
    except Exception as e:
        print(f"Error during testing: {e}")


Loaded 177 feature names.
Extracted features from log file with shape: (1, 177)
Error during testing: cannot import name '_incremental_weighted_mean_and_var'


In [None]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Step 1: Load the trained model and prepare a scaler
model = joblib.load('/kaggle/input/malware_detection/scikitlearn/default/1/detect_malware.pkl')
src_txt_path = '/kaggle/input/malicious80/src.txt'

# Loading the scaler saved at training time is the recommended route:
# scaler = joblib.load('scaler.pkl')
# For now a fresh scaler is fitted on every row of the benchmark CSV instead.
# NOTE(review): despite its name, X_train here is the whole table, so these
# statistics may differ from the scaler the model was trained with.
df = pd.read_csv('/kaggle/input/dataw2/benchmark_ml_features.csv')
X_train = df.drop(["App", "Label"], axis=1)
scaler = StandardScaler()
scaler.fit(X_train)

# Step 2: Parse src.txt to count API calls
def parse_src_txt(file_path):
    """Count how often each non-blank line occurs in a text file.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored.

    Args:
        file_path (str): File with one API name per line.

    Returns:
        dict: Stripped line text -> number of occurrences, in first-seen order.
    """
    tally = {}
    with open(file_path, "r") as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if not name:
                continue
            tally[name] = tally.get(name, 0) + 1
    return tally

api_count = parse_src_txt(src_txt_path)

# Step 3: Build one input row with the training columns, in training order.
# Columns whose name never occurs in src.txt get a count of 0.
feature_names = list(X_train.columns)
app_features = []
for column_name in feature_names:
    app_features.append(api_count.get(column_name, 0))
X_input = pd.DataFrame([app_features], columns=feature_names)

# Step 4: Apply the scaler fitted above
X_input_scaled = scaler.transform(X_input)

# Step 5: Predicted class and probability of the positive (malicious) class
pred_class = model.predict(X_input_scaled)[0]
pred_proba = model.predict_proba(X_input_scaled)[0][1]

# Optional: decide with an explicit threshold on the malicious probability
custom_threshold = 0.5
if pred_proba >= custom_threshold:
    final_prediction = "Malicious"
else:
    final_prediction = "Benign"

# Output
print("Prediction:", final_prediction)
print("Confidence (Malicious):", f"{pred_proba:.4f}")


In [None]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

# === CONFIGURATION ===
# SRC_FILE_PATH: src.txt of the app to classify — edit this per app.
# MODEL_PATH:    classifier loaded with joblib further down.
# TRAIN_CSV:     training table; supplies the feature column names and the
#                data the scaler is fitted on.
SRC_FILE_PATH = "/kaggle/input/malicious80/src.txt"  # 🔁 Replace with the src.txt to classify
MODEL_PATH = "/kaggle/input/malware_detection/scikitlearn/default/1/detect_malware.pkl"
TRAIN_CSV = "/kaggle/input/dataw2/benchmark_ml_features.csv"

# === 1. Load and Parse src.txt ===
def parse_src_txt_tab_sep(file_path):
    """Count the values in the first column of a tab-separated, header-less file.

    Args:
        file_path (str): Path to the tab-separated src.txt.

    Returns:
        dict: First-column value -> number of rows carrying that value.
    """
    first_column = pd.read_csv(file_path, sep="\t", header=None)[0]
    return first_column.value_counts().to_dict()

api_counts = parse_src_txt_tab_sep(SRC_FILE_PATH)

# === 2. Align with Training Features ===
# Every training column except the identifier and the label is a feature;
# names that never occur in src.txt get a count of 0.
train_df = pd.read_csv(TRAIN_CSV)
non_feature_columns = ["App", "Label"]
feature_columns = [col for col in train_df.columns if col not in non_feature_columns]
app_features = []
for column_name in feature_columns:
    app_features.append(api_counts.get(column_name, 0))
X_input = pd.DataFrame([app_features], columns=feature_columns)

# === 3. Scale ===
# NOTE(review): the scaler is refitted on all rows of the training CSV rather
# than loaded from training, so its statistics may differ from the ones the
# model was trained with.
scaler = StandardScaler()
scaler.fit(train_df[feature_columns])
X_scaled = scaler.transform(X_input)

# === 4. Load Model and Predict ===
model = joblib.load(MODEL_PATH)
pred = model.predict(X_scaled)[0]
proba = model.predict_proba(X_scaled)[0][1]

# === 5. Output ===
# The verdict uses a fixed 0.5 cut-off on the malicious-class probability.
verdict = 'Malicious' if proba >= 0.5 else 'Benign'
print("\n--- 🔍 Prediction Result ---")
print(f"Prediction     : {verdict}")
print(f"Confidence     : {proba:.4f}")


In [None]:
from datetime import datetime

# Stamp the notebook with the local wall-clock time of this run.
run_stamp = datetime.now()
print("last update: {}".format(run_stamp))