<a href="https://colab.research.google.com/github/ShirishaReddyV/AI-Sandbox/blob/main/Android_malware_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
from scipy.stats import zscore
import numpy as np

# Step 1: Download dataset
path = kagglehub.dataset_download("subhajournal/android-malware-detection")
print("Path to dataset files:", path)

# Step 2: Load dataset (assuming CSV format, adjust if it's XLSX or others)
import os

# Pick the first CSV file in the download directory.
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the selection reproducible across runs and machines. The extension match is
# case-insensitive (".CSV" is accepted) and directories are skipped.
csv_candidates = [
    os.path.join(path, name)
    for name in sorted(os.listdir(path))
    if name.lower().endswith(".csv") and os.path.isfile(os.path.join(path, name))
]

if not csv_candidates:
    raise FileNotFoundError("No CSV file found in the downloaded dataset.")

csv_file = csv_candidates[0]

df = pd.read_csv(csv_file)
print("Dataset shape:", df.shape)

# Absolute Z-score above which a value is flagged as an outlier.
Z_THRESHOLD = 2.5

# Step 3: Select only numerical columns
numeric_cols = df.select_dtypes(include=np.number).columns
df_numeric = df[numeric_cols]

# Step 4: Calculate Z-scores (column-wise)
# nan_policy="omit" computes each column's mean/std from its non-missing
# values. With scipy's default ("propagate"), a single NaN turns the whole
# column's Z-scores into NaN, and the column would silently report zero
# outliers. Missing cells still yield NaN Z-scores and are never flagged.
z_scores = df_numeric.apply(zscore, nan_policy="omit")

# Step 5: Detect outliers
# |z| > threshold is equivalent to (z > threshold) | (z < -threshold);
# NaN comparisons evaluate to False.
outliers = z_scores.abs() > Z_THRESHOLD

# Summary of outliers per column, most affected columns first
outlier_summary = outliers.sum().sort_values(ascending=False)

print(f"\nNumber of outliers per numerical column (Z-score > {Z_THRESHOLD} or < -{Z_THRESHOLD}):")
print(outlier_summary)

# Optional: Show rows with any outliers
# A row is kept when at least one of its numerical columns is flagged.
df_outliers = df[outliers.any(axis=1)]
print("\nRows with any outliers:")
print(df_outliers.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (16, 6)

# Top 5 numerical columns with most outliers (fewer if the dataset has
# fewer than 5 numerical columns)
top_outlier_cols = outlier_summary.head(5).index

# 1. Side-by-side boxplots
if len(top_outlier_cols) > 0:
    # squeeze=False always returns a 2-D array of Axes; without it a single
    # column would yield a bare Axes object and indexing it would fail.
    fig, axes_grid = plt.subplots(
        1, len(top_outlier_cols), figsize=(20, 6), squeeze=False
    )
    axes = axes_grid[0]  # 1-D row of Axes, one per plotted column

    for ax, col in zip(axes, top_outlier_cols):
        sns.boxplot(data=df, x=col, ax=ax)
        ax.set_title(f'Boxplot: {col}', fontsize=12)
        ax.tick_params(axis='x', rotation=45)

    # Report the number of columns actually plotted rather than assuming 5.
    plt.suptitle(
        f'Top {len(top_outlier_cols)} Numerical Columns with Outliers (Boxplots)',
        fontsize=16,
    )
    # Leave headroom at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# 2. Scatter plot for first 2 columns with most outliers
if len(top_outlier_cols) >= 2:
    x_col, y_col = top_outlier_cols[:2]

    # Plot only the non-flagged rows as the baseline so the 'Normal' label is
    # accurate; previously every row (outliers included) was drawn under it.
    df_normal = df.loc[~df.index.isin(df_outliers.index)]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df_normal, x=x_col, y=y_col, label='Normal', alpha=0.5)
    # NOTE: df_outliers holds rows flagged in ANY numerical column, so a red
    # point is not necessarily extreme in x_col or y_col specifically.
    sns.scatterplot(data=df_outliers, x=x_col, y=y_col, color='red', label='Outliers', alpha=0.7)
    plt.title(f'Scatter Plot Highlighting Outliers: {x_col} vs {y_col}', fontsize=14)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend()
    plt.tight_layout()
    plt.show()
