In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.io import output_notebook, curdoc
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import VBar, FactorRange
from bokeh.themes import built_in_themes
from pandas_profiling import ProfileReport
from bokeh.transform import dodge

# Send Bokeh output to the notebook so that later `show(...)` calls render inline.
output_notebook()
# Select the theme for the current Bokeh document by name
# (presumably one of bokeh's built-in themes -- `built_in_themes` is imported above but not used here).
curdoc().theme = 'dark_minimal'


In [None]:
def plot_hist(df: pd.DataFrame, column: str, plot: figure, colors=("SteelBlue", "Tan")) -> None:
    """Draw a histogram of ``df[column]`` onto an existing Bokeh figure.

    The values are binned with ``np.histogram`` (its default bin count) and
    one rectangle per bin is added to ``plot``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data.
    column : str
        Name of the numeric column to bin. It is also used as the name of the
        count column in the glyph's data source.
    plot : figure
        Bokeh figure that receives the glyphs; it is modified in place.
    colors : sequence of two color names
        ``colors[0]`` is the normal fill color, ``colors[1]`` the fill color
        while hovered.

    Returns
    -------
    None
    """
    # np.histogram cannot derive a finite range when NaNs are present, so drop them.
    values = df[column].dropna()
    hist, edges = np.histogram(values)
    hist_df = pd.DataFrame({column: hist,
                            "left": edges[:-1],
                            "right": edges[1:]})
    # Human-readable bin label carried in the data source; no glyph property
    # in this function reads it.
    hist_df["interval"] = ["%d to %d" % (left, right)
                           for left, right in zip(hist_df["left"], hist_df["right"])]
    src = ColumnDataSource(hist_df)
    # Each bin is a rectangle spanning [left, right]. A vbar without x/width
    # has no horizontal position, so quad is used with the computed edges.
    plot.quad(left="left", right="right", bottom=0, top=column, source=src,
              fill_color=colors[0], line_color="black", fill_alpha=0.5,
              hover_fill_alpha=1.0, hover_fill_color=colors[1])

In [None]:
# Load the sample CSV (path is relative to the notebook's working directory).
infected_df = pd.read_csv("./trainSample.csv")
def fill_nan_with(df: pd.DataFrame, column: str, value) -> None:
    """Replace the missing entries of ``df[column]`` with ``value``, in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    column : str
        Name of the column whose NaN/None entries are replaced.
    value
        Replacement written into every missing position.
    """
    # One .loc write on the frame itself. The chained form
    # `df[column].loc[mask] = value` assigns into a temporary Series, which
    # triggers SettingWithCopyWarning and is a no-op under copy-on-write.
    df.loc[df[column].isna(), column] = value
# Missing values in these three columns are replaced with 0.
for column in ("AVProductsInstalled", "IsProtected", "AVProductsEnabled"):
    fill_nan_with(df=infected_df, column=column, value=0)

In [None]:
# Machine identifier and detection flag.
default_columns = ["MachineIdentifier", "HasDetections"]

# Column subset under study (what "red" stands for is not visible in this notebook).
# Entries are only regrouped visually by name; the list order is unchanged.
red_columns = [
    # *Version columns
    "EngineVersion", "AppVersion", "AvSigVersion",
    # AVProduct* columns and HasTpm
    "AVProductStatesIdentifier", "AVProductsInstalled", "AVProductsEnabled",
    "HasTpm",
    # *Identifier columns
    "CountryIdentifier", "CityIdentifier", "OrganizationIdentifier",
    "GeoNameIdentifier", "LocaleEnglishNameIdentifier",
    # Platform / Processor / Os* columns
    "Platform", "Processor",
    "OsVer", "OsBuild", "OsSuite", "OsPlatformSubRelease", "OsBuildLab",
    "SkuEdition",
    # remaining columns
    "IsProtected", "SMode", "IeVerIdentifier", "SmartScreen", "Firewall",
]

In [None]:
# Summary statistics restricted to the object-dtype columns.
# `test` is not referenced by any other cell visible in this notebook.
test = infected_df.describe(include=["object"])

In [None]:
# Boolean mask over the rows of infected_df: True where HasDetections equals 1.
is_infected = infected_df["HasDetections"].eq(1)

In [None]:
# First attempt at a numeric engine version: delete the dots and parse the rest as an int.
# regex=False makes the "." a literal dot on every pandas version; as a regular
# expression "." would match any character.
# NOTE(review): components are not padded, so distinct versions can collapse to the
# same number; a later cell overwrites this column using VersionToNum.
infected_df["EngineVersionNumberfied"] = (
    infected_df["EngineVersion"].str.replace(".", "", regex=False).astype(int)
)

In [None]:
#profiling = ProfileReport(infected_df[default_columns + red_columns])
#profiling.to_file(output_file="report.html")


In [None]:
def vbar(df: pd.DataFrame, column: str, fill_color: str, dodge_distance: float, x_range: FactorRange, width: float=0.25, fill_alpha: float=0.5) -> "tuple[ColumnDataSource, VBar]":
    """Build the data source and bar glyph for a per-category count plot.

    Counts how often each value of ``df[column]`` occurs (values are compared
    by their string form, so missing values show up as their own label) and
    returns what ``figure.add_glyph`` needs to draw one bar per category.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to count.
    column : str
        Column whose values are the categories.
    fill_color : str
        Fill color of the bars.
    dodge_distance : float
        Horizontal offset applied to every bar through ``dodge``, so several
        series can sit side by side on the same factor.
    x_range : FactorRange
        Categorical range of the target figure, passed to ``dodge``.
    width : float
        Bar width.
    fill_alpha : float
        Bar fill opacity.

    Returns
    -------
    (ColumnDataSource, VBar)
        Source with columns ``"x"`` (category labels) and ``"y"`` (counts),
        and the glyph that reads from it.
    """
    # Labels and counts come from the SAME value_counts result, so x[i] always
    # belongs to y[i]. Building x from a set() gave an arbitrary order that did
    # not line up with the index-sorted counts.
    counts = df[column].astype(str).value_counts().sort_index()
    source = ColumnDataSource({"x": counts.index.tolist(), "y": counts.tolist()})
    glyph = VBar(x=dodge("x", dodge_distance, x_range), top="y",
                 fill_color=fill_color, width=width, fill_alpha=fill_alpha)
    return source, glyph

In [None]:
# Grouped bar chart of IsProtected counts: green = no detection, red = detection.
# The labels are sorted AFTER de-duplication so the axis order is the same on
# every run (sorting before set() was discarded by the set).
p = figure(x_range=FactorRange(factors=sorted(set(infected_df["IsProtected"].astype(str)))))
# NOTE(review): despite its name, this mask is True where IsProtected == 0 -- confirm
# the intent. It is not used by the plot; the chi-square cell reads it.
is_protected = infected_df["IsProtected"] == 0
p.add_glyph(*vbar(infected_df[~is_infected], "IsProtected", "green", -0.13, p.x_range))
p.add_glyph(*vbar(infected_df[is_infected], "IsProtected", "red", 0.13, p.x_range))
show(p)

In [None]:
from scipy.stats import chisquare
# Chi-square goodness-of-fit test on the rows selected by `is_protected`:
# observed = rows without / with a detection, expected = an even 50/50 split.
# Both masks are combined into ONE boolean indexer. Chaining df[mask_a][mask_b]
# makes pandas reindex the second mask (with a warning) and build throwaway copies.
chisquare(
    [infected_df.loc[~is_infected & is_protected, "IsProtected"].count(),
     infected_df.loc[is_infected & is_protected, "IsProtected"].count()],
    f_exp=[is_protected.sum() * 0.5, is_protected.sum() * 0.5],
)

In [None]:
# Grouped bar chart of EngineVersion counts: green = no detection, red = detection.
# sorted(...) gives the x axis a stable (lexicographic) order; a bare set()
# yields a different label order from one kernel session to the next.
p = figure(plot_width=1400, x_range=FactorRange(factors=sorted(set(infected_df["EngineVersion"].astype(str)))))
p.add_glyph(*vbar(infected_df[~is_infected], "EngineVersion", "green", -0.13, p.x_range))
p.add_glyph(*vbar(infected_df[is_infected], "EngineVersion", "red", 0.13, p.x_range))
show(p)

In [None]:
# Length of the longest dot-separated component found in any EngineVersion value.
infected_df["EngineVersion"].map(
    lambda version: max(map(len, version.split(".")))
).max()

In [None]:
from typing import List

def add_digits_from_list(version_list: List[str], add_digits: int) -> int:
    """Fuse version components into one integer.

    Every string in ``version_list`` is left-padded with zeros to at least
    ``add_digits`` characters (longer strings are kept as they are), the
    padded pieces are concatenated in order, and the result is parsed as an
    ``int``.
    """
    padded_parts = (part.zfill(add_digits) for part in version_list)
    digits = ''.join(padded_parts)
    return int(digits)
# Preview only (the result is not stored): split each EngineVersion on "." and
# fuse the components with a minimum width of 2 digits each.
infected_df["EngineVersion"].map(
    lambda version: add_digits_from_list(version.split("."), add_digits=2)
)

In [None]:
"23432434".zfill(2)

In [None]:
from transformation_utils.transformers import VersionToNum
# VersionToNum is a project-local transformer configured here with the column name.
# NOTE(review): presumably it converts the version string to a number -- confirm in
# transformation_utils.transformers.
numberfyer = VersionToNum("EngineVersion")
# Overwrites the EngineVersionNumberfied column assigned in an earlier cell.
infected_df["EngineVersionNumberfied"] = numberfyer.fit_transform(infected_df)

In [None]:
# Grouped bar chart of EngineVersionNumberfied counts: green = no detection,
# red = detection. Factors are the unique values in ascending order, as strings.
p = figure(
    plot_width=1400,
    x_range=FactorRange(
        factors=infected_df["EngineVersionNumberfied"].sort_values().unique().astype(str)
    ),
)
p.add_glyph(*vbar(df=infected_df[~is_infected], column="EngineVersionNumberfied",
                  fill_color="green", dodge_distance=-0.13, x_range=p.x_range))
p.add_glyph(*vbar(df=infected_df[is_infected], column="EngineVersionNumberfied",
                  fill_color="red", dodge_distance=0.13, x_range=p.x_range))
show(p)