In [106]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [107]:
n = 1000

In [108]:
df_codex = pd.read_csv("data/security_smells_AWS_Codex.csv")
df_codex["model"] = "codex"
print(len(df_codex))
df_codex = df_codex.sample(n=n, replace=True)
df_gpt_2 = pd.read_csv("data/security_smells_AWS_CodeParrot.csv")
df_gpt_2["model"] = "gpt-2"
df_gpt_2 = df_gpt_2.sample(n=n, replace=True)
df_parrot = pd.read_csv("data/security_smells_AWS_CodeParrot.csv")
df_parrot["model"] = "code_parrot"
df_parrot = df_parrot.sample(n=n, replace=True)
df_github = pd.read_csv("data/security_smells_github.csv")
print(len(df_github))
df_github["model"] = "github"
df_github = df_github.sample(n=n, replace=True)
df_chat_gpt = pd.read_csv("data/security_smells_AWS_ChatGPT.csv")
df_chat_gpt["model"] = "chat_gpt"
df_chat_gpt = df_chat_gpt.sample(n=n, replace=True)
provider = "AWS"
model = "Codex"

2186
3793


In [109]:
print(f"Codex {len(df_codex)} \n Code Parrot {len(df_parrot)} \n GPT-2 {len(df_gpt_2)} \n GitHub {len(df_github)}")

Codex 1000 
 Code Parrot 1000 
 GPT-2 1000 
 GitHub 1000


In [110]:
number_files = df_codex["file"].nunique()

In [111]:
len(df_codex[df_codex["severity"].str.contains("CRITICAL")])/number_files

0.4847715736040609

In [112]:
len(df_codex[df_codex["severity"].str.contains("HIGH")])/number_files

1.0507614213197969

In [113]:
df = pd.concat([df_codex, df_github, df_parrot, df_gpt_2])

In [114]:
fig_codex = px.histogram(
    df_codex,
    x="severity",
    category_orders=dict(severity=["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
    title=f"Security Severity Distribution for {provider} using {model}",
    color="model"
)

fig_github = px.histogram(
    df_github,
    x="severity",
    category_orders=dict(severity=["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
    title=f"Security Severity Distribution for {provider} using {model}",
    color_discrete_sequence=["rgba(253,184,30,1)"],
    color=px.Constant("GitHub")
)

fig_parrot = px.histogram(
    df_parrot,
    x="severity",
    category_orders=dict(severity=["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
    title=f"Security Severity Distribution for {provider} using {model}",
    color_discrete_sequence=["rgba(113,184,30,1)"],
    color=px.Constant("Parrot")
)

fig_gpt = px.histogram(
    df_gpt_2,
    x="severity",
    category_orders=dict(severity=["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
    title=f"Security Severity Distribution for {provider} using {model}",
    color_discrete_sequence=["rgba(253,184,130,1)"],
    color=px.Constant("GPT-2")
)

fig_chat_gpt = px.histogram(
    df_chat_gpt,
    x="severity",
    category_orders=dict(severity=["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
    title=f"Security Severity Distribution for {provider} using {model}",
    color_discrete_sequence=["rgba(243,14,30,1)"],
    color=px.Constant("ChatGPT")
)



fig = go.Figure(data = fig_codex.data + fig_github.data +  fig_parrot.data + fig_gpt.data + fig_chat_gpt.data)
fig.show()

In [115]:
fig.write_image("media/severity_distribution_comparison.png")

In [118]:
percentage_files_affected_df = pd.DataFrame(columns=["model", "percentage"])
percentage_files_affected_df = percentage_files_affected_df.append({"model": "GitHub", "percentage": 0.29}, ignore_index=True)
percentage_files_affected_df = percentage_files_affected_df.append({"model": "ChatGPT", "percentage":0.7833}, ignore_index=True)
percentage_files_affected_df = percentage_files_affected_df.append({"model": "Codex", "percentage":0.8915}, ignore_index=True)
percentage_files_affected_df = percentage_files_affected_df.append({"model": "CodeParrot", "percentage":0.8974}, ignore_index=True)
percentage_files_affected_df = percentage_files_affected_df.append({"model": "GPT-2", "percentage":1}, ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [119]:
fig = px.bar(percentage_files_affected_df, x="model", y="percentage")
fig.update_xaxes(title="Model")
fig.update_yaxes(title="Percentage of files >= 1 security issue")
fig.show()

In [120]:
fig.write_image("media/percentage_files_affected.png")
fig.write_image("media/percentage_files_affected.svg")