In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset that the later cells read as `df`. The path is relative, so
# it is resolved against the kernel's current working directory.
df = pd.read_csv("../../base_df.csv")

In [None]:
# Dataset-wide summary statistics.

# Projects: how many distinct ones there are, followed by their names.
unique_projects = df[["project name"]].drop_duplicates()
print("Number of projects:", len(unique_projects))
print("Project names:\n", "\n".join(row[0] for row in unique_projects.values), sep="")

# A commit is identified by (project, hash); a cache tuple additionally
# includes the Coq version column.
commit_key = ["project name", "commit hash"]
cache_key = commit_key + ["coq version"]
print("Number of unique commits:", len(df[commit_key].drop_duplicates()))
print("Number of unique cache tuples:", len(df[cache_key].drop_duplicates()))

# Executed proofs: distinct (cache tuple, file, command index, proof index)
# combinations among the rows whose proof index is nonzero.
df1 = df[cache_key + ["filename", "command index", "proof index"]]
df1_proofs_only = df1.loc[df1["proof index"] != 0]
print("Number of executed proofs:", len(df1_proofs_only.drop_duplicates()))

# Every row of df is counted as one successfully executed sentence.
print("Number of successfully executed sentences:", len(df))

# Compiled files: distinct (cache tuple, filename) combinations.
df3 = df[cache_key + ["filename"]]
print("Number of compiled files:", len(df3.drop_duplicates()))

# Per-row total over the four goal categories. The new column is added to df
# itself because the histogram cell reads it.
df["total goal count"] = (
    df["foreground goal count"]
    + df["background goal count"]
    + df["abandoned goal count"]
    + df["shelved goal count"]
)
print("Maximum total open goal count:", df["total goal count"].max())

# Same per-row total for the four hypothesis categories (also read by the
# histogram cell).
df["total hypothesis count"] = (
    df["foreground hypothesis count"]
    + df["background hypothesis count"]
    + df["abandoned hypothesis count"]
    + df["shelved hypothesis count"]
)

In [None]:
# Histograms of open goals and of hypotheses, restricted to proof sentences.
# Proof sentences are the rows with a nonzero proof index; later cells reuse
# df_proofs_only.
df_proofs_only = df[df["proof index"] != 0]

# Open goals: only rows with at least one goal, one bar per integer count.
goal_rows = df_proofs_only[df_proofs_only["total goal count"] > 0]
fig, ax = plt.subplots()
sns.histplot(data=goal_rows, x="total goal count", stat="proportion", discrete=True, ax=ax)
ax.set_xticks(range(0, 20, 2))
ax.set_xlabel("Number of open goals")
# ax.set_ylabel("Proportion of occurrences")

# Hypotheses: only rows with at least one hypothesis, bins of width 10 over
# the range 0..200.
hypothesis_rows = df_proofs_only[df_proofs_only["total hypothesis count"] > 0]
fig, ax = plt.subplots()
sns.histplot(
    data=hypothesis_rows,
    x="total hypothesis count",
    stat="proportion",
    discrete=False,
    binrange=[0, 200],
    binwidth=10,
    ax=ax,
)
# ax.set_xticks(range(0, 201, 25))
# ax.set_xlim(-25, 225)
ax.set_xlabel("Number of hypotheses")
# ax.set_ylabel("Number of occurrences")

In [None]:
print("Sentence counts. The count column here also shows the number of cache tuples per project.")
# value_counts yields one entry per cache tuple (project, commit, Coq version)
# holding its number of rows (sentences); describe() then summarises those
# per project. "coq version" must be part of the key: without it, rows from
# different Coq versions of the same commit are merged and the count column
# reports commits rather than cache tuples, contradicting the caption above.
(
    df.value_counts(subset=["project name", "commit hash", "coq version"])
    .groupby("project name")
    .describe()
)
# The histograms below look bad: not enough unique bins. Better to use the table above.
# project_names = list(df["project name"].unique())
# for project_name in project_names:
#     plt.figure()
#     df[df["project name"] == project_name].value_counts(subset=["project name", "commit hash", "coq version"]).groupby("project name").hist()
# TODO: Box and whisker plot, maybe

In [None]:
# Disabled experiment: only the notice below is executed. The box-and-whisker
# plotting code is kept commented out for reference.
print("Ignore: Box and whisker plots for sentence counts over commits and coq versions")
# boxplot_series = df.value_counts(subset=["project name", "commit hash"])
# boxplot_crosstab = pd.crosstab(df["commit hash"], df["project name"])
# print(boxplot_crosstab.melt())
# sns.boxplot(data=boxplot_crosstab.melt(), x="project name", y="value", showfliers=False)
# plt.xticks(rotation=45)
# plt.ylabel("Sentence count")

In [None]:
print("Proof counts. Projects coq-http and coq-simple-io don't have any proofs in the successfully extracted commits.")
# Keep rows with a positive proof index whose proof step index is 0
# (presumably one row per proof -- confirm), count them per cache tuple, and
# summarise those counts per project.
(
    df.loc[(df["proof index"] > 0) & (df["proof step index"] == 0)]
    .value_counts(subset=["project name", "commit hash", "coq version"])
    .groupby("project name")
    .describe()
)

In [None]:
print("Files per project.")
# Reduce to distinct (project, commit, filename) rows, count the files of each
# (project, commit) pair, then summarise those counts per project.
(
    df[["project name", "commit hash", "filename"]]
    .drop_duplicates()
    .value_counts(subset=["project name", "commit hash"])
    .groupby("project name")
    .describe()
)

In [None]:
print("Command type occurrences.")
# Tally rows per command type and print the complete table; the chart only
# shows the 15 largest tallies.
barplot_series = df.value_counts(subset="command type")
print(barplot_series)
series_largest = barplot_series.nlargest(n=15)
# Horizontal bars: command types on the y axis, their counts on the x axis.
bar_ax = sns.barplot(x=series_largest.values, y=series_largest.index)
bar_ax.set_xlabel("Counts")
bar_ax.set_ylabel("Sentence Type")

In [None]:
# Disabled cell: only the notice below is executed; the per-project breakdown
# is kept commented out.
print("Ignore: Command type occurrences grouped by project")
# If we really want to show all this, save it as a CSV file. The output of describe() is itself a dataframe.
# df.value_counts(subset=["project name", "commit hash", "coq version", "command type"]).groupby(["project name", "command type"]).describe()

In [None]:
print("Command type occurrences, proof sentences only")
# Same tally as for the whole dataset, restricted to df_proofs_only (rows
# with a nonzero proof index). Every command type is charted here, with no
# top-N cut.
barplot_series = df_proofs_only.value_counts(subset="command type")
print(barplot_series)
# Horizontal bars: command types on the y axis, their counts on the x axis.
bar_ax = sns.barplot(x=barplot_series.values, y=barplot_series.index)
bar_ax.set_xlabel("Counts")
bar_ax.set_ylabel("Sentence Type")

In [None]:
print("Command type occurrences, proof sentences only, group by project.")
# Count proof-sentence rows per (cache tuple, command type), then summarise
# those counts for every (project, command type) pair.
(
    df_proofs_only
    .value_counts(subset=["project name", "commit hash", "coq version", "command type"])
    .groupby(["project name", "command type"])
    .describe()
)

In [None]:
print("Command type occurrences, non-proof sentences only")
# Rows with proof index 0 are the complement of df_proofs_only. Print the
# complete tally; the chart only shows the 15 largest tallies.
non_proof_rows = df.loc[df["proof index"].eq(0)]
barplot_series = non_proof_rows.value_counts(subset="command type")
print(barplot_series)
series_largest = barplot_series.nlargest(n=15)
# Horizontal bars: command types on the y axis, their counts on the x axis.
bar_ax = sns.barplot(x=series_largest.values, y=series_largest.index)
bar_ax.set_xlabel("Counts")
bar_ax.set_ylabel("Sentence Type")

In [None]:
# Goal Histogram bonanza
# project_names = list(df["project name"].unique())
# for project_name in project_names:
#     fig = plt.figure()
#     sns.histplot(data=df_proofs_only[df_proofs_only["total goal count"] > 0].loc[df_proofs_only["project name"] == project_name], x="total goal count", stat="proportion", discrete=True)
#     ax = fig.axes[0]
#     ax.set_xticks(range(0, 20, 2))
#     ax.set_xlabel("Number of open goals")
#     ax.set_title(project_name)

In [None]:
# Hypothesis Histogram bonanza
# project_names = list(df["project name"].unique())
# for project_name in project_names:
#     fig = plt.figure()
#     sns.histplot(data=df_proofs_only[df_proofs_only["total hypothesis count"] > 0].loc[df_proofs_only["project name"] == project_name], x="total hypothesis count", stat="proportion", discrete=True, binrange=[0, 200])
#     ax = fig.axes[0]
#     ax.set_xlabel("Number of open hypotheses")
#     ax.set_title(project_name)

In [None]:
print("Sum over maximum unique proofs per project")
# A proof is identified by cache tuple + filename + command index + proof
# index, the same key the "Number of executed proofs" statistic uses.
# "filename" has to be part of the key: command indices are presumably
# file-local (confirm), so without it proofs from different files that share
# a (command index, proof index) pair would be merged by drop_duplicates and
# undercounted. If command indices are unique across files, including it is
# a no-op.
per_project_max_unique_proofs = (
    df_proofs_only[
        ["project name", "commit hash", "coq version", "filename", "command index", "proof index"]
    ]
    .drop_duplicates()
    # Number of distinct proofs in each cache tuple.
    .value_counts(subset=["project name", "commit hash", "coq version"])
)
# print(per_project_max_unique_proofs)
# Take each project's largest per-cache-tuple proof count, then add those
# maxima up over all projects.
print(per_project_max_unique_proofs.groupby("project name").max().sum())