In [74]:
# import polars as pd
import os

import pandas as pd


class AnalyzeGraph:
    """A collection of clusters with summary statistics and set operations.

    The same clusters are exposed in three forms:

    * ``data`` - list of clusters (each a list of members), ordered from the
      longest cluster to the shortest;
    * ``set``  - set of clusters stored as tuples, used by :meth:`set_diff`
      and :meth:`set_intersection`;
    * ``df``   - ``pandas.DataFrame`` with one cluster per row.
    """

    def __init__(self, path):
        """Load clusters from a text file or copy them from an iterable.

        Args:
            path: Either a ``str`` file path or an iterable of clusters.
                For a file, every non-blank line is split on whitespace, the
                first token is discarded (presumably a cluster id/label -
                confirm against the input format) and the remaining tokens
                are sorted to form one cluster. For an iterable, each element
                is copied into a list as-is (members are NOT re-sorted).
        """
        if isinstance(path, str):
            data = []
            with open(path, "r") as f:
                for line in f:
                    # split() tolerates tabs and repeated spaces, which
                    # split(" ") would turn into empty-string members.
                    tokens = line.split()
                    if not tokens:
                        # Blank line: not a cluster, so do not record an
                        # empty one (it would distort the statistics).
                        continue
                    data.append(sorted(tokens[1:]))
        else:
            data = [list(x) for x in path]

        # Longest clusters first; list.sort is stable, so clusters of equal
        # length keep their input order.
        data.sort(key=len, reverse=True)

        # as list (independent copies of every cluster)
        self.data: list = [list(x) for x in data]
        # as set (tuples because lists are not hashable)
        self.set = {tuple(x) for x in self.data}
        # as dataframe
        self.df = pd.DataFrame(data)

        # Lazily computed cache for the `sizes` property.
        self._sizes = None

    @property
    def sizes(self) -> pd.DataFrame:
        """Histogram of cluster lengths.

        Returns:
            DataFrame with columns ``Size`` (cluster length) and ``Amount``
            (number of clusters of that length). Computed on first access
            and cached afterwards.
        """
        if self._sizes is None:
            counts = {}
            for cluster in self.data:
                counts[len(cluster)] = counts.get(len(cluster), 0) + 1

            self._sizes = pd.DataFrame(
                {"Size": list(counts.keys()), "Amount": list(counts.values())}
            )
        return self._sizes

    def cluster_statistics(self) -> pd.DataFrame:
        """Summarise the cluster lengths.

        Returns:
            DataFrame with columns ``Statistic`` and ``Value`` holding the
            maximum, minimum and average (rounded down) cluster length and
            the number of clusters. For an empty collection - e.g. the
            result of a set operation with nothing left - every value is 0
            instead of raising.
        """
        no_clusters = len(self.data)

        if no_clusters == 0:
            maximum = minimum = ave = 0
        else:
            # self.data is ordered longest-first, so the extremes sit at the ends.
            maximum = len(self.data[0])
            minimum = len(self.data[-1])
            # round down
            ave = sum(map(len, self.data)) // no_clusters

        return pd.DataFrame(
            {
                "Statistic": [
                    "Maximum Cluster Len",
                    "Minimum Cluster Len",
                    "Average Cluster Len",
                    "Number of Clusters"
                ],
                "Value": [maximum, minimum, ave, no_clusters],
            }
        )

    def set_diff(self, other: "AnalyzeGraph") -> "AnalyzeGraph":
        """Return the clusters present in ``self`` but not in ``other``.

        Clusters are compared as tuples, so two clusters are equal only when
        they list the same members in the same order.
        """
        return AnalyzeGraph(self.set - other.set)

    def set_intersection(self, other: "AnalyzeGraph") -> "AnalyzeGraph":
        """Return the clusters present in both ``self`` and ``other``.

        Clusters are compared as tuples, so two clusters are equal only when
        they list the same members in the same order.
        """
        return AnalyzeGraph(self.set & other.set)

    def save_clusters(self, path: str) -> None:
        """Write ``self.df`` to ``path`` as space-separated text."""
        self.df.to_csv(path, sep=" ")


In [75]:
# Load the clusters stored in "Collinsv2.txt" and display their summary
# statistics (max / min / average cluster length and cluster count).
test = AnalyzeGraph("Collinsv2.txt")
test.cluster_statistics()

Unnamed: 0,Statistic,Value
0,Maximum Cluster Len,29
1,Minimum Cluster Len,2
2,Average Cluster Len,5
3,Number of Clusters,401


In [76]:
# Display the loaded clusters as a DataFrame: one cluster per row, longest first.
test.df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,YBL027W,YDL082W,YDL136W,YDR012W,YDR382W,YDR418W,YDR471W,YFR031C-A,YGL103W,YGL135W,...,YLR448W,YMR142C,YMR242C,YNL069C,YNL301C,YOL127W,YOR063W,YPL198W,YPL220W,YPL249C-A
1,YBL027W,YDL075W,YDL082W,YDL136W,YDR012W,YDR382W,YDR418W,YDR471W,YFR031C-A,YGL135W,...,YMR142C,YMR242C,YNL069C,YNL301C,YOL127W,YOR063W,YOR234C,YPL198W,YPL220W,YPL249C-A
2,YBL027W,YDL082W,YDL136W,YDR012W,YDR382W,YDR418W,YDR471W,YFR031C-A,YGL030W,YGL135W,...,YNL069C,YNL301C,YOL127W,YOR063W,YPL198W,YPL220W,YPL249C-A,,,
3,YBL027W,YDL075W,YDL082W,YDL136W,YDR012W,YDR382W,YDR418W,YDR471W,YFR031C-A,YGL135W,...,YNL069C,YNL301C,YOL127W,YOR063W,YPL198W,YPL220W,YPL249C-A,,,
4,YBL027W,YDL082W,YDL136W,YDR012W,YDR382W,YDR418W,YDR471W,YFR031C-A,YGL135W,YGR034W,...,YNL069C,YNL301C,YOL127W,YOR063W,YPL198W,YPL220W,YPL249C-A,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,YDR092W,YGL087C,,,,,,,,,...,,,,,,,,,,
397,YJR104C,YMR038C,,,,,,,,,...,,,,,,,,,,
398,YJR052W,YPL046C,,,,,,,,,...,,,,,,,,,,
399,YLR270W,YOR173W,,,,,,,,,...,,,,,,,,,,


In [77]:
def csv_to_longtable(csv_path: str, caption: str, save_path: str = None, cont_annotation: str = "") -> str:
    """Render a space-separated cluster file as a one-column LaTeX longtable.

    The first line of ``csv_path`` is treated as a header and skipped. For
    every following line the text up to and including the first space (the
    first column) is dropped and the remainder becomes one table row.

    Args:
        csv_path: Path of the space-separated input file.
        caption: Text placed inside ``\\caption{...}``; inserted verbatim.
        save_path: When truthy, the generated LaTeX is also written here.
        cont_annotation: Text shown in the "Cluster (cont. ...)" header that
            is repeated on continuation pages.

    Returns:
        The complete LaTeX source as a single newline-joined string.
    """
    rule = "\\hline"

    preamble = [
        "\\setlength{\\extrarowheight}{2pt}",
        "\\renewcommand{\\arraystretch}{1.2}",
        "\\begin{longtable}{| m{27em} |}",
        f"\\caption{{{caption}}} \\\\",
    ]
    first_head = [rule, "\\textbf{Cluster} \\\\", rule, "\\endfirsthead"]
    running_head = [
        rule,
        f"\\textbf{{Cluster (cont. {cont_annotation})}} \\\\",
        rule,
        "\\endhead",
    ]
    feet = [rule, "\\endfoot", rule, "\\endlastfoot"]

    rows = []
    with open(csv_path, 'r') as source:
        next(source, None)  # discard the header line, if there is one
        for record in source:
            # Everything after the first space, i.e. the 2nd and later columns.
            _, _, members = record.strip().partition(" ")
            rows += [f"{members} \\\\", rule]

    rendered = '\n'.join(
        preamble + first_head + running_head + feet + rows + ["\\end{longtable}"]
    )

    if save_path:
        with open(save_path, 'w') as sink:
            sink.write(rendered)

    return rendered


In [78]:
import subprocess
def test_suite(graph: str) -> None:
    """Compare the three versions of one graph and write the reports to disk.

    Reads ``{graph}v1.txt``, ``{graph}v2.txt`` and ``{graph}v3.txt`` and
    writes into the directory ``{graph}/`` (created if missing):

    * ``v1.txt`` .. ``v3.txt`` - statistics of each version;
    * ``v1&v2.txt``, ``v1&v3.txt``, ``v2&v3.txt``, ``v1&v2&v3.txt`` -
      statistics of the intersections;
    * for every ordered pair ``vA``/``vB`` with ``A != B``:
      ``vA-vB.txt`` (statistics of the difference), ``vA-vB_clusters.txt``
      (the clusters themselves) and ``vA-vB.tex`` (LaTeX longtable).

    Args:
        graph: Base name of the input files and name of the output directory.
    """
    # Portable replacement for `mkdir -p`; raises if the directory cannot be
    # created instead of silently continuing.
    os.makedirs(graph, exist_ok=True)

    def out_path(stem: str) -> str:
        return f"{graph}/{stem}.txt"

    v1 = AnalyzeGraph(graph + "v1.txt")
    v2 = AnalyzeGraph(graph + "v2.txt")
    v3 = AnalyzeGraph(graph + "v3.txt")

    # NOTE: .to_csv() is customizable

    # base graphs
    v1.cluster_statistics().to_csv(out_path("v1"))
    v2.cluster_statistics().to_csv(out_path("v2"))
    v3.cluster_statistics().to_csv(out_path("v3"))

    # intersections (v2 & v3 is computed once and reused for the triple one)
    v2_and_v3 = v2.set_intersection(v3)
    v1.set_intersection(v2).cluster_statistics().to_csv(out_path("v1&v2"))
    v1.set_intersection(v3).cluster_statistics().to_csv(out_path("v1&v3"))
    v2_and_v3.cluster_statistics().to_csv(out_path("v2&v3"))
    v2_and_v3.set_intersection(v1).cluster_statistics().to_csv(out_path("v1&v2&v3"))

    # set diff: every ordered pair of distinct versions
    versions: list[tuple[str, AnalyzeGraph]] = [
        ("v1", v1),
        ("v2", v2),
        ("v3", v3)
    ]

    for kept_name, kept in versions:
        for removed_name, removed in versions:
            if kept_name == removed_name:
                continue

            name = f"{kept_name}-{removed_name}"
            analyze = kept.set_diff(removed)
            analyze.cluster_statistics().to_csv(out_path(name))

            cluster_path = out_path(name + "_clusters")
            analyze.save_clusters(cluster_path)

            # Second character of "v1"/"v2"/"v3" is the version digit used
            # as the subscript in the captions.
            kept_tag = kept_name[1]
            removed_tag = removed_name[1]
            caption = f"Specific clusters found in {graph}\\textsubscript{{{kept_tag}}}, but not in {graph}\\textsubscript{{{removed_tag}}}"
            caption2 = f"{graph}\\textsubscript{{{kept_tag}}} - {graph}\\textsubscript{{{removed_tag}}}"
            csv_to_longtable(cluster_path, caption, f"{graph}/{name}.tex", caption2)


In [79]:
# Generate the statistics, cluster lists and LaTeX tables for every dataset.
for dataset in ("Collins", "Gavin", "KroganExt", "KroganCore"):
    test_suite(dataset)