# Categorising repositories

This notebook is an attempt to categorise repositories, starting from the categories published by OpenSustain.tech, and to build classifiers from the resulting labels.

In [None]:
from oss4energy.src.parsers.opensustain_tech import fetch_categorised_projects_from_from_opensustain_webpage
from pprint import pprint

# Fetch the categorised projects. The cells below treat the result as a nested
# mapping: top-level keys, each holding a mapping with its own keys.
# NOTE(review): judging by its name, the helper reads the OpenSustain web page,
# so this cell presumably needs network access - confirm.
categorised_repos = fetch_categorised_projects_from_from_opensustain_webpage()

In [None]:
# Top-level category names (iterating a mapping yields its keys)
pprint(list(categorised_repos))

In [None]:
# Sub-category names available under each top-level category
pprint(
    {
        category: list(sub_categories)
        for category, sub_categories in categorised_repos.items()
    }
)

## Working out manual categories

In [None]:
# Hand-made category groupings to start with.
# Each entry is either a category name (taken as a whole) or a
# {category: [sub-categories]} dict that keeps only the listed sub-categories.
ENERGY = [
    "Energy Storage",
    "Energy Systems",
    "Renewable Energy",
]
EARTH_SCIENCE = [
    "Atmosphere",
    "Hydrosphere",
    "Cryosphere",
    {"Climate Change": ["Earth and Climate Modeling"]},
]

In [None]:
from oss4energy.src.parsers import identify_parsing_targets

def f_aggregate_to_list(
    repo_dict: dict[str, dict[str, list]],
    path_labels: list[str | dict[str, list[str]]] | None = None,
):
    """Flatten selected (sub-)categories of ``repo_dict`` and turn them into parsing targets.

    Args:
        repo_dict: Nested mapping ``{category: {sub_category: entries}}``.
        path_labels: What to aggregate. Each item is either a category name
            (all of its sub-categories are taken) or a dict
            ``{category: [sub_category, ...]}`` restricting the category to the
            listed sub-categories. A dict may hold several categories. Items of
            any other type are ignored. ``None`` selects every category.

    Returns:
        The result of ``identify_parsing_targets(entries).as_list()`` applied to
        the concatenation of all selected entries.

    Raises:
        KeyError: If a requested category or sub-category is not in ``repo_dict``.
    """
    # If not provided, the labels are set so that the whole data is imported
    if path_labels is None:
        path_labels = list(repo_dict.keys())

    def _lookup(mapping, key, kind: str):
        # Explicit error instead of the opaque failure on None that `.get()` produced
        if key not in mapping:
            raise KeyError(f"Unknown {kind} {key!r}; available: {list(mapping)}")
        return mapping[key]

    out = []
    for label in path_labels:
        if isinstance(label, str):
            # A bare category name means "every sub-category of that category"
            selections = {label: list(_lookup(repo_dict, label, "category").keys())}
        elif isinstance(label, dict):
            # Every key of the dict is honoured, not only the first one
            selections = label
        else:
            continue

        for category, sub_categories in selections.items():
            category_content = _lookup(repo_dict, category, "category")
            for sub_category in sub_categories:
                out.extend(_lookup(category_content, sub_category, "sub-category"))

    return identify_parsing_targets(out).as_list()

In [None]:
# Sanity check: number of parsing targets found for each manual grouping
n_energy = len(f_aggregate_to_list(categorised_repos, ENERGY))
n_earth_science = len(f_aggregate_to_list(categorised_repos, EARTH_SCIENCE))
print(
    f"""Testing:
    - ENERGY: {n_energy}
    - EARTH_SCIENCE: {n_earth_science}
    """
)

### Building up a series of classifiers

In [None]:
from oss4energy.scripts.listing_search import FILE_OUTPUT_LISTING_FEATHER, SearchResults
import pandas as pd

# The listing path is prefixed with "../"
# NOTE(review): presumably because the notebook runs one directory below the
# location the constant is relative to - confirm.
res = SearchResults(f"../{FILE_OUTPUT_LISTING_FEATHER}")

In [None]:
# Index the documents by their "id" column and report the number of rows
df_docs = res.documents.set_index("id")
print(df_docs.shape[0])

In [None]:
# Preview the first two documents to check the available columns
df_docs.head(2)

### Adding categories 

In [None]:
# URLs of every document in the listing
repos_listed = df_docs["url"].to_list()
# No path labels => every category is aggregated. Passing ENERGY here would make
# this list identical to energy_repos, leaving no repository outside the two
# manual groupings.
all_opensustain_repos = f_aggregate_to_list(categorised_repos)
energy_repos = f_aggregate_to_list(categorised_repos, ENERGY)
earth_science_repos = f_aggregate_to_list(categorised_repos, EARTH_SCIENCE)

def _f_in_list(x) -> bool:
    """Tell whether ``x`` is one of the entries of the module-level ``repos_listed``."""
    is_listed = x in repos_listed
    return is_listed

In [None]:
# Adding labels
# Every document starts unlabelled ("?"). Labels are then applied from the
# broadest to the most specific group, later assignments overwriting earlier ones.
category_col = "category"
df_docs[category_col] = "?"
for _label, _repos in (
    ("OTHER", all_opensustain_repos),
    ("ENERGY", energy_repos),
    ("EARTH_SCIENCE", earth_science_repos),
):
    # Rows are matched on the "url" column with a boolean mask. This does not
    # rely on the index ("id") holding the same values as "url", and avoids
    # scanning a Python list once per repository.
    df_docs.loc[df_docs["url"].isin(_repos), category_col] = _label

In [None]:
# Preview the first two documents again, now with the category column added
df_docs.head(2)

### Training classifier

Tips taken from the scikit-learn example [Classification of text documents using sparse features](https://scikit-learn.org/1.5/auto_examples/text/plot_document_classification_20newsgroups.html).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels kept for training; documents still marked "?" are left out
x_selected = ["OTHER", "ENERGY", "EARTH_SCIENCE"]

# Vectorised membership test: always yields a boolean mask, even on an empty frame.
# The copy lets the training frame be modified without touching df_docs.
df4training = df_docs[df_docs[category_col].isin(x_selected)].copy()

In [None]:
# Inspect the last 30 documents, labels included
df_docs.tail(30)

In [None]:
# Number of documents with a non-null "name" in each category
df_docs.groupby(category_col)[["name"]].count()

In [None]:
# Display the training subset (rows whose category is one of x_selected)
df4training