# Categorising repositories

This notebook is an attempt to categorise repositories and build classifiers.

In [None]:
from oss4energy.src.parsers.opensustain_tech import (
    fetch_categorised_projects_from_opensustain_webpage,
)
from pprint import pprint
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the categorised project listing. The cells below read it as a nested
# mapping (category -> sub-category -> entries) through `.keys()` / `.items()`.
# NOTE(review): judging by the helper's name this hits the OpenSustain web page,
# so the cell presumably needs network access — confirm.
categorised_repos = fetch_categorised_projects_from_opensustain_webpage()

In [None]:
# Names of the top-level categories
pprint([*categorised_repos.keys()])

In [None]:
# Sub-category names listed under each top-level category
pprint(
    {
        category: [*sub_categories.keys()]
        for category, sub_categories in categorised_repos.items()
    }
)

## Working out manual categories

In [None]:
# Making manual categories to start with.
# Each list is a selection spec consumed by `f_aggregate_to_list`:
#   - a plain string selects every sub-category of that top-level category;
#   - a dict {category: [sub-categories]} keeps only the listed sub-categories.
ENERGY = ["Energy Storage", "Energy Systems", "Renewable Energy"]
EARTH_SCIENCE = [
    "Atmosphere",
    "Hydrosphere",
    "Cryosphere",
    # Only part of "Climate Change" is treated as earth science
    {"Climate Change": ["Earth and Climate Modeling"]},
]

In [None]:
from oss4energy.src.parsers import identify_parsing_targets


def f_aggregate_to_list(
    repo_dict, path_labels: list[str | dict[str, list[str]]] | None = None
):
    """Flatten selected parts of ``repo_dict`` into a list of URLs.

    ``repo_dict`` is read as a two-level mapping:
    category -> sub-category -> list of entries.

    Args:
        repo_dict: The nested mapping described above.
        path_labels: What to select. Each item is either

            * a ``str``: every sub-category of that category is taken, or
            * a ``dict`` mapping category names to the sub-categories to keep
              (all keys of the dict are honoured, not only the first one).

            ``None`` (default) selects the whole of ``repo_dict``.

    Returns:
        Whatever ``identify_parsing_targets(entries).as_url_list()`` returns for
        the concatenated entries.

    Raises:
        KeyError: A requested category or sub-category is not in ``repo_dict``.
        TypeError: An item of ``path_labels`` is neither a ``str`` nor a ``dict``.
    """
    # If not provided, the labels are set so that the whole data is imported
    if path_labels is None:
        path_labels = list(repo_dict.keys())

    out = []
    for label in path_labels:
        # Normalise both accepted forms to {category: sub-categories or None},
        # where None stands for "all sub-categories".
        if isinstance(label, str):
            selection = {label: None}
        elif isinstance(label, dict):
            selection = label
        else:
            raise TypeError(
                f"path_labels items must be str or dict, got {type(label).__name__}"
            )

        for category, sub_categories in selection.items():
            if category not in repo_dict:
                raise KeyError(f"Unknown category: {category!r}")
            category_content = repo_dict[category]

            if sub_categories is None:
                sub_categories = list(category_content.keys())

            for sub_category in sub_categories:
                if sub_category not in category_content:
                    raise KeyError(
                        f"Unknown sub-category {sub_category!r} in category {category!r}"
                    )
                out.extend(category_content[sub_category])

    return identify_parsing_targets(out).as_url_list()

In [None]:
# Smoke test: number of URLs resolved for each manual category
n_energy = len(f_aggregate_to_list(categorised_repos, ENERGY))
n_earth_science = len(f_aggregate_to_list(categorised_repos, EARTH_SCIENCE))
print(
    f"""Testing:
    - ENERGY: {n_energy}
    - EARTH_SCIENCE: {n_earth_science}
    """
)

### Building up a series of classifiers

In [None]:
from oss4energy.scripts.listing_search import FILE_OUTPUT_LISTING_FEATHER, SearchResults
import pandas as pd

# Open the stored listing. The "../" prefix resolves the file relative to the
# parent of the current working directory.
# NOTE(review): presumably the kernel is expected to run one level below the
# repository root — confirm before running from elsewhere.
res = SearchResults("../" + FILE_OUTPUT_LISTING_FEATHER)

In [None]:
# Index the listed documents by their "id" column and report how many there are
df_docs = res.documents.set_index(keys="id")
print(df_docs.shape[0])

In [None]:
# Quick look at the first two documents
df_docs.head(n=2)

### Adding categories 

In [None]:
# URLs of every repository present in the search listing
repos_listed = df_docs["url"].tolist()

# URL lists from the categorised listing: everything, then one per manual category
all_opensustain_repos = f_aggregate_to_list(categorised_repos, path_labels=None)
energy_repos = f_aggregate_to_list(categorised_repos, path_labels=ENERGY)
earth_science_repos = f_aggregate_to_list(
    categorised_repos, path_labels=EARTH_SCIENCE
)


def _f_in_list(x) -> bool:
    """Tell whether ``x`` is one of the URLs held in the notebook-level ``repos_listed``."""
    is_listed = x in repos_listed
    return is_listed

In [None]:
# Adding labels
category_col = "category"

# Re-index the documents by URL (index named "idx") while keeping the "url"
# column. Done without `inplace=True` and without a temporary "idx" column.
df_docs = df_docs.set_index(df_docs["url"].rename("idx"))

# Everything starts as unknown ...
df_docs[category_col] = "?"

# ... then labels are applied from the most generic to the most specific list.
# Order matters: a later label overrides an earlier one for the same URL.
# `Index.isin` builds the row mask in one vectorised pass and simply ignores
# URLs that are not part of the listing.
for category_label, category_urls in (
    ("OTHER", all_opensustain_repos),
    ("ENERGY", energy_repos),
    ("EARTH_SCIENCE", earth_science_repos),
):
    df_docs.loc[df_docs.index.isin(category_urls), category_col] = category_label

In [None]:
# Check that the category column was added
df_docs.head(n=2)

### Training classifier

Tips from https://scikit-learn.org/1.5/auto_examples/text/plot_document_classification_20newsgroups.html

In [None]:
x_selected = ["OTHER", "ENERGY", "EARTH_SCIENCE"]

# Keep only the labelled documents that have a description.
# `notna()` rejects both None and NaN: the previous `is not None` test let NaN
# through, and NaN is not text the vectoriser can process.
df4training = df_docs.loc[
    df_docs[category_col].isin(x_selected) & df_docs["description"].notna()
].copy()

# Number of training documents available per category
df4training[[category_col, "name"]].groupby(category_col).count()

In [None]:
# For now, training on full dataset and not cleaning up words (VERY DIRTY)
vectorizer = TfidfVectorizer(
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=5,  # ignore terms found in fewer than 5 documents
    max_df=0.5,  # ignore terms found in more than half of the documents
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
)

In [None]:
# Targets: the manual category of each training document
y_full = df4training[category_col]

# Features: TF-IDF of the descriptions.
# NOTE(review): the vectoriser is fitted on every document *before* the split
# below, so the held-out documents contribute to the vocabulary and IDF weights.
x_full = vectorizer.fit_transform(df4training["description"])
# x_full = vectorizer.fit_transform(df4training["readme"])

# 60 % train / 40 % test, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    x_full,
    y_full,
    random_state=42,
    test_size=0.4,
)

#### Trying out ridge classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with the sparse conjugate-gradient solver, fitted on the training split
clf = RidgeClassifier(solver="sparse_cg", tol=1e-2)
clf.fit(X=X_train, y=y_train)

In [None]:
# Predicted category for every held-out document
y_predicted_on_test = clf.predict(X=X_test)

In [None]:
# Confusion matrix of true vs predicted categories on the test split
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_predicted_on_test,
    ax=ax,
)
_ = ax.set_title(
    f"Confusion Matrix for {clf.__class__.__name__}\non the test documents"
)

In [None]:
# Copy/pasted from https://scikit-learn.org/1.5/auto_examples/text/plot_document_classification_20newsgroups.html

# Both names alias the hand-ordered `x_selected` list.
# NOTE(review): this order is not necessarily the order in which the fitted
# classifier stores its classes (`clf.classes_`) — verify before using these
# names to label per-class arrays such as `clf.coef_`.
categories = x_selected
target_names = x_selected


def plot_feature_effects():
    """Plot and print the words with the largest average effect for each class.

    Reads three notebook-level objects: ``clf`` (the fitted linear classifier),
    ``X_train`` (the matrix it was fitted on) and ``vectorizer`` (the fitted
    vectoriser whose vocabulary defines the columns of ``X_train``).

    For every class, the five features with the highest
    ``coefficient * mean feature value`` are printed as a table, and the union
    of those features is drawn as a grouped horizontal bar chart.

    Returns:
        The matplotlib ``Axes`` holding the bar chart.
    """
    # Column index -> word, taken from the fitted vectoriser (this name was
    # previously used without ever being defined).
    feature_names = vectorizer.get_feature_names_out()

    # Row i of `clf.coef_` belongs to `clf.classes_[i]`, which scikit-learn keeps
    # in sorted order; labelling rows with a hand-ordered list mislabels them.
    # Assumes a multiclass fit (3+ classes) so that `coef_` has one row per class.
    class_names = clf.classes_
    n_classes = len(class_names)

    # learned coefficients weighted by frequency of appearance
    average_feature_effects = clf.coef_ * np.asarray(X_train.mean(axis=0)).ravel()

    # Indices of the 5 strongest features per class, strongest first
    top5_per_class = {
        label: np.argsort(average_feature_effects[i])[-5:][::-1]
        for i, label in enumerate(class_names)
    }
    top = pd.DataFrame(
        {label: feature_names[indices] for label, indices in top5_per_class.items()}
    )
    top_indices = np.unique(np.concatenate(list(top5_per_class.values())))
    predictive_words = feature_names[top_indices]

    # plot feature effects: one group of `n_classes` bars per word
    bar_size = 0.25
    padding = 0.75
    group_height = n_classes * bar_size + padding
    y_locs = np.arange(len(top_indices)) * group_height

    fig, ax = plt.subplots(figsize=(10, 8))
    for i, label in enumerate(class_names):
        ax.barh(
            # spread the bars of a group around the word's tick
            y_locs + (i - n_classes // 2) * bar_size,
            average_feature_effects[i, top_indices],
            height=bar_size,
            label=label,
        )
    ax.set(
        yticks=y_locs,
        yticklabels=predictive_words,
        ylim=[
            0 - n_classes * bar_size,
            len(top_indices) * group_height - n_classes * bar_size,
        ],
    )
    ax.legend(loc="lower right")

    print("top 5 keywords per class:")
    print(top)

    return ax


# Draw the chart, then title it (assigning to `_` hides the Text repr)
feature_effects_ax = plot_feature_effects()
_ = feature_effects_ax.set_title("Average feature effect on the original data")

For future work:
- [ ] Augment the original dataset with the categorical columns, where available
- [ ] Explore the performance of different classifiers (bag of words is pretty naive)