In [1]:
import pandas as pd
import os
import pickle as pkl
from plotly import express as px
from lightgbm import LGBMRegressor
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Move two directories up from where the kernel started (presumably the
# project root -- later cells import `lib.*` and read `output/...` relative
# to it; confirm). The start directory is remembered so that re-running this
# cell does not climb two more levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))

In [3]:
from lib.config import config
from lib.model.utils import qwk_obj

In [4]:
# Build one table with a "name" column plus one "LGBM_<fold>" column holding
# that fold's feature importances.
names = None
frames = []

for fold in range(config.lgbm_n_folds):
    # NOTE: unpickling executes arbitrary code -- only load trusted artifacts.
    # The context manager closes the file handle once the model is loaded.
    with open(f"output/LGBM/{fold}.pkl", "rb") as model_file:
        model: LGBMRegressor = pkl.load(model_file)["model"]
    importances = model.feature_importances_

    # Feature names are taken from the first fold only; an explicit None check
    # avoids an ambiguous truth value if the names are ever array-like.
    if names is None:
        names = model.feature_name_
        frames.append(pd.DataFrame({"name": names}))

    frames.append(pd.DataFrame({f"LGBM_{fold}": importances}))

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

In [5]:
# Preview the first ten rows of the per-fold importance table.
df.head(n=10)

Unnamed: 0,name,LGBM_0,LGBM_1,LGBM_2,LGBM_3,LGBM_4,LGBM_5,LGBM_6
0,paragraph_count,53,9,41,5,32,30,46
1,paragraph_sentenceCount_sum,38,7,32,9,21,17,19
2,paragraph_sentenceCount_min,10,1,12,0,8,13,18
3,paragraph_sentenceCount_mean,69,28,58,18,56,60,62
4,paragraph_sentenceCount_max,43,1,36,3,21,22,34
5,paragraph_wordCount_sum,99,36,83,43,84,69,68
6,paragraph_wordCount_min,65,13,64,6,56,50,51
7,paragraph_wordCount_mean,117,55,102,27,101,103,109
8,paragraph_wordCount_max,96,23,62,6,51,50,65
9,paragraph_lengths_sum,100,57,99,30,72,79,79


In [6]:
# Collapse the per-fold columns (everything from "LGBM_0" rightwards) into a
# row-wise median, order features from most to least important, and keep only
# the feature name and the aggregated score.
df = (
    df.assign(importance=df.loc[:, "LGBM_0":].median(axis=1))
    .sort_values(by="importance", ascending=False)
    .loc[:, ["name", "importance"]]
)

In [7]:
def get_broad_group(name):
    """Return the broad feature group encoded in a feature name.

    The group is the text before the first underscore. Names whose first two
    underscore-separated tokens are ``tfidf`` and ``count`` are reported as
    ``"tfidf_count"`` instead of ``"tfidf"``.

    Args:
        name: Feature name such as ``"paragraph_wordCount_sum"``.

    Returns:
        The group label as a string.
    """
    group, _, remainder = name.partition("_")

    # `remainder` is "" when the name has no underscore, so a bare "tfidf"
    # falls through to the plain group instead of raising IndexError.
    if group == "tfidf" and remainder.split("_")[0] == "count":
        return "tfidf_count"

    return group

In [8]:
# Label every feature with its broad group, derived from the feature name.
df["broad_group"] = df["name"].apply(get_broad_group)

In [9]:
def get_operation_type(name):
    """Return the text after the last underscore of ``name``.

    If ``name`` contains no underscore, the whole name is returned.
    """
    _, _, suffix = name.rpartition("_")
    return suffix

In [10]:
# Label every feature with the last underscore-separated token of its name.
df["operation_type"] = df["name"].apply(get_operation_type)

In [11]:
def get_specific(name):
    """Return the second underscore-separated token of ``name``.

    Raises:
        IndexError: If ``name`` contains no underscore.
    """
    tokens = name.split("_")
    return tokens[1]

# Label every feature with the second underscore-separated token of its name.
df["specific_type"] = df["name"].apply(get_specific)

## Broad Groups

In [12]:
# Horizontal box plot of aggregated importance, one box per broad group.
# The x-range spans the min/max importance over all features.
fig = px.box(
    data_frame=df,
    x="importance",
    color="broad_group",
    range_x=(df["importance"].min(), df["importance"].max()),
)
fig.update_layout(
    title_text="Feature Importance Across Folds<br><sup>Broad Groups of feature</sup>",
    title_x=0.5,
    width=960,  # half of a 1920x1080 canvas
    height=540,
    # Horizontal legend centred below the plot area.
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

## Paragraph Features

In [13]:
# Keep only the rows whose broad group is "paragraph" and report the size.
paragraph_features = df.loc[df["broad_group"].eq("paragraph")]
print("Shape of paragraph features:", paragraph_features.shape)

Shape of paragraph features: (28, 5)


In [14]:
# Box plot of paragraph-feature importance, one box per `specific_type`.
# The x-range is taken from the full table, not just the paragraph subset.
fig = px.box(
    data_frame=paragraph_features,
    x="importance",
    color="specific_type",
    range_x=(df["importance"].min(), df["importance"].max()),
)
fig.update_layout(
    title_text="Paragraph Feature Importance",
    title_x=0.5,
    width=960,  # half of a 1920x1080 canvas
    height=540,
    # Horizontal legend centred below the plot area.
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

## DeBERTa Features

In [15]:
# Keep only the rows whose broad group is "deberta" and report the size.
deberta_features = df.loc[df["broad_group"] == "deberta"]
print("Shape of deberta features:", deberta_features.shape)

Shape of deberta features: (0, 5)


In [16]:
def get_model_num(name):
    """Return the second character of the second underscore-separated token
    of ``name``, converted to ``int``.

    NOTE(review): only a single character is read, so this presumably expects
    tokens shaped like one prefix character followed by one digit -- confirm
    against the actual feature naming.
    """
    second_token = name.split("_")[1]
    digit = second_token[1]
    return int(digit)


# `deberta_features` was obtained by row-filtering `df`, so writing a column
# into it in place is the pattern pandas reports as SettingWithCopyWarning.
# `assign` returns a new frame carrying the extra "model" column instead.
deberta_features = deberta_features.assign(
    model=deberta_features["name"].map(get_model_num)
)

In [17]:
# Box plot of DeBERTa-feature importance, one box per `model` value.
# The x-range is taken from the full table, not just the DeBERTa subset.
fig = px.box(
    data_frame=deberta_features,
    x="importance",
    color="model",
    range_x=(df["importance"].min(), df["importance"].max()),
)
fig.update_layout(
    title_text="DeBERTa Feature Importance",
    title_x=0.5,
    width=960,  # half of a 1920x1080 canvas
    height=540,
    # Horizontal legend centred below the plot area.
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

## Word Features

In [18]:
# Keep only the rows whose broad group is "word" and report the size.
word_features = df.loc[df["broad_group"] == "word"]
print("Shape of word features:", word_features.shape)

Shape of word features: (22, 5)


In [19]:
def get_specific(name):
    """Return the second underscore-separated token of ``name``.

    Same definition as the one used to build ``specific_type``; repeated here
    so this cell does not depend on that cell having run.
    """
    return name.split("_")[1]


# `word_features` was obtained by row-filtering `df`; assigning a column into
# it in place triggered SettingWithCopyWarning. `assign` builds a new frame.
# NOTE(review): "specific" holds the same values as the existing
# "specific_type" column (same function applied to the same names).
word_features = word_features.assign(
    specific=word_features["name"].map(get_specific)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [20]:
# Box plot of word-feature importance, one box per `specific` value.
# The x-range is taken from the full table, not just the word subset.
fig = px.box(
    data_frame=word_features,
    x="importance",
    color="specific",
    range_x=(df["importance"].min(), df["importance"].max()),
)
fig.update_layout(
    title_text="Word Feature Importance",
    title_x=0.5,
    width=960,  # half of a 1920x1080 canvas
    height=540,
    # Horizontal legend centred below the plot area.
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

## Grammar Features

In [21]:
# Keep only the rows whose broad group is "grammar" and report the size.
grammar_features = df.loc[df["broad_group"] == "grammar"]
print("Shape of grammar features:", grammar_features.shape)

Shape of grammar features: (4, 5)


In [22]:
def get_specific(name):
    """Return the second underscore-separated token of ``name``.

    Same definition as the one used to build ``specific_type``; repeated here
    so this cell does not depend on that cell having run.
    """
    return name.split("_")[1]


# `grammar_features` was obtained by row-filtering `df`; assigning a column
# into it in place triggered SettingWithCopyWarning. `assign` builds a new
# frame instead.
# NOTE(review): "specific" holds the same values as the existing
# "specific_type" column (same function applied to the same names).
grammar_features = grammar_features.assign(
    specific=grammar_features["name"].map(get_specific)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
# Box plot of grammar-feature importance, one box per `specific` value.
fig = px.box(
    grammar_features,
    x="importance",
    color="specific",
    # x-range comes from the full table, not just the grammar subset.
    range_x=(df.importance.min(), df.importance.max()),
)
fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    # Plain string: the title has no placeholders, so no f-string is needed.
    title_text="Grammar Feature Importances<br><sup>Type Overview</sup>",
    # Horizontal legend centred below the plot area.
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    # Box traces are laid out by `boxmode`; `barmode` only affects bar traces.
    boxmode="group",
)
fig.show()

In [24]:
# Box plot of grammar-feature importance, one box per `operation_type` value.
fig = px.box(
    grammar_features,
    x="importance",
    color="operation_type",
    # x-range comes from the full table, not just the grammar subset.
    range_x=(df.importance.min(), df.importance.max()),
)
fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    # Plain string: the title has no placeholders, so no f-string is needed.
    title_text="Grammar Feature Importances<br><sup>Operation Type Overview</sup>",
    # Horizontal legend centred below the plot area.
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    # Box traces are laid out by `boxmode`; `barmode` only affects bar traces.
    boxmode="group",
)
fig.show()