In [1]:
import pandas as pd
import os
import pickle as pkl
from pprint import pprint
from plotly import express as px
from lightgbm import LGBMRegressor

In [2]:
# Move the working directory two levels up so the relative paths used later
# in this notebook (e.g. "output/LGBM_1/...") are resolved from there.
# NOTE(review): this is not idempotent — re-running the cell without a kernel
# restart moves up two more levels.
os.chdir("../../")

In [3]:
from lib.config import config
from lib.model.utils import qwk_obj

In [4]:
# Gather the feature importances of every fold model into a single frame with
# one "name" column and one "LGBM_<fold>" column per fold.
names = None
fold_importances = {}

for fold in range(config.lgbm_n_folds):
    # Use a context manager so the pickle file handle is closed promptly.
    # NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
    with open(f"output/LGBM_1/{fold}.pkl", "rb") as model_file:
        model: LGBMRegressor = pkl.load(model_file)["model"]

    fold_feature_names = list(model.feature_name_)
    if names is None:
        names = fold_feature_names
    elif fold_feature_names != names:
        # Columns are combined positionally, so differing feature lists would
        # silently attribute importances to the wrong feature names.
        raise ValueError(
            f"Fold {fold} was trained on a different feature list than fold 0"
        )

    fold_importances[f"LGBM_{fold}"] = model.feature_importances_

# Build the frame once instead of concatenating inside the loop.
if names is None:
    # No folds configured: keep the original behaviour of an empty frame.
    df = pd.DataFrame()
else:
    df = pd.DataFrame({"name": names, **fold_importances})

In [5]:
# Preview the first ten rows of the per-fold importance table.
df.head(n=10)

Unnamed: 0,name,LGBM_0,LGBM_1,LGBM_2,LGBM_3,LGBM_4,LGBM_5,LGBM_6,LGBM_7,LGBM_8,...,LGBM_15,LGBM_16,LGBM_17,LGBM_18,LGBM_19,LGBM_20,LGBM_21,LGBM_22,LGBM_23,LGBM_24
0,score_prob_0,346,231,268,183,205,320,299,254,260,...,336,186,246,322,215,275,310,253,252,349
1,score_prob_1,292,199,238,145,221,263,223,218,177,...,237,129,193,230,229,221,224,185,198,233
2,score_prob_2,341,271,305,285,260,345,339,315,300,...,367,270,321,336,255,360,375,305,327,354
3,score_prob_3,210,197,199,148,142,219,244,237,211,...,259,169,225,244,157,217,230,195,225,268
4,score_prob_4,170,164,150,150,146,175,197,187,180,...,210,137,183,206,136,186,179,172,170,177
5,score_prob_5,298,193,246,134,185,289,235,221,204,...,258,128,178,248,198,231,253,209,202,256
6,paragraph_error_count_mean,2,4,3,4,2,1,3,5,0,...,2,5,4,0,4,1,2,4,1,5
7,paragraph_error_count_min,1,0,0,0,1,0,1,1,1,...,2,0,0,1,0,0,0,0,0,1
8,paragraph_error_count_max,1,1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,1,0
9,paragraph_error_count_sum,1,1,0,0,1,1,2,1,0,...,2,0,2,1,0,2,2,1,0,3


In [6]:
# Summarise each feature by the median of its per-fold importances (every
# column from "LGBM_0" onwards), rank features from most to least important,
# and keep only the name and the aggregated score.
per_fold_scores = df.loc[:, "LGBM_0":]
ranked = df.assign(importance=per_fold_scores.median(axis=1)).sort_values(
    by="importance", ascending=False
)
df = ranked[["name", "importance"]]

## Overall

Top 10 most important features

In [7]:
# First ten rows of the frame sorted by descending importance.
df.head(n=10)

Unnamed: 0,name,importance
2,score_prob_2,321.0
0,score_prob_0,260.0
5,score_prob_5,221.0
1,score_prob_1,218.0
3,score_prob_3,217.0
4,score_prob_4,171.0
15,paragraph_char_count_sum,109.0
21,paragraph_word_count_sum,99.0
145,sentence_word_count_sum,98.0
139,sentence_char_count_sum,93.0


Top 10 least important features

In [8]:
# Last ten rows of the frame sorted by descending importance.
df.tail(n=10)

Unnamed: 0,name,importance
7511,tfidf_7258,0.0
7510,tfidf_7257,0.0
7509,tfidf_7256,0.0
7508,tfidf_7255,0.0
7507,tfidf_7254,0.0
7506,tfidf_7253,0.0
7505,tfidf_7252,0.0
7504,tfidf_7251,0.0
7503,tfidf_7250,0.0
22049,tfidf_count_2169,0.0


## Broad Groups

In [9]:
def get_broad_group(name):
    """Return the coarse feature family encoded in a feature name.

    The family is the text before the first underscore, except that names
    whose first two underscore-separated tokens are ``tfidf`` and ``count``
    are reported as ``"tfidf_count"``.

    Args:
        name: Feature name such as ``"paragraph_error_count_mean"``.

    Returns:
        The family string, e.g. ``"paragraph"``, ``"tfidf"`` or
        ``"tfidf_count"``.
    """
    tokens = name.split("_")

    # Comparing a slice (rather than indexing tokens[1]) keeps a bare "tfidf"
    # name from raising IndexError.
    if tokens[:2] == ["tfidf", "count"]:
        return "tfidf_count"

    return tokens[0]

In [10]:
# Tag every feature with its broad group, derived from the feature name.
df["broad_group"] = df["name"].apply(get_broad_group)

In [11]:
# Box plot of importance, one box per broad group; the x-axis spans the full
# observed importance range.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=df, x="importance", color="broad_group", range_x=importance_range)

From a high level, it looks like
* `score` probabilities from the DeBERTa model have the highest importance.
* `paragraph` and `sentence` features also seem to be quite important.
* `word` seems to be the least important.
* `tfidf` and `count` have less importance.

In [12]:
def get_count_type(name):
    """Return the second and third underscore-separated tokens of ``name``.

    The tokens are re-joined with an underscore, e.g.
    ``"paragraph_error_count_mean"`` gives ``"error_count"``. Names with
    fewer tokens yield whatever is available (possibly an empty string).
    """
    tokens = name.split("_")
    middle_tokens = tokens[1:3]
    return "_".join(middle_tokens)

In [13]:
# Tag every feature with its count type, derived from the feature name.
df["count_type"] = df["name"].apply(get_count_type)

## Broad Operations

In [14]:
def get_operation_type(name):
    """Classify the aggregation or threshold operation named in a feature.

    Returns the final underscore-separated token when it is one of the known
    aggregations; otherwise ``"greater_than"`` for names containing
    ``"len_goe"``, ``"less_than"`` for names containing ``"len_l"``, and the
    string ``"None"`` when nothing matches.
    """
    known_aggregations = (
        "sum", "max", "mean", "min", "q1", "q3", "first", "last", "kurtosis",
    )

    suffix = name.rsplit("_", 1)[-1]
    if suffix in known_aggregations:
        return suffix

    # Threshold features: the "len_goe" check comes before the "len_l" check.
    if "len_goe" in name:
        return "greater_than"
    if "len_l" in name:
        return "less_than"

    return "None"

In [15]:
# Tag every feature with its operation type, derived from the feature name.
df["operation_type"] = df["name"].apply(get_operation_type)

In [16]:
# Box plot of importance, one box per operation type, over the full
# importance range.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=df, x="importance", color="operation_type", range_x=importance_range)

Broadly, `min`, `kurtosis`, `q1`, `first` are worthless, regardless of broad group.

### Greater Than

In [17]:
# "greater_than" features only, split by broad group; the x-axis still spans
# the importance range of the whole frame.
greater_than_features = df.loc[df["operation_type"] == "greater_than"]
importance_range = (df["importance"].min(), df["importance"].max())
px.box(
    data_frame=greater_than_features,
    x="importance",
    color="broad_group",
    range_x=importance_range,
)

In [18]:
# "greater_than" features only, split by count type; the x-axis still spans
# the importance range of the whole frame.
greater_than_features = df.loc[df["operation_type"] == "greater_than"]
importance_range = (df["importance"].min(), df["importance"].max())
px.box(
    data_frame=greater_than_features,
    x="importance",
    color="count_type",
    range_x=importance_range,
)

In [19]:
# "greater_than" features whose median importance is at least 10.
df.loc[df["operation_type"].eq("greater_than") & df["importance"].ge(10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
206,sentence_error_count_len_goe_15,32.0,sentence,error_count,greater_than
184,sentence_word_count_len_goe_20,25.0,sentence,word_count,greater_than
91,paragraph_sentence_count_len_goe_5,22.0,paragraph,sentence_count,greater_than
208,sentence_error_count_len_goe_20,14.0,sentence,error_count,greater_than
95,paragraph_sentence_count_len_goe_15,13.0,paragraph,sentence_count,greater_than
182,sentence_word_count_len_goe_10,10.0,sentence,word_count,greater_than


**Verdict**

1. For paragraphs
   1. Count only sentences and errors. The upper limit should be 15.
2. For sentences
   1. Count only errors and words. The upper limit should be 20.

### Less Than

In [20]:
# "less_than" features only, split by broad group; the x-axis still spans
# the importance range of the whole frame.
less_than_features = df.loc[df["operation_type"] == "less_than"]
importance_range = (df["importance"].min(), df["importance"].max())
px.box(
    data_frame=less_than_features,
    x="importance",
    color="broad_group",
    range_x=importance_range,
)

In [21]:
# "less_than" features only, split by count type. This cell sits in the
# "Less Than" section, so it must filter on "less_than"; it previously
# filtered on "greater_than" and so repeated the earlier greater-than plot.
px.box(
    df[df.operation_type == "less_than"],
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [22]:
# "less_than" features whose median importance is at least 10.
df.loc[df["operation_type"].eq("less_than") & df["importance"].ge(10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
205,sentence_error_count_len_l_15,47.0,sentence,error_count,less_than
183,sentence_word_count_len_l_20,38.0,sentence,word_count,less_than
90,paragraph_sentence_count_len_l_5,32.0,paragraph,sentence_count,less_than
207,sentence_error_count_len_l_20,23.0,sentence,error_count,less_than
94,paragraph_sentence_count_len_l_15,20.0,paragraph,sentence_count,less_than
181,sentence_word_count_len_l_10,16.0,sentence,word_count,less_than
110,paragraph_error_count_len_l_5,15.0,paragraph,error_count,less_than
96,paragraph_sentence_count_len_l_20,15.0,paragraph,sentence_count,less_than
157,sentence_char_count_len_l_25,12.0,sentence,char_count,less_than


**Verdict**

1. For paragraphs: Count only sentences and errors. The upper limit should be 20.
2. For sentences: Count only errors and words. The upper limit should be 20.

## Word Features

In [23]:
# Keep only the rows whose broad group is "word" and report the subset size.
is_word_feature = df["broad_group"].eq("word")
word_features = df.loc[is_word_feature]
print("Shape of word features:", word_features.shape)

Shape of word features: (32, 5)


In [24]:
# Word features split by count type, drawn on the importance range of the
# whole frame.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=word_features, x="importance", color="count_type", range_x=importance_range)

In [25]:
# Word features split by operation type, drawn on the importance range of
# the whole frame.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=word_features, x="importance", color="operation_type", range_x=importance_range)

In [26]:
# Word features with a non-zero median importance.
word_features.loc[word_features["importance"].ne(0)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
221,word_char_count_mean,4.0,word,char_count,mean
226,word_char_count_<lambda_2>,1.0,word,char_count,


Only `mean` of word length is important.

## Paragraph Features

In [27]:
# Keep only the rows whose broad group is "paragraph" and report the subset size.
is_paragraph_feature = df["broad_group"].eq("paragraph")
paragraph_features = df.loc[is_paragraph_feature]
print("Shape of paragraph features:", paragraph_features.shape)

Shape of paragraph features: (124, 5)


In [28]:
# Paragraph features split by count type, drawn on the importance range of
# the whole frame.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=paragraph_features, x="importance", color="count_type", range_x=importance_range)

## TFIDF and Count

In [29]:
# Keep the rows belonging to either of the two tf-idf groups and report the
# subset size.
tfidf_groups = ["tfidf", "tfidf_count"]
is_tfidf_feature = df["broad_group"].isin(tfidf_groups)
tfidf_features = df.loc[is_tfidf_feature]
print("Shape of tfidf features:", tfidf_features.shape)

Shape of tfidf features: (21797, 5)


In [35]:
# tf-idf features split by broad group, drawn on the importance range of the
# whole frame.
importance_range = (df["importance"].min(), df["importance"].max())
px.box(data_frame=tfidf_features, x="importance", color="broad_group", range_x=importance_range)

TFIDF and count features are worthless.