In [1]:
import pandas as pd
import os
import pickle as pkl
from pprint import pprint
from plotly import express as px
from lightgbm import LGBMRegressor

In [2]:
os.chdir("../../")

In [3]:
from lib.config import config
from lib.model.utils import qwk_obj

In [4]:
df = pd.DataFrame()
names = None

for fold in range(config.lgbm_n_folds):
    model: LGBMRegressor = pkl.load(open(f"output/LGBM/{fold}.pkl", "rb"))["model"]
    importances = model.feature_importances_
    
    if not names:
        names = model.feature_name_
        df = pd.concat([df, pd.DataFrame({"name": names})], axis=1)

    temp = pd.DataFrame({f"LGBM_{fold}": importances})
    df = pd.concat([df, temp], axis=1)

In [5]:
df.head(10)

Unnamed: 0,name,LGBM_0,LGBM_1,LGBM_2,LGBM_3,LGBM_4,LGBM_5,LGBM_6,LGBM_7,LGBM_8,...,LGBM_15,LGBM_16,LGBM_17,LGBM_18,LGBM_19,LGBM_20,LGBM_21,LGBM_22,LGBM_23,LGBM_24
0,deberta_m0_c0,98,117,95,87,78,108,147,90,136,...,54,119,84,104,114,114,130,83,106,85
1,deberta_m0_c1,86,91,98,73,89,104,117,103,101,...,75,87,83,84,93,104,85,90,95,86
2,deberta_m0_c2,86,91,90,86,83,81,86,85,77,...,66,72,82,83,79,88,88,82,81,79
3,deberta_m0_c3,51,51,46,54,47,67,50,47,68,...,42,63,39,66,45,57,59,45,58,52
4,deberta_m0_c4,46,48,55,38,43,41,58,36,50,...,33,57,43,48,53,49,51,48,46,37
5,deberta_m0_c5,20,33,31,28,25,33,46,30,34,...,19,32,28,24,25,28,29,29,28,34
6,deberta_m1_c0,53,70,54,45,49,58,83,52,74,...,29,57,55,56,58,49,60,45,48,47
7,deberta_m1_c1,50,61,44,47,43,48,46,60,67,...,41,63,58,61,53,62,49,57,66,43
8,deberta_m1_c2,82,87,95,89,80,80,90,82,93,...,68,77,70,75,74,79,83,67,78,81
9,deberta_m1_c3,48,48,39,52,61,45,56,38,57,...,35,31,42,46,48,45,47,32,55,41


In [6]:
df["importance"] = df.loc[:, "LGBM_0":].median(axis=1)
df.sort_values(by="importance", ascending=False, inplace=True)
df = df[["name", "importance"]]

## Overall

Top 10 most important features

In [7]:
df.head(10)

Unnamed: 0,name,importance
18,deberta_m3_c0,110.0
0,deberta_m0_c0,98.0
20,deberta_m3_c2,97.0
12,deberta_m2_c0,89.0
1,deberta_m0_c1,89.0
2,deberta_m0_c2,82.0
8,deberta_m1_c2,80.0
36,deberta_m6_c0,74.0
14,deberta_m2_c2,71.0
21,deberta_m3_c3,69.0


Top 10 least important features

In [8]:
df.tail(10)

Unnamed: 0,name,importance
7459,tfidf_7170,0.0
7458,tfidf_7169,0.0
7457,tfidf_7168,0.0
7456,tfidf_7167,0.0
7455,tfidf_7166,0.0
7454,tfidf_7165,0.0
7453,tfidf_7164,0.0
7452,tfidf_7163,0.0
7451,tfidf_7162,0.0
22085,tfidf_count_2169,0.0


## Broad Groups

In [9]:
def get_broad_group(name):
    group = name.split("_")[0]

    if group == "tfidf" and name.split("_")[1] == "count":
        return "tfidf_count"
    
    return group

In [10]:
df["broad_group"] = df["name"].map(get_broad_group)

In [11]:
px.box(
    df,
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

From an high level, it looks like
* `score` probabilities from DeBERTa model has he highest importance.
* `Paragraph` and `sentence` features also seem to be quite important as well.
* `word` seems to be the least important.
* `tfidf` and `count` have less importance.

In [12]:
def get_count_type(name):
    return "_".join(name.split("_")[1:3])

In [13]:
df["count_type"] = df["name"].map(get_count_type)

## Broad Operations

In [14]:
def get_operation_type(name):
    last_word = name.split("_")[-1]
    if last_word in ["sum", "max", "mean", "min", "q1", "q3", "first", "last", "kurtosis"]:
        return last_word
    
    if name.find("len_goe") != -1:
        return "greater_than"
    elif name.find("len_l") != -1:
        return "less_than"
    return "None"

In [15]:
df["operation_type"] = df["name"].map(get_operation_type)

In [16]:
px.box(
    df,
    x="importance",
    color="operation_type",
    range_x=(df.importance.min(), df.importance.max()),
)

Broadly, `min`, `kurtosis`, `q1`, `first` are worthless, regardless of broad group.

### Greater Than

In [17]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

In [18]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [19]:
df[(df.operation_type == "greater_than") & (df.importance >= 10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
127,paragraph_sentence_count_len_goe_5,44.0,paragraph,sentence_count,greater_than
147,paragraph_error_count_len_goe_5,19.0,paragraph,error_count,greater_than
131,paragraph_sentence_count_len_goe_15,12.0,paragraph,sentence_count,greater_than


**Verdict**

1. For paragraphs
   1. Count only sentence and error. Upper limit be 15.
2. For sentences
   1. Count only error and word. Upper limit be 20.

### Less Than

In [20]:
px.box(
    df[df.operation_type == "less_than"],
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

In [21]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [22]:
df[(df.operation_type == "less_than") & (df.importance >= 10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
126,paragraph_sentence_count_len_l_5,52.0,paragraph,sentence_count,less_than
146,paragraph_error_count_len_l_5,28.0,paragraph,error_count,less_than
130,paragraph_sentence_count_len_l_15,27.0,paragraph,sentence_count,less_than
132,paragraph_sentence_count_len_l_20,19.0,paragraph,sentence_count,less_than
221,sentence_word_count_len_l_30,13.0,sentence,word_count,less_than
249,sentence_error_count_len_l_35,12.0,sentence,error_count,less_than
150,paragraph_error_count_len_l_15,11.0,paragraph,error_count,less_than
223,sentence_word_count_len_l_40,10.0,sentence,word_count,less_than


**Verdict**

1. For paragraphs: Count only sentence and error. Upper limit be 20.
2. For sentences: Count only error and word. Upper limit be 20.

## Word Features

In [23]:
word_features = df.loc[df.broad_group == "word"]
print("Shape of word features:", word_features.shape)

Shape of word features: (32, 5)


In [24]:
px.box(
    word_features,
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [25]:
px.box(
    word_features,
    x="importance",
    color="operation_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [26]:
word_features[word_features.importance != 0]

Unnamed: 0,name,importance,broad_group,count_type,operation_type


Only `mean` of word length is important.

## Paragraph Features

In [27]:
paragraph_features = df.loc[df.broad_group == "paragraph"]
print("Shape of paragraph features:", paragraph_features.shape)

Shape of paragraph features: (124, 5)


In [28]:
px.box(
    paragraph_features,
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

## TFIDF and Count

In [29]:
tfidf_features = df.loc[df.broad_group.isin(["tfidf", "tfidf_count"])]
print("Shape of tfidf features:", tfidf_features.shape)

Shape of tfidf features: (21797, 5)


In [30]:
px.box(
    tfidf_features,
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

TFIDF and count features are worthles.

## DeBERTa Features

In [31]:
deberta_features = df.loc[df.broad_group.isin(["deberta"])]
print("Shape of deberta features:", deberta_features.shape)

Shape of deberta features: (42, 5)


In [33]:
def get_model_num(name):
    return int(name.split("_")[1][1])

deberta_features["model"] = deberta_features.name.map(get_model_num)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [35]:
px.box(
    deberta_features,
    x="importance",
    color="model",
    range_x=(df.importance.min(), df.importance.max()),
)