In [1]:
import pandas as pd
import os
import pickle as pkl
from pprint import pprint
from plotly import express as px
from lightgbm import LGBMRegressor

In [2]:
os.chdir("../../")

In [3]:
from lib.config import config
from lib.model.utils import qwk_obj

In [4]:
df = pd.DataFrame()
names = None

for fold in range(config.lgbm_n_folds):
    model: LGBMRegressor = pkl.load(open(f"output/LGBM/{fold}.pkl", "rb"))["model"]
    importances = model.feature_importances_

    if not names:
        names = model.feature_name_
        df = pd.concat([df, pd.DataFrame({"name": names})], axis=1)

    temp = pd.DataFrame({f"LGBM_{fold}": importances})
    df = pd.concat([df, temp], axis=1)

In [5]:
df.head(10)

Unnamed: 0,name,LGBM_0,LGBM_1,LGBM_2,LGBM_3,LGBM_4,LGBM_5,LGBM_6,LGBM_7,LGBM_8,...,LGBM_15,LGBM_16,LGBM_17,LGBM_18,LGBM_19,LGBM_20,LGBM_21,LGBM_22,LGBM_23,LGBM_24
0,deberta_m0_c0,107,138,122,118,131,115,149,131,156,...,73,118,129,134,119,71,97,74,175,119
1,deberta_m0_c1,89,111,97,99,112,93,122,123,121,...,60,74,105,111,112,70,85,52,130,95
2,deberta_m0_c2,105,117,114,115,110,98,124,106,113,...,87,106,111,109,110,75,104,80,131,104
3,deberta_m0_c3,68,63,68,57,66,70,81,73,90,...,38,69,74,70,62,48,54,42,97,62
4,deberta_m0_c4,51,65,59,60,68,54,72,77,81,...,40,56,65,77,70,44,57,45,75,65
5,deberta_m0_c5,29,41,35,30,39,39,48,45,42,...,20,31,43,39,33,22,25,22,53,30
6,deberta_m1_c0,54,70,78,80,67,54,95,84,93,...,35,49,68,70,77,38,41,45,93,67
7,deberta_m1_c1,67,63,64,74,79,62,72,73,87,...,37,68,72,76,76,51,80,39,78,62
8,deberta_m1_c2,96,113,89,99,103,90,87,99,98,...,69,90,107,97,87,68,74,71,119,110
9,deberta_m1_c3,67,46,54,70,55,61,61,69,65,...,32,48,73,59,57,40,47,33,110,53


In [6]:
df["importance"] = df.loc[:, "LGBM_0":].max(axis=1)
df.sort_values(by="importance", ascending=False, inplace=True)
df = df[["name", "importance"]]

## Overall

Top 10 most important features

In [7]:
df.head(10)

Unnamed: 0,name,importance
18,deberta_m3_c0,194
0,deberta_m0_c0,175
12,deberta_m2_c0,159
20,deberta_m3_c2,150
2,deberta_m0_c2,131
1,deberta_m0_c1,130
86,sentence_word_count_sum,129
8,deberta_m1_c2,119
14,deberta_m2_c2,116
36,deberta_m6_c0,116


Top 10 least important features

In [8]:
df.tail(10)

Unnamed: 0,name,importance
121,tfidf_12,1
175,tfidf_66,1
102,sentence_error_count_len_goe_1,0
101,sentence_error_count_len_l_1,0
63,paragraph_sentence_count_len_goe_1,0
94,sentence_word_count_len_goe_1,0
93,sentence_word_count_len_l_1,0
70,paragraph_error_count_len_l_1,0
71,paragraph_error_count_len_goe_1,0
62,paragraph_sentence_count_len_l_1,0


## Broad Groups

In [9]:
def get_broad_group(name):
    group = name.split("_")[0]

    if group == "tfidf" and name.split("_")[1] == "count":
        return "tfidf_count"

    return group

In [10]:
df["broad_group"] = df["name"].map(get_broad_group)

In [11]:
px.box(
    df,
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

From an high level, it looks like
* `score` probabilities from DeBERTa model has he highest importance.
* `Paragraph` and `sentence` features also seem to be quite important as well.
* `word` and `tfidf_count` seems to be the least important.

In [12]:
def get_count_type(name):
    return "_".join(name.split("_")[1:3])

In [13]:
df["count_type"] = df["name"].map(get_count_type)

## Broad Operations

In [14]:
def get_operation_type(name):
    last_word = name.split("_")[-1]
    if last_word in [
        "sum",
        "max",
        "mean",
        "min",
        "q1",
        "q3",
    ]:
        return last_word

    if name.find("len_goe") != -1:
        return "greater_than"
    elif name.find("len_l") != -1:
        return "less_than"
    return "None"

In [15]:
df["operation_type"] = df["name"].map(get_operation_type)

In [16]:
px.box(
    df,
    x="importance",
    color="operation_type",
    range_x=(df.importance.min(), df.importance.max()),
)

Broadly, `min`, `kurtosis`, `q1`, `first` are worthless, regardless of broad group.

### Greater Than

In [17]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

In [18]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [19]:
df[(df.operation_type == "greater_than") & (df.importance >= 10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
69,paragraph_sentence_count_len_goe_16,45,paragraph,sentence_count,greater_than
77,paragraph_error_count_len_goe_16,23,paragraph,error_count,greater_than
100,sentence_word_count_len_goe_16,18,sentence,word_count,greater_than
65,paragraph_sentence_count_len_goe_6,13,paragraph,sentence_count,greater_than
67,paragraph_sentence_count_len_goe_11,11,paragraph,sentence_count,greater_than
108,sentence_error_count_len_goe_16,11,sentence,error_count,greater_than


**Verdict**

1. For paragraphs
   1. Count only sentence and error. Upper limit be 15.
2. For sentences
   1. Count only error and word. Upper limit be 20.

### Less Than

In [20]:
px.box(
    df[df.operation_type == "less_than"],
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

In [21]:
px.box(
    df[df.operation_type == "greater_than"],
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [22]:
df[(df.operation_type == "less_than") & (df.importance >= 10)]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
68,paragraph_sentence_count_len_l_16,55,paragraph,sentence_count,less_than
76,paragraph_error_count_len_l_16,34,paragraph,error_count,less_than
99,sentence_word_count_len_l_16,22,sentence,word_count,less_than
64,paragraph_sentence_count_len_l_6,20,paragraph,sentence_count,less_than
66,paragraph_sentence_count_len_l_11,16,paragraph,sentence_count,less_than
107,sentence_error_count_len_l_16,14,sentence,error_count,less_than
72,paragraph_error_count_len_l_6,11,paragraph,error_count,less_than


**Verdict**

1. For paragraphs: Count only sentence and error. Upper limit be 20.
2. For sentences: Count only error and word. Upper limit be 20.

## Paragraph Features

In [23]:
paragraph_features = df.loc[df.broad_group == "paragraph"]
print("Shape of paragraph features:", paragraph_features.shape)

Shape of paragraph features: (36, 5)


In [24]:
px.box(
    paragraph_features,
    x="importance",
    color="count_type",
    range_x=(df.importance.min(), df.importance.max()),
)

In [25]:
paragraph_features[
    (paragraph_features.count_type == "char_count")
    & (paragraph_features.importance > 0)
]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
46,paragraph_char_count_max,90,paragraph,char_count,max
47,paragraph_char_count_sum,75,paragraph,char_count,sum
45,paragraph_char_count_mean,56,paragraph,char_count,mean
57,paragraph_char_count_q3,50,paragraph,char_count,q3
56,paragraph_char_count_q1,21,paragraph,char_count,q1


In [26]:
paragraph_features[
    (paragraph_features.count_type == "word_count")
    & (paragraph_features.importance > 0)
]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
50,paragraph_word_count_sum,108,paragraph,word_count,sum
49,paragraph_word_count_max,66,paragraph,word_count,max
48,paragraph_word_count_mean,47,paragraph,word_count,mean
59,paragraph_word_count_q3,41,paragraph,word_count,q3
58,paragraph_word_count_q1,18,paragraph,word_count,q1


## TFIDF and Count

In [27]:
tfidf_features = df.loc[df.broad_group == "tfidf"]
print("Shape of tfidf features:", tfidf_features.shape)

Shape of tfidf features: (100, 5)


In [28]:
px.box(
    tfidf_features,
    x="importance",
    color="broad_group",
    range_x=(df.importance.min(), df.importance.max()),
)

TFIDF count features are worthless.

In [29]:
tfidf_features[
    (tfidf_features.broad_group == "tfidf") & (tfidf_features.importance > 0)
]

Unnamed: 0,name,importance,broad_group,count_type,operation_type
135,tfidf_26,66,tfidf,26,
109,tfidf_0,58,tfidf,0,
110,tfidf_1,55,tfidf,1,
176,tfidf_67,35,tfidf,67,
116,tfidf_7,23,tfidf,7,
...,...,...,...,...,...
198,tfidf_89,1,tfidf,89,
158,tfidf_49,1,tfidf,49,
172,tfidf_63,1,tfidf,63,
121,tfidf_12,1,tfidf,12,


## DeBERTa Features

In [30]:
deberta_features = df.loc[df.broad_group.isin(["deberta"])]
print("Shape of deberta features:", deberta_features.shape)

Shape of deberta features: (42, 5)


In [31]:
def get_model_num(name):
    return int(name.split("_")[1][1])


deberta_features["model"] = deberta_features.name.map(get_model_num)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [32]:
px.box(
    deberta_features,
    x="importance",
    color="model",
    range_x=(df.importance.min(), df.importance.max()),
)

### Textstat

In [33]:
textstat_features = df.loc[df.broad_group.isin(["textstat"])]
print("Shape of textstat features:", textstat_features.shape)

Shape of textstat features: (1, 5)


In [34]:
textstat_features["function"] = textstat_features.name.map(lambda x: x[9:])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [35]:
textstat_features[textstat_features.importance > 5]

Unnamed: 0,name,importance,broad_group,count_type,operation_type,function
209,textstat_difficult_words,24,textstat,difficult_words,,difficult_words


### Topic Probabilities

In [36]:
topic_features = df.loc[df.broad_group.isin(["topic"])]
print("Shape of topic features:", topic_features.shape)

Shape of topic features: (7, 5)


In [37]:
def get_topic_num(name):
    return int(name.split("_")[1])

topic_features["topic_num"] = topic_features.name.map(get_topic_num)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
px.bar(
    topic_features,
    x="topic_num",
    y="importance",
    text="importance",
    range_y=(df.importance.min(), df.importance.max()),
)