In [164]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [165]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [166]:
train.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,...,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,...,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,...,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,...,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,...,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [167]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593471 entries, 0 to 593470
Data columns (total 33 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Identifier          

In [168]:
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Accident Date,570337.0,5539.0,2020-03-01,1245.0,,,,,,,
Age at Injury,574026.0,,,,42.11427,14.256432,0.0,31.0,42.0,54.0,117.0
Alternative Dispute Resolution,574026.0,3.0,N,571412.0,,,,,,,
Assembly Date,593471.0,1096.0,2020-03-06,1422.0,,,,,,,
Attorney/Representative,574026.0,2.0,N,392291.0,,,,,,,
Average Weekly Wage,545375.0,,,,491.088321,6092.91812,0.0,0.0,0.0,841.0,2828079.0
Birth Year,544948.0,,,,1886.767604,414.644423,0.0,1965.0,1977.0,1989.0,2018.0
C-2 Date,559466.0,2475.0,2021-05-11,1847.0,,,,,,,
C-3 Date,187245.0,1648.0,2021-04-21,350.0,,,,,,,
Carrier Name,574026.0,2046.0,STATE INSURANCE FUND,111144.0,,,,,,,


In [169]:
for i in train.columns:
    print(i, train[i].value_counts())

Accident Date Accident Date
2020-03-01    1245
2020-12-18    1001
2022-02-07     977
2022-01-05     883
2021-02-18     851
              ... 
2017-05-17       1
2015-05-28       1
2002-08-18       1
2017-09-01       1
1980-05-17       1
Name: count, Length: 5539, dtype: int64
Age at Injury Age at Injury
31.0     14041
30.0     14022
32.0     13994
29.0     13657
51.0     13486
         ...  
115.0        1
5.0          1
104.0        1
113.0        1
114.0        1
Name: count, Length: 108, dtype: int64
Alternative Dispute Resolution Alternative Dispute Resolution
N    571412
Y      2609
U         5
Name: count, dtype: int64
Assembly Date Assembly Date
2020-03-06    1422
2021-05-11    1386
2022-04-01    1256
2022-02-04    1217
2022-08-15    1170
              ... 
2022-07-31      10
2021-12-19      10
2020-08-23      10
2020-07-04      10
2022-05-08      10
Name: count, Length: 1096, dtype: int64
Attorney/Representative Attorney/Representative
N    392291
Y    181735
Name: count, dtype

In [170]:
for i in train.columns:
    print(i, train[i].unique())

Accident Date ['2019-12-30' '2019-08-30' '2019-12-06' ... '2000-05-24' '2007-05-01'
 '1980-05-17']
Age at Injury [ 31.  46.  40.  nan  61.  67.  48.  33.  55.  20.  21.  51.  62.  35.
  54.  32.  34.  38.  30.  36.  39.  44.  56.  29.  60.  49.  50.  18.
  58.  24.  45.  53.  52.  47.  25.  23.  22.  64.  70.  41.  69.  57.
  27.  42.  77.  72.  19.  28.  63.  65.  59.  43.  26.  74.  66.  76.
  68.  37.   0.  16.  71.  82.  75.  17.  78.  73.  89.  85.  84.  81.
  87.  80.  11.  88.  79.  15.  83.  86.  90.   1.  10. 102.  94. 117.
  14. 113.  95.  92. 104.   5.   8. 115. 110.  91.  96.  99.  12.  13.
  93. 101.   7. 111.   9.  97. 112. 109. 100.  98. 114.]
Alternative Dispute Resolution ['N' nan 'Y' 'U']
Assembly Date ['2020-01-01' '2020-01-02' '2020-01-03' ... '2022-12-29' '2022-12-30'
 '2022-12-31']
Attorney/Representative ['N' 'Y' nan]
Average Weekly Wage [   0.   1745.93 1434.8  ...  371.02 2277.28 1008.86]
Birth Year [1988. 1973. 1979.   nan 1958. 1952. 1971. 1986. 1964.    0. 1

In [171]:
train.set_index('Claim Identifier', inplace=True)

In [172]:
train.drop_duplicates(inplace=True)

In [173]:
train.drop('OIICS Nature of Injury Description', axis=1, inplace=True)

In [174]:
train_num = train.select_dtypes(include=np.number).columns.tolist()
train_cat = train.select_dtypes(exclude=np.number).columns.tolist()

In [175]:
# Numerical columns: Impute with mean
num_imputer = SimpleImputer(strategy="mean")
train[train_num] = pd.DataFrame(
    num_imputer.fit_transform(train[train_num]),
    columns=train_num,
    index=train.index
)

# Categorical columns: Impute with most frequent (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
train[train_cat] = pd.DataFrame(
    cat_imputer.fit_transform(train[train_cat]),
    columns=train_cat,
    index=train.index
)

In [176]:
X = train.drop('Claim Injury Type', axis=1)
y = train['Claim Injury Type']

In [177]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y, 
                                                  shuffle = True)

In [178]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 402584 entries, 6009728 to 5787010
Data columns (total 30 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Accident Date                      402584 non-null  object 
 1   Age at Injury                      402584 non-null  float64
 2   Alternative Dispute Resolution     402584 non-null  object 
 3   Assembly Date                      402584 non-null  object 
 4   Attorney/Representative            402584 non-null  object 
 5   Average Weekly Wage                402584 non-null  float64
 6   Birth Year                         402584 non-null  float64
 7   C-2 Date                           402584 non-null  object 
 8   C-3 Date                           402584 non-null  object 
 9   Carrier Name                       402584 non-null  object 
 10  Carrier Type                       402584 non-null  object 
 11  County of Injury                   40

In [179]:
X_train_num = X_train.select_dtypes(include=np.number).set_index(X_train.index)
X_train_cat = X_train.select_dtypes(exclude=np.number).set_index(X_train.index)

X_val_num = X_val.select_dtypes(include=np.number).set_index(X_val.index)
X_val_cat = X_val.select_dtypes(exclude=np.number).set_index(X_val.index)

In [180]:
scaler = MinMaxScaler()
scaler.fit(X_train_num) #fit to training data
X_train_num_scaled = scaler.transform(X_train_num) # this will return an array
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns).set_index(X_train.index) # Convert the array to a pandas dataframe

In [181]:
X_val_num_scaled = scaler.transform(X_val_num)
X_val_num_scaled = pd.DataFrame(X_val_num_scaled, columns = X_val_num.columns).set_index(X_val.index)

In [182]:
label_encoders = {}
for col in X_train_cat.columns:
    le = LabelEncoder()
    combined_data = pd.concat([X_train_cat[col], X_val_cat[col]], axis=0).astype(str)
    le.fit(combined_data)
    X_train_cat[col] = le.transform(X_train_cat[col].astype(str))
    X_val_cat[col] = le.transform(X_val_cat[col].astype(str))
    label_encoders[col] = le

In [183]:
ordinal_encoder = OrdinalEncoder()
y_train_encoded = ordinal_encoder.fit_transform(y_train.values.reshape(-1, 1))
y_val_encoded = ordinal_encoder.transform(y_val.values.reshape(-1, 1))

In [184]:
""" X_train_num_scaled.var() """

' X_train_num_scaled.var() '

In [185]:
""" def cor_heatmap(cor):
    plt.figure(figsize=(12,10))
    sns.heatmap(data = cor, annot = True, cmap = plt.cm.Reds, fmt='.1')
    plt.show() """

" def cor_heatmap(cor):\n    plt.figure(figsize=(12,10))\n    sns.heatmap(data = cor, annot = True, cmap = plt.cm.Reds, fmt='.1')\n    plt.show() "

In [186]:
""" cor_spearman = X_train_num_scaled.corr(method ='spearman')
cor_heatmap(cor_spearman) """

" cor_spearman = X_train_num_scaled.corr(method ='spearman')\ncor_heatmap(cor_spearman) "

In [187]:
""" def TestIndependence(X,y,var,alpha=0.05):        
    dfObserved = pd.crosstab(y,X) 
    chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
    dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)
    if p<alpha:
        result="{0} is IMPORTANT for Prediction".format(var)
    else:
        result="{0} is NOT an important predictor. (Discard {0} from model)".format(var)
    print(result) """

' def TestIndependence(X,y,var,alpha=0.05):        \n    dfObserved = pd.crosstab(y,X) \n    chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)\n    dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)\n    if p<alpha:\n        result="{0} is IMPORTANT for Prediction".format(var)\n    else:\n        result="{0} is NOT an important predictor. (Discard {0} from model)".format(var)\n    print(result) '

In [188]:
""" for var in X_train_cat:
    TestIndependence(X_train_cat[var],y_train, var) """

' for var in X_train_cat:\n    TestIndependence(X_train_cat[var],y_train, var) '

In [189]:
""" # first join all the training data
all_train = X_train.join(y_train)


def bar_charts_categorical(df, feature, target):
    cont_tab = pd.crosstab(df[feature], df[target], margins = True)
    categories = cont_tab.index[:-1]
        
    fig = plt.figure(figsize=(15, 5))
    
    plt.subplot(121)
    p1 = plt.bar(categories, cont_tab.iloc[:-1, 0].values, 0.55, color="gray")
    p2 = plt.bar(categories, cont_tab.iloc[:-1, 1].values, 0.55, bottom=cont_tab.iloc[:-1, 0], color="yellowgreen")
    plt.legend((p2[0], p1[0]), ('$y_i=1$', '$y_i=0$'))
    plt.title("Frequency bar chart")
    plt.xlabel(feature)
    plt.ylabel("$Frequency$")

    # auxiliary data for 122
    obs_pct = np.array([np.divide(cont_tab.iloc[:-1, 0].values, cont_tab.iloc[:-1, 2].values), 
                        np.divide(cont_tab.iloc[:-1, 1].values, cont_tab.iloc[:-1, 2].values)])
      
    plt.subplot(122)
    p1 = plt.bar(categories, obs_pct[0], 0.55, color="gray")
    p2 = plt.bar(categories, obs_pct[1], 0.55, bottom=obs_pct[0], color="yellowgreen")
    plt.legend((p2[0], p1[0]), ('$y_i=1$', '$y_i=0$'))
    plt.title("Proportion bar chart")
    plt.xlabel(feature)
    plt.ylabel("$p$")

    plt.show()
    
bar_charts_categorical(all_train, "Sex", "Survived") """

' # first join all the training data\nall_train = X_train.join(y_train)\n\n\ndef bar_charts_categorical(df, feature, target):\n    cont_tab = pd.crosstab(df[feature], df[target], margins = True)\n    categories = cont_tab.index[:-1]\n        \n    fig = plt.figure(figsize=(15, 5))\n    \n    plt.subplot(121)\n    p1 = plt.bar(categories, cont_tab.iloc[:-1, 0].values, 0.55, color="gray")\n    p2 = plt.bar(categories, cont_tab.iloc[:-1, 1].values, 0.55, bottom=cont_tab.iloc[:-1, 0], color="yellowgreen")\n    plt.legend((p2[0], p1[0]), (\'$y_i=1$\', \'$y_i=0$\'))\n    plt.title("Frequency bar chart")\n    plt.xlabel(feature)\n    plt.ylabel("$Frequency$")\n\n    # auxiliary data for 122\n    obs_pct = np.array([np.divide(cont_tab.iloc[:-1, 0].values, cont_tab.iloc[:-1, 2].values), \n                        np.divide(cont_tab.iloc[:-1, 1].values, cont_tab.iloc[:-1, 2].values)])\n      \n    plt.subplot(122)\n    p1 = plt.bar(categories, obs_pct[0], 0.55, color="gray")\n    p2 = plt.bar(cat

In [190]:
""" model = LogisticRegression() """

' model = LogisticRegression() '

In [191]:
num_features = X_train_num_scaled.shape[1] + 1

In [192]:
""" #no of features
nof_list=np.arange(1,num_features)            
high_score=0
#Variable to store the optimum features
nof=0           
train_score_list =[]
val_score_list = []

for n in range(len(nof_list)):
    model = LogisticRegression()
    
    rfe = RFE(estimator = model,n_features_to_select = nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train_num_scaled,y_train)
    X_val_rfe = rfe.transform(X_val_num_scaled)
    model.fit(X_train_rfe,y_train)
    
    #storing results on training data
    train_score = model.score(X_train_rfe,y_train)
    train_score_list.append(train_score)
    
    #storing results on training data
    val_score = model.score(X_val_rfe,y_val)
    val_score_list.append(val_score)
    
    #check best score
    if(val_score >= high_score):
        high_score = val_score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score)) """

' #no of features\nnof_list=np.arange(1,num_features)            \nhigh_score=0\n#Variable to store the optimum features\nnof=0           \ntrain_score_list =[]\nval_score_list = []\n\nfor n in range(len(nof_list)):\n    model = LogisticRegression()\n    \n    rfe = RFE(estimator = model,n_features_to_select = nof_list[n])\n    X_train_rfe = rfe.fit_transform(X_train_num_scaled,y_train)\n    X_val_rfe = rfe.transform(X_val_num_scaled)\n    model.fit(X_train_rfe,y_train)\n    \n    #storing results on training data\n    train_score = model.score(X_train_rfe,y_train)\n    train_score_list.append(train_score)\n    \n    #storing results on training data\n    val_score = model.score(X_val_rfe,y_val)\n    val_score_list.append(val_score)\n    \n    #check best score\n    if(val_score >= high_score):\n        high_score = val_score\n        nof = nof_list[n]\nprint("Optimum number of features: %d" %nof)\nprint("Score with %d features: %f" % (nof, high_score)) '

In [193]:
""" plt.plot(list(range(1,num_features)), train_score_list, label="Score on Training Set", color='yellowgreen')
plt.plot(list(range(1,num_features)), val_score_list, label="Score on Test Set", color='dimgray')
plt.xlabel("Maximum Depth")
plt.ylabel("Score")
plt.legend()
plt.show() """

' plt.plot(list(range(1,num_features)), train_score_list, label="Score on Training Set", color=\'yellowgreen\')\nplt.plot(list(range(1,num_features)), val_score_list, label="Score on Test Set", color=\'dimgray\')\nplt.xlabel("Maximum Depth")\nplt.ylabel("Score")\nplt.legend()\nplt.show() '

In [194]:
""" rfe = RFE(estimator = model, n_features_to_select = 8) """

' rfe = RFE(estimator = model, n_features_to_select = 8) '

In [195]:
""" X_rfe = rfe.fit_transform(X = X_train_num_scaled, y = y_train) """

' X_rfe = rfe.fit_transform(X = X_train_num_scaled, y = y_train) '

In [196]:
""" X_train_num_scaled.columns """

' X_train_num_scaled.columns '

In [197]:
""" rfe.support_ """

' rfe.support_ '

In [198]:
""" rfe.ranking_ """

' rfe.ranking_ '

In [199]:
""" selected_features = pd.Series(rfe.support_, index = X_train_num_scaled.columns)
selected_features """

' selected_features = pd.Series(rfe.support_, index = X_train_num_scaled.columns)\nselected_features '

In [200]:
""" def plot_importance(coef,name):
    imp_coef = coef.sort_values()
    plt.figure(figsize=(8,10))
    imp_coef.plot(kind = "barh")
    plt.title("Feature importance using " + name + " Model")
    plt.show() """

' def plot_importance(coef,name):\n    imp_coef = coef.sort_values()\n    plt.figure(figsize=(8,10))\n    imp_coef.plot(kind = "barh")\n    plt.title("Feature importance using " + name + " Model")\n    plt.show() '

In [201]:
""" reg = LassoCV() """

' reg = LassoCV() '

In [202]:
""" reg.fit(X_train_num_scaled, y_train_encoded) """

' reg.fit(X_train_num_scaled, y_train_encoded) '

In [203]:
""" coef = pd.Series(reg.coef_, index = X_train_num_scaled.columns)
coef """

' coef = pd.Series(reg.coef_, index = X_train_num_scaled.columns)\ncoef '

In [204]:
""" print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables") """

' print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables") '

In [205]:
""" coef.sort_values() """

' coef.sort_values() '

In [206]:
""" plot_importance(coef,'Lasso') """

" plot_importance(coef,'Lasso') "

In [207]:
var = ['IME-4 Count', 'Average Weekly Wage']
corr = ['Age at Injury'] #'Birth Year'
ind = ['WCB Decision']
log = ['WCIO Cause of Injury Code', 'Number of Dependents']
lasso = ['IME-4 Count', 'Number of Dependents']

In [208]:
from itertools import combinations

# Lista de colunas para testar
to_drop = ['IME-4 Count', 'Average Weekly Wage', 'Birth Year', 
           'Age at Injury', 'Number of Dependents', 'WCB Decision', 
           'WCIO Cause of Injury Code']

# Gerar todas as combinações (de 1 até o tamanho total da lista)
all_combinations = []
for r in range(1, len(to_drop) + 1):
    all_combinations.extend(combinations(to_drop, r))

In [209]:
X_train_combined = pd.concat([X_train_num_scaled, X_train_cat], axis=1)
X_val_combined = pd.concat([X_val_num_scaled, X_val_cat], axis=1)

In [210]:
# Variáveis para armazenar os resultados
results = []

# Iterar por todas as combinações
for cols_to_drop in all_combinations:
    # Criar datasets ajustados
    cols_to_drop = list(cols_to_drop)
    X_train_test = X_train_combined.drop(columns=cols_to_drop, errors='ignore')
    X_val_test = X_val_combined.drop(columns=cols_to_drop, errors='ignore')
    
    # Treinar o modelo
    dt = DecisionTreeClassifier().fit(X_train_test, y_train_encoded)
    predictions_train = dt.predict(X_train_test)
    predictions_val = dt.predict(X_val_test)
    
    # Avaliar o modelo
    score = f1_score(y_val_encoded, predictions_val, average='macro')
    
    # Armazenar os resultados
    results.append({
        'cols_to_drop': cols_to_drop,
        'score': score
    })


In [211]:
# Ordenar os resultados pela acurácia
sorted_results = sorted(results, key=lambda x: x['score'], reverse=True)

# Melhor combinação
best_combination = sorted_results[0]
print(f"Melhor combinação de colunas a remover: {best_combination['cols_to_drop']}")
print(f"F1_score correspondente: {best_combination['score']}")

Melhor combinação de colunas a remover: ['Birth Year', 'Age at Injury', 'Number of Dependents', 'WCIO Cause of Injury Code']
F1_score correspondente: 0.3895483294748636


In [212]:
# Criar um DataFrame com os resultados
results_df = pd.DataFrame(results)

# Exibir as combinações ordenadas por acurácia
print(results_df.sort_values(by='score', ascending=False))

                                          cols_to_drop     score
94   [Birth Year, Age at Injury, Number of Dependen...  0.389548
118  [Birth Year, Age at Injury, Number of Dependen...  0.389159
55   [Birth Year, Age at Injury, WCIO Cause of Inju...  0.387450
96   [Birth Year, Number of Dependents, WCB Decisio...  0.387342
57   [Birth Year, Number of Dependents, WCIO Cause ...  0.387078
..                                                 ...       ...
106  [IME-4 Count, Average Weekly Wage, Age at Inju...  0.312162
105  [IME-4 Count, Average Weekly Wage, Age at Inju...  0.311841
126  [IME-4 Count, Average Weekly Wage, Birth Year,...  0.311494
69   [IME-4 Count, Average Weekly Wage, Age at Inju...  0.311277
123  [IME-4 Count, Average Weekly Wage, Age at Inju...  0.308974

[127 rows x 2 columns]


In [213]:
""" dt = DecisionTreeClassifier().fit(X_train_combined, y_train_encoded)
predictions_train = dt.predict(X_train_combined)
predictions_val = dt.predict(X_val_combined) """

' dt = DecisionTreeClassifier().fit(X_train_combined, y_train_encoded)\npredictions_train = dt.predict(X_train_combined)\npredictions_val = dt.predict(X_val_combined) '

In [214]:
""" print(dt.score(X_train_combined, y_train_encoded))
print(confusion_matrix(y_train_encoded, predictions_train))
print(f1_score(y_train_encoded, predictions_train, average='macro')) """

" print(dt.score(X_train_combined, y_train_encoded))\nprint(confusion_matrix(y_train_encoded, predictions_train))\nprint(f1_score(y_train_encoded, predictions_train, average='macro')) "

In [215]:
""" print(dt.score(X_val_combined, y_val_encoded))
print(confusion_matrix(y_val_encoded, predictions_val))
print(f1_score(y_val_encoded, predictions_val, average='macro')) """

" print(dt.score(X_val_combined, y_val_encoded))\nprint(confusion_matrix(y_val_encoded, predictions_val))\nprint(f1_score(y_val_encoded, predictions_val, average='macro')) "

In [216]:
X_train_log = X_train_combined.drop(columns=log)
X_val_log = X_val_combined.drop(columns=log)

In [217]:
X_train_lasso = X_train_combined.drop(columns=lasso)
X_val_lasso = X_val_combined.drop(columns=lasso)

In [218]:
log = LogisticRegression().fit(X_train_log, y_train_encoded)
predictions_train_log = log.predict(X_train_log)
predictions_val_log = log.predict(X_val_log)

In [219]:
lasso = LassoCV().fit(X_train_lasso, y_train_encoded)
predictions_train_lasso = lasso.predict(X_train_lasso)
predictions_val_lasso = lasso.predict(X_val_lasso)

In [220]:
print(log.score(X_train_log, y_train_encoded))
print(confusion_matrix(y_train_encoded, predictions_train_log))
print(f1_score(y_train_encoded, predictions_train_log, average='macro'))

print("---------------------")

print(log.score(X_val_log, y_val_encoded))
print(confusion_matrix(y_val_encoded, predictions_val_log))
print(f1_score(y_val_encoded, predictions_val_log, average='macro'))

0.5129637541482026
[[    13   7604     49    839    228      0      0      0]
 [     7 196834    126   7038    516      0      0      0]
 [     0  44048     18   4102     66      0      0      0]
 [    28  93341    269   9632    685      0      0      0]
 [     0  30640      1   3141     14      0      0      0]
 [     0   2778      0    164      6      0      0      0]
 [     0     63      0      4      1      0      0      0]
 [     0    215      0    114      0      0      0      0]]
0.10406570599619125
---------------------
0.5131247210743203
[[   10  3247    19   356   111     0     0     0]
 [    4 84381    60  3004   204     0     0     0]
 [    0 18870    12  1761    29     0     0     0]
 [    7 40012   117  4116   300     0     0     0]
 [    0 13215     0  1255    14     0     0     0]
 [    0  1162     0   101     0     0     0     0]
 [    0    24     0     5     0     0     0     0]
 [    0    98     0    43     0     0     0     0]]
0.10451771605496826


In [221]:
print(log.score(X_val_log, y_val_encoded))
print(confusion_matrix(y_val_encoded, predictions_val_log))
print(f1_score(y_val_encoded, predictions_val_log, average='macro'))

print("---------------------")

print(lasso.score(X_train_lasso, y_train_encoded))
print(confusion_matrix(y_train_encoded, predictions_train_lasso))
print(f1_score(y_train_encoded, predictions_train_lasso, average='macro'))

0.5131247210743203
[[   10  3247    19   356   111     0     0     0]
 [    4 84381    60  3004   204     0     0     0]
 [    0 18870    12  1761    29     0     0     0]
 [    7 40012   117  4116   300     0     0     0]
 [    0 13215     0  1255    14     0     0     0]
 [    0  1162     0   101     0     0     0     0]
 [    0    24     0     5     0     0     0     0]
 [    0    98     0    43     0     0     0     0]]
0.10451771605496826
---------------------
0.34299485929064666


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

# Melhor combinação de colunas a remover: ['Age at Injury', 'Number of Dependents', 'WCB Decision', 'WCIO Cause of Injury Code']
# F1_score correspondente: 0.3916966060030664