Важность признаков, логистическая регрессия, python, случайный лес, sklearn, разреженная матрица, xgboost

Важность признака — это оценка, присваиваемая признакам модели машинного обучения, которая показывает, насколько «важен» признак для прогноза модели. Она помогает при отборе признаков и позволяет получить очень полезную информацию о наших данных. Мы покажем, как получить её в самых распространённых моделях машинного обучения.

https://predictivehacks.com/feature-importance-in-python/

In [None]:
import pandas as pd
import numpy as np
 
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
 
# Only the Titanic training split is used. Keep the columns the models
# need and discard every row that has a missing value in any of them.
data = (
    pd.read_csv('train.csv')
    .loc[:, ['Sex', 'Age', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Survived']]
    .dropna()
)

In [None]:
# Design matrix: dummy-encode the selected columns (dropping the first level
# of each encoded column), then append Age as the last column.
features = pd.get_dummies(
    data[['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch']],
    drop_first=True,
).assign(Age=data['Age'])

# Fit a logistic regression on the survival label.
model = LogisticRegression(random_state=1)
model.fit(features, data['Survived'])

# Importance here is the magnitude of each fitted coefficient.
feature_importance = pd.DataFrame({
    'feature': features.columns.tolist(),
    'feature_importance': np.abs(model.coef_[0]),
})
feature_importance.sort_values(by='feature_importance', ascending=False)

In [None]:
# Random forest with default settings, trained on the `features` matrix
# built in the logistic-regression cell.
model = RandomForestClassifier()
model = model.fit(features, data['Survived'])

# Pair every column name with the importance score the fitted forest exposes.
feature_importances = pd.DataFrame(
    dict(
        features=features.columns,
        feature_importance=model.feature_importances_,
    )
)
feature_importances.sort_values(by='feature_importance', ascending=False)

In [None]:
# Logistic regression through the statsmodels formula API.
model = smf.logit('Survived~Sex+Age+Embarked+Pclass+SibSp+Parch', data=data)
result = model.fit()

# Rank terms by the magnitude of their fitted coefficients. The estimates
# live in `result.params`; `result.conf_int()[1]` is the upper bound of the
# confidence interval, not the coefficient, so it must not be used here.
feature_importances = (
    result.params
    .to_frame('Coefficients')
    .eval("absolute_coefficients=abs(Coefficients)")
)
# The intercept is not a feature, so it is dropped from the ranking.
feature_importances.sort_values('absolute_coefficients', ascending=False).drop('Intercept')[['absolute_coefficients']]

In [None]:
# Gradient-boosted trees with default settings, trained on the same
# `features` matrix as the previous models.
model = XGBClassifier()
model = model.fit(features, data['Survived'])

# Tabulate the per-column importances and print them, largest first.
feature_importances = pd.DataFrame(
    dict(
        features=features.columns,
        feature_importance=model.feature_importances_,
    )
)
ranked_importances = feature_importances.sort_values(by='feature_importance', ascending=False)
print(ranked_importances)

In [None]:
# Unigram bag-of-words features fed to a logistic regression.
# NOTE(review): `df` is not defined in the visible cells; presumably a
# DataFrame with 'Message' (text) and 'Category' (label) columns that is
# loaded elsewhere -- confirm before running.
v = CountVectorizer(ngram_range=(1, 1))
x = v.fit_transform(df['Message'])


model = LogisticRegression()
model.fit(x, df['Category'])

# Signed coefficients are kept on purpose (no abs()), so head(10) below
# lists the tokens with the largest positive weights.
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement.
feature_importance = pd.DataFrame({'feature': v.get_feature_names_out(), 'feature_importance': model.coef_[0]})
feature_importance.sort_values('feature_importance', ascending=False).head(10)

In [None]:
from sklearn.inspection import permutation_importance
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# SVC is used below but is not imported by any visible cell.
from sklearn.svm import SVC

# NOTE(review): X_train, Y_train, X_test and Y_test are not defined in the
# visible cells; presumably a two-feature train/test split created
# elsewhere -- confirm before running.
svm = SVC(kernel='rbf', random_state=0, gamma=.10, C=1.0)
svm.fit(X_train, Y_train)

# Seed the shuffling so the importances are reproducible between runs,
# in line with the seeded classifier above.
perm_importance = permutation_importance(svm, X_test, Y_test, random_state=0)

feature_names = ['length', 'width']
# NOTE: this rebinds `features`, which earlier cells use for the Titanic
# design matrix; re-run those cells before reusing them.
features = np.array(feature_names)

# argsort is ascending, so the bar with the largest mean importance is
# plotted last.
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")