In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import zipfile
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
%matplotlib inline

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from imblearn.metrics import classification_report_imbalanced

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#import lightgbm as lgb

from sklearn.metrics import (precision_score, recall_score, roc_auc_score, accuracy_score,
                             confusion_matrix, precision_recall_curve, roc_curve, brier_score_loss)

# from sklearn.externals import joblib

In [2]:

import lightgbm as lgb


In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the CSV path used
# by the next cell resolves.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset from the mounted Drive folder; the feature columns and the
# `label` target used for modelling are selected from this frame further down.
users_emotion_damf_tox_10 = pd.read_csv('/content/drive/MyDrive/NLP HW/Project/users_emotion_damf_tox_10.csv')

In [None]:
# Show the loaded frame using the notebook's rich DataFrame display.
users_emotion_damf_tox_10

In [None]:
# Number of rows for each distinct value of the `category` column.
users_emotion_damf_tox_10.category.value_counts()

In [None]:
def RF_pred(X, y, random_state=None):
    """Tune a random forest, report per-fold metrics, and return the fitted model.

    A ``RandomizedSearchCV`` (3-fold, ROC-AUC scoring) picks ``n_estimators``,
    ``max_depth`` and ``max_features``. A forest with the winning settings is
    then evaluated on 3 shuffled folds, printing train/test accuracy,
    precision, recall and AUC for each fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame, pandas.Series or array-like
        Binary target aligned with ``X``. A single-column frame is flattened
        to 1-D before fitting.
    random_state : int or None, default None
        Seed passed to the search, the forest and the fold shuffling. ``None``
        (the default) leaves every component unseeded, as before.

    Returns
    -------
    RandomForestClassifier
        The tuned forest, refitted on all of ``X``/``y`` after the fold report.
    """
    # scikit-learn expects a 1-D label vector; flattening also lets callers
    # pass either a Series or a single-column DataFrame.
    y_flat = np.ravel(y)

    # 'auto' is deliberately not offered for max_features: for classifiers it
    # was an alias of 'sqrt', and recent scikit-learn releases reject it.
    RSC = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_distributions={
            'n_estimators': range(1, 200, 10),
            'max_depth': range(1, 100, 10),
            'max_features': ['sqrt', 'log2']},
        cv=3, scoring='roc_auc', n_jobs=-1, random_state=random_state)
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y_flat)
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build the model with the optimized hyperparameters
    model_RF = RandomForestClassifier(random_state=random_state, **search_result.best_params_)

    # Evaluate on 3 shuffled folds
    kf = KFold(n_splits=3, shuffle=True, random_state=random_state)
    for i, (train, test) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y_flat[train], y_flat[test]
        model_RF.fit(X_train, y_train)

        train_pred = model_RF.predict(X_train)
        y_pred = model_RF.predict(X_test)
        # ROC-AUC is a ranking metric: score it with the positive-class
        # probability, not with the thresholded labels.
        train_prob = model_RF.predict_proba(X_train)[:, 1]
        test_prob = model_RF.predict_proba(X_test)[:, 1]

        train_accuracy = accuracy_score(y_train, train_pred)
        train_precision = precision_score(y_train, train_pred)
        train_recall = recall_score(y_train, train_pred)
        train_auc = roc_auc_score(y_train, train_prob)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_auc = roc_auc_score(y_test, test_prob)

        print('Fold '+ str(i), ':  Training accuracy: ', train_accuracy, 'Testing accuracy: ', test_accuracy)
        print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
        print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing recall: ', test_recall)
        print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    # The loop leaves the model fitted on the last fold's training subset only;
    # refit on everything so the returned model has seen all supplied data.
    model_RF.fit(X, y_flat)
    return model_RF

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def RF_pred_gpu(X, y, random_state=None):
    """Tune a random forest, report 5-fold metrics, and return the fitted model.

    The name is kept for existing callers. scikit-learn's
    ``RandomForestClassifier`` trains on the CPU, so the earlier copy of the
    data to CUDA tensors and back has been dropped: it required a GPU to be
    present, converted the labels to float32, and had no effect on training.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame, pandas.Series or array-like
        Binary target aligned with ``X``. A single-column frame is flattened
        to 1-D before fitting.
    random_state : int or None, default None
        Seed passed to the search, the forest and the fold shuffling. ``None``
        (the default) leaves every component unseeded, as before.

    Returns
    -------
    RandomForestClassifier
        The tuned forest, refitted on all of ``X``/``y`` after the fold report.
    """
    # scikit-learn expects a 1-D label vector.
    y_flat = np.ravel(y)

    # 'auto' is deliberately not offered for max_features: for classifiers it
    # was an alias of 'sqrt', and recent scikit-learn releases reject it.
    RSC = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_distributions={
            'n_estimators': list(range(1, 200, 10)),
            'max_depth': list(range(1, 100, 10)),
            'max_features': ['sqrt', 'log2']
        },
        cv=3, scoring='roc_auc', n_jobs=-1, random_state=random_state)

    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y_flat)
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build the model with the optimized hyperparameters
    model_RF = RandomForestClassifier(random_state=random_state, **search_result.best_params_)

    # Evaluate on 5 shuffled folds
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    for i, (train, test) in enumerate(kf.split(X), start=1):
        # Fitting on DataFrame slices keeps the feature names, so later
        # predictions on a DataFrame do not trigger a feature-name warning.
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y_flat[train], y_flat[test]
        model_RF.fit(X_train, y_train)

        train_pred = model_RF.predict(X_train)
        y_pred = model_RF.predict(X_test)
        # ROC-AUC is a ranking metric: score it with the positive-class
        # probability, not with the thresholded labels.
        train_prob = model_RF.predict_proba(X_train)[:, 1]
        test_prob = model_RF.predict_proba(X_test)[:, 1]

        train_accuracy = accuracy_score(y_train, train_pred)
        train_precision = precision_score(y_train, train_pred)
        train_recall = recall_score(y_train, train_pred)
        train_auc = roc_auc_score(y_train, train_prob)

        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_auc = roc_auc_score(y_test, test_prob)

        print('Fold ' + str(i), ':  Training accuracy: ', train_accuracy, 'Testing accuracy: ', test_accuracy)
        print('Fold ' + str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
        print('Fold ' + str(i), ':  Training recall: ', train_recall, 'Testing recall: ', test_recall)
        print('Fold ' + str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    # The loop leaves the model fitted on the last fold's training subset only;
    # refit on everything so the returned model has seen all supplied data.
    model_RF.fit(X, y_flat)
    return model_RF

In [None]:
def DT_pred(X, y, random_state=None):
    """Tune a decision tree, report per-fold metrics, and return the fitted model.

    A ``RandomizedSearchCV`` (3-fold, ROC-AUC scoring) picks ``criterion``,
    ``max_depth`` and ``max_features``. A tree with the winning settings is
    then evaluated on 3 shuffled folds, printing train/test accuracy,
    precision, recall and AUC for each fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame, pandas.Series or array-like
        Binary target aligned with ``X``. A single-column frame is flattened
        to 1-D before fitting.
    random_state : int or None, default None
        Seed passed to the search, the tree and the fold shuffling. ``None``
        (the default) leaves every component unseeded, as before.

    Returns
    -------
    DecisionTreeClassifier
        The tuned tree, refitted on all of ``X``/``y`` after the fold report.
    """
    # scikit-learn expects a 1-D label vector; flattening also lets callers
    # pass either a Series or a single-column DataFrame.
    y_flat = np.ravel(y)

    # 'auto' is deliberately not offered for max_features: for classifiers it
    # was an alias of 'sqrt', and recent scikit-learn releases reject it.
    RSC = RandomizedSearchCV(
        estimator=DecisionTreeClassifier(random_state=random_state),
        param_distributions={
            'criterion': ['gini', 'entropy'],
            'max_depth': range(1, 100, 10),
            'max_features': ['sqrt', 'log2']},
        cv=3, scoring='roc_auc', n_jobs=-1, verbose = True, random_state=random_state)
    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y_flat)
    print("Best using: ", search_result.best_params_, "Score: ", search_result.best_score_)

    # Build the model with the optimized hyperparameters
    model_DT = DecisionTreeClassifier(random_state=random_state, **search_result.best_params_)

    # Evaluate on 3 shuffled folds
    kf = KFold(n_splits=3, shuffle=True, random_state=random_state)
    for i, (train, test) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y_flat[train], y_flat[test]
        model_DT.fit(X_train, y_train)

        train_pred = model_DT.predict(X_train)
        y_pred = model_DT.predict(X_test)
        # ROC-AUC is a ranking metric: score it with the positive-class
        # probability, not with the thresholded labels.
        train_prob = model_DT.predict_proba(X_train)[:, 1]
        test_prob = model_DT.predict_proba(X_test)[:, 1]

        train_accuracy = accuracy_score(y_train, train_pred)
        train_precision = precision_score(y_train, train_pred)
        train_recall = recall_score(y_train, train_pred)
        train_auc = roc_auc_score(y_train, train_prob)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_auc = roc_auc_score(y_test, test_prob)

        print('Fold '+ str(i), ':  Training accuracy: ', train_accuracy, 'Testing accuracy: ', test_accuracy)
        print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
        print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing recall: ', test_recall)
        print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    # The loop leaves the model fitted on the last fold's training subset only;
    # refit on everything so the returned model has seen all supplied data.
    model_DT.fit(X, y_flat)
    return model_DT

In [None]:
def LGB_pred(X, y, random_state=None):
    """Tune a LightGBM classifier, report per-fold metrics, and return the fitted model.

    A ``RandomizedSearchCV`` (3-fold, ROC-AUC scoring) samples the LightGBM
    hyper-parameters listed below. A classifier with the winning settings is
    then evaluated on 3 shuffled folds, printing train/test accuracy,
    precision, recall and AUC for each fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame, pandas.Series or array-like
        Binary target aligned with ``X``. A single-column frame is flattened
        to 1-D before fitting.
    random_state : int or None, default None
        Seed passed to the search, the classifier and the fold shuffling.
        ``None`` (the default) leaves every component unseeded, as before.

    Returns
    -------
    lightgbm.LGBMClassifier
        The tuned classifier, refitted on all of ``X``/``y`` after the fold
        report.
    """
    # A 1-D label vector avoids column-vector warnings and lets callers pass
    # either a Series or a single-column DataFrame.
    y_flat = np.ravel(y)

    # NOTE(review): the LightGBM parameter docs say bagging_fraction only takes
    # effect when bagging_freq is non-zero, and bagging_freq is not set here,
    # so that search dimension is presumably inert. Confirm before relying on it.
    RSC = RandomizedSearchCV(
        estimator=lgb.LGBMClassifier(random_state=random_state),
        param_distributions = { 'boosting_type': ['gbdt', 'goss', 'dart'],
                      'num_leaves': range(10, 500, 25),
                      'bagging_fraction': [0.1, 0.3, 0.5, 0.7, 0.9],
                      'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5],
                      'min_data': [200, 300, 400, 500, 600],
                      'max_bin': [3, 5, 10, 12, 15, 18, 20, 22],
                      'lambda_l1': [1, 10, 20, 30, 40],
                      'feature_fraction': [0.5, 0.7, 0.8, 0.9],
                      'max_depth': range(1, 50, 10)},
        cv=3, scoring='roc_auc', n_jobs=-1, random_state=random_state)

    # Fit RandomizedSearchCV to find best hyperparameters
    search_result = RSC.fit(X, y_flat)
    print("Best: using", search_result.best_score_, search_result.best_params_)

    # Build the model with the optimized hyperparameters. best_params_ holds
    # exactly the keys sampled above, so it can be unpacked directly.
    model_LGB = lgb.LGBMClassifier(random_state=random_state, **search_result.best_params_)

    # Evaluate on 3 shuffled folds
    kf = KFold(n_splits=3, shuffle=True, random_state=random_state)
    for i, (train, test) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y_flat[train], y_flat[test]
        model_LGB.fit(X_train, y_train)

        train_pred = model_LGB.predict(X_train)
        y_pred = model_LGB.predict(X_test)
        # ROC-AUC is a ranking metric: score it with the positive-class
        # probability, not with the thresholded labels.
        train_prob = model_LGB.predict_proba(X_train)[:, 1]
        test_prob = model_LGB.predict_proba(X_test)[:, 1]

        train_accuracy = accuracy_score(y_train, train_pred)
        train_precision = precision_score(y_train, train_pred)
        train_recall = recall_score(y_train, train_pred)
        train_auc = roc_auc_score(y_train, train_prob)

        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_auc = roc_auc_score(y_test, test_prob)

        print('Fold '+ str(i), ':  Training accuracy: ', train_accuracy, 'Testing accuracy: ', test_accuracy)
        print('Fold '+ str(i), ':  Training precision: ', train_precision, 'Testing precision: ', test_precision)
        print('Fold '+ str(i), ':  Training recall: ', train_recall, 'Testing recall: ', test_recall)
        print('Fold '+ str(i), ':  Training auc: ', train_auc, 'Testing auc: ', test_auc)

    # The loop leaves the model fitted on the last fold's training subset only;
    # refit on everything so the returned model has seen all supplied data.
    model_LGB.fit(X, y_flat)
    return model_LGB

In [None]:
# List the column labels of the loaded frame.
users_emotion_damf_tox_10.columns

In [None]:
# Feature matrix: the 27 score columns fed to every classifier below.
X = users_emotion_damf_tox_10.loc[:, [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust', 'None',
    'Profanity', 'Identity Attack', 'Insult', 'Threat', 'Toxic',
    'care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal',
    'authority', 'subversion', 'purity', 'degradation',
]]

# An earlier variant (on `users_emotion_damf_tox`) used only the first 17 of
# these columns, 'anger' through 'Toxic'.

# Target kept as a single-column DataFrame (double brackets); later cells
# index it with `.iloc[rows, :]` and read `y.label`.
y = users_emotion_damf_tox_10.loc[:, ['label']]

In [None]:
# sm = SMOTE(random_state=4)
# X_smote, y_smote = SMOTE().fit_sample(X, y)

# print("Original data distribution: ")
# print(y.label.value_counts())
# print("SMOTE data distribution: ")
# print(y_smote.label.value_counts())

In [None]:
# data = X_smote
# data['label'] = y_smote
# data

## Building Models

In [None]:
def plot_performance(y_test, y_pred, y_pred_prob):
    """Plot the ROC curve and the precision-recall curve side by side.

    Both curves and the AUC shown in the legend are computed from
    ``y_pred_prob``. Thresholded labels carry a single operating point, which
    collapses the precision-recall curve to three points and makes the
    reported AUC disagree with the plotted ROC curve.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Hard class predictions. Not used for the curves; kept so existing
        three-argument calls continue to work.
    y_pred_prob : array-like
        Score for the positive class (e.g. ``predict_proba(X)[:, 1]``).
    """
    test_fpr, test_tpr, _ = roc_curve(y_test, y_pred_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
    auc_value = roc_auc_score(y_test, y_pred_prob)

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(10, 5))

    # ROC Curve
    ax_roc.plot(test_fpr, test_tpr, label="ROC (area = %0.4f)" % auc_value, color="blue", lw=2)
    ax_roc.plot([0, 1], [0, 1], "k--")  # chance diagonal
    ax_roc.legend(loc="lower right")
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("ROC Curve")

    # Precision Recall Curve
    ax_pr.plot(recall, precision, marker='.', color="blue", lw=2)
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title("Precision Recall Curve")

    fig.tight_layout()
    plt.show()

In [None]:
def plot_confusion(y_test, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as an annotated image.

    Rows are true labels and columns are predicted labels; each cell shows its
    integer count, in white text on cells above half of the largest count.
    Also sets NumPy's global print precision to 2, as the original cell did.
    """
    counts = confusion_matrix(y_test, y_pred)

    # Square axes so the cells render as squares
    figure = plt.figure(figsize=(10,10))
    axes = figure.add_subplot(1,1,1, adjustable='box', aspect=1)
    image = axes.imshow(counts, interpolation='nearest', cmap=plt.cm.Blues)
    figure.colorbar(image, ax=axes)

    # Switch the annotation colour where the background gets dark
    cutoff = counts.max() / 2.
    n_rows, n_cols = counts.shape[0], counts.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            text_colour = "white" if counts[row, col] > cutoff else "black"
            axes.text(col, row, format(counts[row, col], 'd'),
                      horizontalalignment="center",
                      color=text_colour)

    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')

    np.set_printoptions(precision=2)
    plt.show()


In [None]:
def results(y_test, y_pred, y_pred_prob):
    """Return ROC and precision-recall curve points plus ROC-AUC for one model.

    All three are computed from ``y_pred_prob``. Using the thresholded labels
    for the precision-recall curve and the AUC (as before) reduces the curve
    to a single operating point and yields an AUC that does not match the ROC
    curve returned alongside it.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Hard class predictions. Unused; kept so existing calls keep working.
    y_pred_prob : array-like
        Score for the positive class (e.g. ``predict_proba(X)[:, 1]``).

    Returns
    -------
    tuple
        ``(fpr, tpr, precision, recall, roc_auc)``.
    """
    test_fpr, test_tpr, _ = roc_curve(y_test, y_pred_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
    roc = roc_auc_score(y_test, y_pred_prob)
    return test_fpr, test_tpr, precision, recall, roc

## Unbalanced

In [None]:
# Hold out 30% of the rows for testing; random_state=2 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.30)

In [None]:
# Decision Tree: tune and fit on the training split, then score the held-out split
model_DT = DT_pred(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob = model_DT.predict_proba(X_test)[:, 1]
y_pred = model_DT.predict(X_test)

# Keep the curve points and AUC for the cross-model comparison plots
(dt_fpr, dt_tpr, dt_prec, dt_rec, dt_roc) = results(y_test, y_pred, y_pred_prob)

plot_performance(y_test, y_pred, y_pred_prob)
plot_confusion(y_test, y_pred)

In [None]:
# Random Forest: tune and fit on the training split, then score the held-out split
model_RF = RF_pred_gpu(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob = model_RF.predict_proba(X_test)[:, 1]
y_pred = model_RF.predict(X_test)

# Keep the curve points and AUC for the cross-model comparison plots
(rf_fpr, rf_tpr, rf_prec, rf_rec, rf_roc) = results(y_test, y_pred, y_pred_prob)

plot_performance(y_test, y_pred, y_pred_prob)
plot_confusion(y_test, y_pred)

In [None]:

# LightGBM: tune and fit on the training split, then score the held-out split
model_LGB = LGB_pred(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob = model_LGB.predict_proba(X_test)[:, 1]
y_pred = model_LGB.predict(X_test)

# Keep the curve points and AUC for the cross-model comparison plots
(lgb_fpr, lgb_tpr, lgb_prec, lgb_rec, lgb_roc) = results(y_test, y_pred, y_pred_prob)

plot_performance(y_test, y_pred, y_pred_prob)
plot_confusion(y_test, y_pred)

In [None]:
# One row per classifier: ROC curve points (arrays) and the scalar AUC
dic = dict(
    classifiers=['Decision Tree', 'Random Forest', 'LightGBM'],
    fpr=[dt_fpr, rf_fpr, lgb_fpr],
    tpr=[dt_tpr, rf_tpr, lgb_tpr],
    auc=[dt_roc, rf_roc, lgb_roc],
)
res_df = pd.DataFrame(dic)
res_df

In [None]:
# Index the comparison table by classifier name. The guard makes the cell safe
# to re-run: once 'classifiers' has become the index it is no longer a column,
# and an unconditional set_index would raise KeyError.
if 'classifiers' in res_df.columns:
    res_df = res_df.set_index('classifiers')
res_df

In [None]:
fig = plt.figure(figsize=(8,6))

# One ROC curve per classifier, labelled with the AUC stored in the table
for clf_name, clf_row in res_df.iterrows():
    plt.plot(clf_row['fpr'],
             clf_row['tpr'],
             label="{}, AUC={:.3f}".format(clf_name, clf_row['auc']))

# Chance diagonal for reference
plt.plot([0,1], [0,1], color='orange', linestyle='--')

# Same 0.0-1.0 tick grid on both axes
tick_positions = np.arange(0.0, 1.1, step=0.1)
plt.xticks(tick_positions)
plt.yticks(tick_positions)
plt.xlabel("False Positive Rate", fontsize=15)
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

In [None]:
# Inspect the precision and recall arrays that results() returned for LightGBM.
print(lgb_prec,lgb_rec)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
ax.plot( dt_rec,dt_prec, label='Decision Tree')
ax.plot( rf_rec,rf_prec, label='Random Forest')
ax.plot(lgb_rec, lgb_prec, label='LightGBM')

# No-skill baseline = share of positive labels in the test set.
# y_test is a one-column DataFrame, so boolean-masking it (y_test[y_test==1])
# keeps every row and the old len() ratio was always 1.0; flatten and average instead.
baseline = float((np.ravel(y_test) == 1).mean())
ax.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision Recall Curve')
ax.legend(loc='center left');

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict with the LightGBM model explicitly rather than reusing whichever
# `y_pred` the most recently executed cell left behind, so the plot always
# matches its title.
cm = confusion_matrix(y_test, model_LGB.predict(X_test))
plt.figure(figsize=[10,8])
plt.title('Confusion matrix of the LightGBM classifier')
# Cells are integer counts, so annotate them as integers ("d") instead of "12.0"
sns.heatmap(cm,annot=True,fmt="d")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.ioff()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Headline test-set metrics for whichever model produced the current `y_pred`
for line_template, metric_fn in (('Precision: %.3f', precision_score),
                                 ('Recall: %.3f', recall_score),
                                 ('F1-Score: %.3f', f1_score),
                                 ('Accuracy: %.3f', accuracy_score)):
    print(line_template % metric_fn(y_test, y_pred))

## Balanced

In [None]:
# Oversample the minority class with SMOTE.
# - Use the seeded `sm` instance; calling a fresh SMOTE() ignored random_state=4.
# - `fit_resample` is the current imbalanced-learn API; `fit_sample` has been removed.
sm = SMOTE(random_state=4)
X_smote, y_smote = sm.fit_resample(X, y)

print("Original data distribution: ")
print(y.label.value_counts())
print("SMOTE data distribution: ")
print(y_smote.label.value_counts())

In [None]:
# Build the balanced experiment. Previously this split the original X, y,
# so the "Balanced" section simply repeated the unbalanced run.
# Split first, then oversample the training part only: the test split keeps
# the real class distribution and no synthetic neighbour of a test row can
# leak into training.
X_train_raw, X_test_bal, y_train_raw, y_test_bal = train_test_split(X, y, random_state=2, test_size=0.30)
X_train_bal, y_train_bal = SMOTE(random_state=4).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Decision Tree (balanced run): tune and fit on the *_bal training split, then score its test split
model_DT_bal = DT_pred(X_train_bal, y_train_bal)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob_bal = model_DT_bal.predict_proba(X_test_bal)[:, 1]
y_pred_bal = model_DT_bal.predict(X_test_bal)

# Keep the curve points and AUC for the cross-model comparison plots
(dt_fpr_bal, dt_tpr_bal, dt_prec_bal, dt_rec_bal, dt_roc_bal) = results(
    y_test_bal, y_pred_bal, y_pred_prob_bal)

plot_performance(y_test_bal, y_pred_bal, y_pred_prob_bal)
plot_confusion(y_test_bal, y_pred_bal)

In [None]:
# Random Forest (balanced run): tune and fit on the *_bal training split, then score its test split
model_RF_bal = RF_pred(X_train_bal, y_train_bal)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob_bal = model_RF_bal.predict_proba(X_test_bal)[:, 1]
y_pred_bal = model_RF_bal.predict(X_test_bal)

# Keep the curve points and AUC for the cross-model comparison plots
(rf_fpr_bal, rf_tpr_bal, rf_prec_bal, rf_rec_bal, rf_roc_bal) = results(
    y_test_bal, y_pred_bal, y_pred_prob_bal)

plot_performance(y_test_bal, y_pred_bal, y_pred_prob_bal)
plot_confusion(y_test_bal, y_pred_bal)

In [None]:
# LightGBM (balanced run): tune and fit on the *_bal training split, then score its test split
model_LGB_bal = LGB_pred(X_train_bal, y_train_bal)

# Column 1 of predict_proba is used as the positive-class score
y_pred_prob_bal = model_LGB_bal.predict_proba(X_test_bal)[:, 1]
y_pred_bal = model_LGB_bal.predict(X_test_bal)

# Keep the curve points and AUC for the cross-model comparison plots
(lgb_fpr_bal, lgb_tpr_bal, lgb_prec_bal, lgb_rec_bal, lgb_roc_bal) = results(
    y_test_bal, y_pred_bal, y_pred_prob_bal)

plot_performance(y_test_bal, y_pred_bal, y_pred_prob_bal)
plot_confusion(y_test_bal, y_pred_bal)

In [None]:
# One row per classifier (balanced run): ROC curve points (arrays) and the scalar AUC
dic = dict(
    classifiers=['Decision Tree', 'Random Forest', 'LightGBM'],
    fpr=[dt_fpr_bal, rf_fpr_bal, lgb_fpr_bal],
    tpr=[dt_tpr_bal, rf_tpr_bal, lgb_tpr_bal],
    auc=[dt_roc_bal, rf_roc_bal, lgb_roc_bal],
)
res_df_bal = pd.DataFrame(dic)
res_df_bal

In [None]:
# Index the comparison table by classifier name. The guard makes the cell safe
# to re-run: once 'classifiers' has become the index it is no longer a column,
# and an unconditional set_index would raise KeyError.
if 'classifiers' in res_df_bal.columns:
    res_df_bal = res_df_bal.set_index('classifiers')
res_df_bal

In [None]:
fig = plt.figure(figsize=(8,6))

# One ROC curve per classifier, labelled with the AUC stored in the table
for clf_name, clf_row in res_df_bal.iterrows():
    plt.plot(clf_row['fpr'],
             clf_row['tpr'],
             label="{}, AUC={:.3f}".format(clf_name, clf_row['auc']))

# Chance diagonal for reference
plt.plot([0,1], [0,1], color='orange', linestyle='--')

# Same 0.0-1.0 tick grid on both axes
tick_positions = np.arange(0.0, 1.1, step=0.1)
plt.xticks(tick_positions)
plt.yticks(tick_positions)
plt.xlabel("False Positive Rate", fontsize=15)
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

In [None]:
# Inspect the precision and recall arrays that results() returned for the
# balanced-run LightGBM model.
print(lgb_prec_bal,lgb_rec_bal)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
ax.plot( dt_rec_bal,dt_prec_bal, label='Decision Tree')
ax.plot( rf_rec_bal,rf_prec_bal, label='Random Forest')
ax.plot(lgb_rec_bal, lgb_prec_bal, label='LightGBM')

# No-skill baseline = share of positive labels in the test set.
# y_test_bal is a one-column DataFrame, so boolean-masking it keeps every row
# and the old len() ratio was always 1.0; flatten and average instead.
baseline = float((np.ravel(y_test_bal) == 1).mean())
ax.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision Recall Curve')
ax.legend(loc='center left');

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict with the balanced-run LightGBM model explicitly rather than reusing
# whichever `y_pred_bal` the most recently executed cell left behind, so the
# plot always matches its title.
cm = confusion_matrix(y_test_bal, model_LGB_bal.predict(X_test_bal))
plt.figure(figsize=[10,8])
plt.title('Confusion matrix of the LightGBM classifier')
# Cells are integer counts, so annotate them as integers ("d") instead of "12.0"
sns.heatmap(cm,annot=True,fmt="d")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.ioff()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Headline test-set metrics for whichever model produced the current `y_pred_bal`
for line_template, metric_fn in (('Precision: %.3f', precision_score),
                                 ('Recall: %.3f', recall_score),
                                 ('F1-Score: %.3f', f1_score),
                                 ('Accuracy: %.3f', accuracy_score)):
    print(line_template % metric_fn(y_test_bal, y_pred_bal))

## Feature Selection

https://machinelearningmastery.com/feature-selection-machine-learning-python/

In [None]:

# Feature Selection with Univariate Statistical Tests
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Per-feature importances of the LightGBM model fitted in the unbalanced run.
importance = model_LGB.feature_importances_

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Prefix for the chart title; it is concatenated directly with
        'FEATURE IMPORTANCE', so pass a trailing space if one is wanted.
    """
    # Pair names with their importances and rank them in descending order
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(8,6))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.show()

In [None]:
# Plot the LightGBM importances against the column names of X; the trailing
# space in 'LightGBM ' separates the prefix from the rest of the chart title.
plot_feature_importance(model_LGB.feature_importances_,X.columns,'LightGBM ')