In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import all the tools we need


# Regular EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


# We want our plots to appear inline within the notebook
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style("darkgrid")


# Models from scikit-learn & XGboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2


# Model Evaluation libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, accuracy_score
from sklearn.metrics import plot_roc_curve
# For Hyperparameter tunning of Xgboost
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
df = pd.read_csv("../input/vehicle-claim-fraud-detection/fraud_oracle.csv")
df.head()

# EDA (Exploratory Data Analysis)

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df["FraudFound_P"].value_counts()

In [None]:
# Bar chart of how many rows fall into each FraudFound_P class
plt.figure(figsize=(10, 6))
df["FraudFound_P"].value_counts().plot.bar(color=["salmon", "lightblue"])

In [None]:
df["Sex"].value_counts()

In [None]:
# Bar chart of the number of rows per Fault value, with horizontal tick labels
plt.figure(figsize=(10, 6))
df["Fault"].value_counts().plot.bar()
plt.xticks(rotation=0);

In [None]:
df.AccidentArea.value_counts()

In [None]:
# Bar chart of the number of rows per AccidentArea value
plt.figure(figsize=(12, 6))
df["AccidentArea"].value_counts().plot.bar(color=["salmon", "lightblue"])
plt.xticks(rotation=0)

In [None]:
# Bar chart of the number of rows per VehicleCategory value
plt.figure(figsize=(10, 6))
df["VehicleCategory"].value_counts().plot.bar(color=["green", "pink", "navy"])

In [None]:
df.AgeOfVehicle.value_counts()

In [None]:
# Bar chart of the number of rows per AgeOfVehicle value, with horizontal tick labels
plt.figure(figsize=(12, 6))
df["AgeOfVehicle"].value_counts().plot.bar()
plt.xticks(rotation=0);

In [None]:
df.WitnessPresent.value_counts()

In [None]:
# Bar chart of the number of rows per WitnessPresent value
plt.figure(figsize=(12, 7))
df["WitnessPresent"].value_counts().plot.bar(color=["pink", "red"])

In [None]:
df.PoliceReportFiled.value_counts()

In [None]:
# Bar chart of the number of rows per PoliceReportFiled value
plt.figure(figsize=(12, 7))
df["PoliceReportFiled"].value_counts().plot.bar(color=["lightblue", "darkred"])

In [None]:
df.DriverRating.value_counts()

In [None]:
df.VehiclePrice.value_counts()

#### We need to find the correlation (`corr()`) between features

In [None]:
# Correlation matrix of the numeric columns only: `df` still contains string
# columns at this point, and `DataFrame.corr()` raises on them in pandas >= 2.0.
df.select_dtypes(include="number").corr()

In [None]:
# Let's visualize the correlation between the numeric independent variables
# and the dependent (target) variable.
# Only numeric columns are used: `df` still contains string columns here, and
# `DataFrame.corr()` raises on them in pandas >= 2.0.
plt.figure(figsize=(12,7))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="plasma_r")

In [None]:
# Let's check it with crosstab 
#(crosstab() function takes two or more lists, pandas series or dataframe columns and returns a frequency of each combination by default)

pd.crosstab(df.FraudFound_P, df.Sex)

In [None]:
# Grouped bar chart of the FraudFound_P x Sex crosstab:
# one group of bars per FraudFound_P value (x axis), one bar per Sex value.
ax = pd.crosstab(df.FraudFound_P, df.Sex).plot(kind="bar",
                                               color=["salmon","lightblue"],
                                               figsize=(12,6))

ax.set_title("Fraud Frequency For Gender")
# The x axis carries the crosstab index (FraudFound_P), not Sex, so label it as such
ax.set_xlabel("FraudFound_P")
ax.set_ylabel("Amount")
# Take the legend entries from the crosstab columns instead of hard-coding them,
# so the labels can never get out of sync with the bar order
ax.legend(title="Sex")
plt.xticks(rotation = 0);

In [None]:
# Grouped bar chart of the FraudFound_P x Fault crosstab
pd.crosstab(df.FraudFound_P, df.Fault).plot.bar(
    color=["salmon", "lightblue"],
    figsize=(12, 7),
)
plt.xticks(rotation=0);

In [None]:
# Sum of FraudFound_P for every distinct Age, drawn as a bar plot
gpd_by_val = df.groupby('Age', as_index=False)['FraudFound_P'].sum()

fig, ax1 = plt.subplots(figsize=(22, 6))
grph = sns.barplot(data=gpd_by_val, x='Age', y='FraudFound_P', ax=ax1)

# Re-apply the tick labels un-rotated and right-aligned
grph.set_xticklabels(
    grph.get_xticklabels(),
    rotation=0,
    horizontalalignment='right',
);

In [None]:
# Sum of FraudFound_P per AgentType, Year and Days_Policy_Accident, as bar plots
gpd_val1 = df.groupby('AgentType', as_index=False)['FraudFound_P'].sum()
gpd_val2 = df.groupby('Year', as_index=False)['FraudFound_P'].sum()
gpd_val3 = df.groupby('Days_Policy_Accident', as_index=False)['FraudFound_P'].sum()

# One row with three panels, one per grouping column
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(22, 6))
sns.barplot(data=gpd_val1, x='AgentType', y='FraudFound_P', ax=ax1)
sns.barplot(data=gpd_val2, x='Year', y='FraudFound_P', ax=ax2)
sns.barplot(data=gpd_val3, x='Days_Policy_Accident', y='FraudFound_P', ax=ax3)

In [None]:
# Sum of FraudFound_P per PoliceReportFiled and per Days_Policy_Claim, as bar plots
gpd_val4 = df.groupby('PoliceReportFiled', as_index=False)['FraudFound_P'].sum()
gpd_val5 = df.groupby('Days_Policy_Claim', as_index=False)['FraudFound_P'].sum()

# One row with two panels, one per grouping column
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
sns.barplot(data=gpd_val4, x='PoliceReportFiled', y='FraudFound_P', ax=ax1)
sns.barplot(data=gpd_val5, x='Days_Policy_Claim', y='FraudFound_P', ax=ax2)

In [None]:
# Sum of FraudFound_P per WeekOfMonthClaimed and per DayOfWeekClaimed, as bar plots
gpd_val8 = df.groupby('WeekOfMonthClaimed', as_index=False)['FraudFound_P'].sum()
gpd_val9 = df.groupby('DayOfWeekClaimed', as_index=False)['FraudFound_P'].sum()

# One row with two panels, one per grouping column
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(22, 6))
sns.barplot(data=gpd_val8, x='WeekOfMonthClaimed', y='FraudFound_P', ax=ax1)
sns.barplot(data=gpd_val9, x='DayOfWeekClaimed', y='FraudFound_P', ax=ax2)
# plt.xticks acts on the current axes only
plt.xticks(rotation=45);

In [None]:
# Sum of FraudFound_P per DriverRating, NumberOfCars and WitnessPresent, as bar plots
gpd_val_10 = df.groupby("DriverRating", as_index=False)["FraudFound_P"].sum()
gpd_val_11 = df.groupby("NumberOfCars", as_index=False)["FraudFound_P"].sum()
gpd_val_12 = df.groupby("WitnessPresent", as_index=False)["FraudFound_P"].sum()

# One row with three panels, one per grouping column
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(24, 7))
sns.barplot(data=gpd_val_10, x="DriverRating", y="FraudFound_P", ax=ax1)
sns.barplot(data=gpd_val_11, x="NumberOfCars", y="FraudFound_P", ax=ax2)
sns.barplot(data=gpd_val_12, x='WitnessPresent', y='FraudFound_P', ax=ax3)


In [None]:
# Sum of FraudFound_P per AddressChange_Claim and per PastNumberOfClaims, as bar plots
# (note: `gpd_val_12` is re-bound here; an earlier cell used the same name for WitnessPresent)
gpd_val_12 = df.groupby("AddressChange_Claim", as_index=False)["FraudFound_P"].sum()
gpd_val_13 = df.groupby("PastNumberOfClaims", as_index=False)["FraudFound_P"].sum()

# One row with two panels, one per grouping column
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
sns.barplot(data=gpd_val_12, x="AddressChange_Claim", y="FraudFound_P", ax=ax1)
sns.barplot(data=gpd_val_13, x="PastNumberOfClaims", y="FraudFound_P", ax=ax2)


In [None]:
df.info()

# Build Model

* First, we need to replace the 0 values in the dataset

In [None]:
# Print which of the distinct Age values equal 0, then count the rows whose Age is 0
print(df['Age'].unique() == 0)
len(df.loc[df['Age'] == 0])

In [None]:
# Work on a copy so that the original dataframe stays untouched during preprocessing
df_temp = df.copy()

# Print the name of every column that pandas reports as string-typed
for column_name in df_temp.columns:
    if pd.api.types.is_string_dtype(df_temp[column_name]):
        print(column_name)

In [None]:
# Replace the 0 placeholders in Age with the median of the column.
# The median is taken over the non-zero ages only, so the placeholder rows
# being replaced do not pull the imputed value downwards.
age_median = df.loc[df["Age"] != 0, "Age"].median()
df_temp["Age"] = df_temp["Age"].replace(0, age_median)

In [None]:
# Same check on the preprocessed copy: flag distinct Age values equal to 0 and count such rows
print(df_temp['Age'].unique() == 0)
len(df_temp.loc[df_temp['Age'] == 0])

* Now We Need To Turn All Strings Into Numerical Values For Machine Learning Model

In [None]:
# Convert every string-typed column into an ordered pandas categorical
string_columns = [name for name, column in df_temp.items()
                  if pd.api.types.is_string_dtype(column)]
for name in string_columns:
    df_temp[name] = df_temp[name].astype("category").cat.as_ordered()

In [None]:
# Let's Check how many columns changed into Category
df_temp.info()

In [None]:
# Replace every non-numeric column by the integer codes of its categories.
# (No "<column>_is_missing" indicator column is added here.)
non_numeric_columns = [name for name, column in df_temp.items()
                       if not pd.api.types.is_numeric_dtype(column)]
for name in non_numeric_columns:
    df_temp[name] = pd.Categorical(df_temp[name]).codes

In [None]:
df_temp.head()

### Now our dataset is ready for the algorithms

In [None]:
df_temp.describe()

### Splitting Data

In [None]:
# Separate the target column (y) from the feature columns (X)
y = df_temp["FraudFound_P"]
X = df_temp.drop(columns="FraudFound_P")

In [None]:
X

In [None]:
y

In [None]:
# Setup random seed for reproduction
np.random.seed(42)

# Split data into train and test dataset (80% / 20%).
# `random_state` is passed explicitly so the split no longer depends on the
# state of NumPy's global generator at the moment this line runs; seeding with
# the same value as above should yield the same split as before.
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.2,
                                                    random_state=42)


In [None]:
# Let's check our train and test split:
# print how often each target value occurs in the two splits,
# then display the shapes of the four resulting objects.
from collections import Counter
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## As we can see, our target feature is heavily imbalanced, so we will use a scikit-learn function to balance the class weights

In [None]:
# Compute "balanced" class weights from the training labels and
# store them as a {class label: weight} dictionary
from sklearn.utils import class_weight

train_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=train_classes,
                                                     y=y_train)
class_weights = dict(zip(train_classes, balanced_weights))
class_weights

In [None]:
# Let's put our models into a dictionary keyed by display name.
# Logistic Regression and Random Forest receive the `class_weights` computed above;
# KNN and XGBoost are created with their default settings (no class weights).
models = {"Logistic Regression": LogisticRegression(class_weight=class_weights,solver = 'liblinear'),
          "KNN": KNeighborsClassifier(),
          "Random Forest Classifier": RandomForestClassifier(class_weight=class_weights),
          "XGboost": XGBClassifier()}

# Let's create a function to fit and later score our models
def fit_score(models, X_train, X_test, y_train, y_test):
    """
    Train each model on the training split and score it on the test split.

    models  : dict mapping a display name to an estimator exposing
              `fit(X, y)` and `score(X, y)`
    X_train : training features
    X_test  : test features
    y_train : training labels
    y_test  : test labels

    Returns a dict mapping each model name to the value of
    `model.score(X_test, y_test)`, in the iteration order of `models`.
    """
    # Re-seed NumPy's global generator so repeated calls behave the same way
    np.random.seed(42)

    scores = {}
    for label in models:
        estimator = models[label]
        # The estimator is fitted in place, then evaluated on the held-out split
        estimator.fit(X_train, y_train)
        scores[label] = estimator.score(X_test, y_test)
    return scores

In [None]:
model_score = fit_score(models=models,
                        X_train=X_train,
                        X_test=X_test,
                        y_train=y_train,
                        y_test=y_test)
model_score

In [None]:
# Save into DataFrame
model_compare = pd.DataFrame(model_score,index=["Accuracy"])
model_compare

In [None]:
# Let's Plot The Models and Compare
model_compare.T.plot(kind="bar")

### So far three algorithms performed pretty well 

* The first model is not the best model.
Let's tune all the algorithms and see if we can get more accuracy.

In [None]:
# Tune KNN by sweeping n_neighbors and recording train / test accuracy for each value
train_score = []
test_score = []

# Candidate values for n_neighbors: 1 .. 20
neighbors = range(1, 21)

# A single estimator is re-configured and re-fitted for every candidate
knn = KNeighborsClassifier()

for k in neighbors:
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Score on the data it was fitted on, then on the held-out split
    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Plot train and test accuracy against the number of neighbors
fig, ax = plt.subplots()
ax.plot(neighbors, train_score, label="Train score")
ax.plot(neighbors, test_score, label="Test score")
ax.set_xlabel("Neighbors")
ax.set_ylabel("Model Accuracy")
ax.legend()

# Best test accuracy over the sweep, shown as a percentage
print(f"Maximum KNN score on Test data: {max(test_score)*100 :.2f}%")

> So far KNN has not improved, so let's move on to tuning another algorithm

> This time we will tune hyperparameter with Scikit-learn libraries `RandomizedSearchCV` & `GridSearchCV`

## Hyperparameter tuning with RandomizedSearchCV

In [None]:
# Create hyperparameter grid for LogisticRegression:
# 20 values of C spaced logarithmically between 1e-4 and 1e4, with the liblinear solver

log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Create hyperparameter grid for RandomForestClassifier:
# n_estimators 10, 60, ..., 960; four max_depth options (None = no explicit limit given);
# min_samples_split 2, 4, ..., 18; min_samples_leaf 1, 3, ..., 19
rf_random_grid = {"n_estimators": np.arange(10,1000,50),
                  "max_depth": [None, 3, 5, 10],
                  "min_samples_split": np.arange(2, 20, 2),
                  "min_samples_leaf": np.arange(1, 20, 2)} 

In [None]:
# Let's tune LogisticRegression
# Seed NumPy's global generator; no random_state is passed to the search below
np.random.seed(42)

# Setup random hyperparameter search for LogisticRegression:
# class-weighted estimator, candidates drawn from `log_reg_grid`,
# 5-fold cross-validation, 20 sampled parameter settings
rs_log_reg = RandomizedSearchCV(LogisticRegression(class_weight=class_weights),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search on the training split
rs_log_reg.fit(X_train, y_train)

In [None]:
# Let's Check The Best Parmeters
rs_log_reg.best_params_

In [None]:
# Let's Score the Tunned model
rs_log_reg.score(X_test, y_test)

* So far Logistic Regression has not improved that much 

In [None]:
# Setup random seed (NumPy's global generator; no random_state is passed below)
np.random.seed(42)

# Setup random hyperparameter search for RandomForestClassifier:
# class-weighted estimator, candidates drawn from `rf_random_grid`,
# 5-fold cross-validation, 10 sampled parameter settings
rf_rs = RandomizedSearchCV(RandomForestClassifier(class_weight=class_weights),
                           param_distributions=rf_random_grid,
                           cv=5,
                           verbose=True,
                           n_iter=10)

# Fitting random hyperparameter search on the training split
rf_rs.fit(X_train, y_train)

In [None]:
# Let's Check The Best Parmeters
rf_rs.best_params_

In [None]:
# Let's Score the Tunned model
rf_rs.score(X_test, y_test)

In [None]:
# Let's compare it default Models score
model_score

## Let's use GridSearchCV for hyperparameter tuning

In [None]:
# Setup random seed
np.random.seed(42)

# LogisticRegression hyperparameters to search exhaustively:
# 30 values of C spaced logarithmically between 1e-4 and 1e4
log_reg_grid = {"C": np.logspace(-4,4,30),
              "solver": ["liblinear"]}

# Setup Grid Search for LogisticRegression (20-fold CV, all CPU cores)
gs_log_reg = GridSearchCV(LogisticRegression(class_weight=class_weights),
                          param_grid=log_reg_grid,
                          cv = 20,
                          n_jobs=-1,
                          verbose=True)

# Fit the GridSearch instance on the TRAINING split only.
# Fitting on (X_test, y_test) would leak the test data into the model, and the
# score computed on X_test afterwards would no longer measure generalisation.
gs_log_reg.fit(X_train, y_train)

In [None]:
# Let's Check The Best Parameters
gs_log_reg.best_params_

In [None]:
# Let's score the grid search model of logistic Regression
gs_log_reg.score(X_test,y_test)

> Now we can say that Logistic Regression is not the best model for our dataset

## Let's work with XGboost Classifier and see if we can get overall best score

In [None]:
# Search space for hyperopt: a dictionary of XGBClassifier keyword arguments.
# Most entries are fixed constants; the entries built from `hp.*` are the ones being searched.
# NOTE(review): the integer offsets (`+3`, `+1`, `+50`) are applied outside the labelled
# `hp.randint` nodes, so the label itself refers to the un-shifted draw. Presumably `fmin`
# reports that un-shifted value — verify before reusing its result as model parameters.
space={
    'objective': 'binary:logistic', 
    'use_label_encoder': False, 
    'base_score': 0.5,
    'booster': 'gbtree',
    'colsample_bylevel' : 1,
    'colsample_bynode' : 1,
    'colsample_bytree' : 1,
    'enable_categorical': False,
    # searched: continuous value between 0 and 10
    'gamma': hp.uniform('gamma', 0,10),
    'gpu_id': -1,
    'importance_type': None,
    'interaction_constraints': '', 
    'learning_rate': 0.300000012, 
    'max_delta_step': 0,
    # searched: randint draw over 10 values, shifted by +3
    'max_depth': hp.randint("max_depth", 10)+3,
    # searched: randint draw over 4 values, shifted by +1
    'min_child_weight' : hp.randint('min_child_weight', 4)+1,
    'monotone_constraints': '()',
    # searched: randint draw over 150 values, shifted by +50
    'n_estimators': hp.randint('n_estimators', 150)+50,
    'n_jobs': -1,
    'num_parallel_tree':1, 
    'predictor':'auto', 
    'random_state': 0,
    # searched: randint draw over 10 values, no shift
    'reg_alpha' : hp.randint('reg_alpha', 10),
    # searched: randint draw over 10 values, no shift
    'reg_lambda' : hp.randint('reg_lambda', 10),
    'scale_pos_weight': 1,
    'subsample': 1,
    'tree_method': 'exact',
    'validate_parameters':1,
    'verbosity': None,
    'eval_metric': 'aucpr'
    }

In [None]:
# Let's Define a function for our Space Dictionary and train our model
def objective(space):
    """
    Hyperopt objective: train an XGBClassifier with the sampled parameters
    and report how well it does on held-out validation data.

    space : dict of XGBClassifier keyword arguments sampled by hyperopt.

    Returns a dict with
      'loss'   : negated validation accuracy (hyperopt minimises the loss)
      'status' : STATUS_OK
    """
    # Hold out part of the TRAINING data for early stopping and for scoring the
    # candidate. The test split must not be used here: choosing hyperparameters
    # on it would make the final test-set evaluation optimistically biased.
    # The fixed random_state gives every candidate the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=42)

    clf_model = XGBClassifier(**space)

    evaluation = [(X_fit, y_fit), (X_val, y_val)]

    clf_model.fit(X_fit, y_fit,
                  eval_set=evaluation, eval_metric="auc",
                  early_stopping_rounds=10, verbose=False)

    # predict() already returns class labels, so no thresholding is needed
    pred = clf_model.predict(X_val)
    accuracy = accuracy_score(y_val, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Run the hyperopt search: minimise `objective` over `space` with the TPE algorithm
# for 100 evaluations, recording every evaluation in `trials`.
# No random state is passed to fmin here.
trials = Trials()
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [None]:
# Let's Check the best hyperparameters
best_hyperparams

In [None]:
# Let's create the optimized model with the best hyperparameters of the XGBoost classifier.
# `fmin` returns the raw value of each labelled hyperopt node, i.e. the `hp.randint`
# draw BEFORE the integer offset that `space` adds to it. The same offsets
# (+3, +1, +50) are therefore re-applied here so that the model is built with the
# parameters that were actually evaluated during the search.
clf_model_optimizied = XGBClassifier(
    objective= 'binary:logistic', 
    use_label_encoder= False, 
    base_score= 0.5, 
    booster= 'gbtree', 
    colsample_bylevel= 1, 
    colsample_bynode= 1, 
    colsample_bytree= 1, 
    enable_categorical= False, 
    gamma= best_hyperparams['gamma'], 
    gpu_id= -1, 
    importance_type= None, 
    interaction_constraints= '', 
    learning_rate= 0.300000012, 
    max_delta_step= 0, 
    max_depth= int(best_hyperparams['max_depth']) + 3,                # space: hp.randint(..., 10)+3
    min_child_weight= int(best_hyperparams['min_child_weight']) + 1,  # space: hp.randint(..., 4)+1
    monotone_constraints= '()',
    n_estimators= int(best_hyperparams['n_estimators']) + 50,         # space: hp.randint(..., 150)+50
    n_jobs= 4, 
    num_parallel_tree= 1, 
    predictor= 'auto', 
    random_state= 0, 
    reg_alpha= int(best_hyperparams['reg_alpha']),                    # no offset in space
    reg_lambda= int(best_hyperparams['reg_lambda']),                  # no offset in space
    scale_pos_weight= 1, 
    subsample= 1, 
    tree_method= 'exact', 
    validate_parameters= 1, 
    verbosity= None, 
    eval_metric= 'aucpr'
)
print(clf_model_optimizied.get_params())

In [None]:
# Let's Fit our optimized model
xgb_model = clf_model_optimizied.fit(X_train, y_train)

In [None]:
# Let's Predict on our Optimized model
y_preds = xgb_model.predict(X_test)
y_preds

## Evaluating our tuned machine learning classifier, beyond accuracy

* ROC curve and AUC Score
* Confusion matrix
* Classification report 
* Precision 
* Recall
* F1-Score

and it would be great if Cross-validation used if possible

In [None]:
# Let's plot ROC Curve and calculate the AUC metric
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2.
# On newer versions the equivalent call is
# `RocCurveDisplay.from_estimator(xgb_model, X_test, y_test)` — confirm the installed version.
plot_roc_curve(xgb_model, X_test,y_test);

In [None]:
print(confusion_matrix(y_test,y_preds))

In [None]:
print(classification_report(y_test,y_preds));

In [None]:
# Let's visualize the confusion matrix

def conf_plot(y_test, y_preds):
    """
    Draw the confusion matrix of `y_test` against `y_preds` as an annotated
    seaborn heatmap on a new 10x6 figure.

    y_test  : true labels
    y_preds : predicted labels
    """
    _, axes = plt.subplots(figsize=(10, 6))
    matrix = confusion_matrix(y_test, y_preds)
    # Draw on the axes that was just created; the cell counts are annotated in general format
    sns.heatmap(matrix, annot=True, cbar=True, fmt="g", ax=axes)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")

conf_plot(y_test, y_preds)  

## Let's calculate the evaluation metrics using cross-validation

#### We're going to calculate the accuracy, precision, recall, and F1-score of our model using cross-validation, and to do so we'll be using `cross_val_score()`

In [None]:
# Checking the best_parameters
xgb_model.get_params()

In [None]:
# Creating a new classifier with the best parameters found by the hyperopt search.
# Every tuned value is read from `best_hyperparams` instead of being hard-coded, so the
# classifier always matches the search that was actually run in this session.
# `fmin` reports the raw `hp.randint` draws, so the offsets used in `space`
# (+3, +1, +50) are re-applied here.

final_clf_xgb = XGBClassifier(objective= 'binary:logistic', 
                              use_label_encoder= False, 
                              base_score= 0.5, 
                              booster= 'gbtree', 
                              colsample_bylevel= 1, 
                              colsample_bynode= 1, 
                              colsample_bytree= 1, 
                              enable_categorical= False, 
                              gamma= best_hyperparams['gamma'], 
                              gpu_id= -1, 
                              importance_type= None, 
                              interaction_constraints= '', 
                              learning_rate= 0.300000012, 
                              max_delta_step= 0, 
                              max_depth= int(best_hyperparams['max_depth']) + 3, 
                              min_child_weight= int(best_hyperparams['min_child_weight']) + 1, 
                              monotone_constraints= '()',
                              n_estimators= int(best_hyperparams['n_estimators']) + 50, 
                              n_jobs= 4, 
                              num_parallel_tree= 1, 
                              predictor= 'auto', 
                              random_state= 0, 
                              reg_alpha= int(best_hyperparams['reg_alpha']), 
                              reg_lambda= int(best_hyperparams['reg_lambda']), 
                              scale_pos_weight= 1, 
                              subsample= 1, 
                              tree_method= 'exact', 
                              validate_parameters= 1, 
                              verbosity= None, 
                              eval_metric= 'aucpr')

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Number of cross-validation folds shared by every metric below
N_FOLDS = 100

# (label used in the printout, scikit-learn scoring name)
cv_metrics = [("Accuracy", "accuracy"),
              ("Precision", "precision"),
              ("Recall", "recall"),
              ("f1-score", "f1")]

# Mean cross-validated score per metric, computed on the full X / y
cv_means = {}
for label, scoring in cv_metrics:
    fold_scores = cross_val_score(final_clf_xgb,
                                  X,
                                  y,
                                  cv=N_FOLDS,
                                  scoring=scoring)
    cv_means[scoring] = np.mean(fold_scores)
    # The scores are fractions in [0, 1]; scale by 100 so the "%" sign is truthful
    print(f"The {label} for our XGboost Classifier is: {cv_means[scoring] * 100:.2f}%")

# Keep the individual names available for later cells
cv_acc = cv_means["accuracy"]
cv_precision = cv_means["precision"]
cv_recall = cv_means["recall"]
cv_f1 = cv_means["f1"]

In [None]:
print(classification_report(y_test,y_preds));

### The reason why we have lower Accuracy, Precision, Recall, and F1-score is that we used cv=100; we may gain more if we increase the number of cross-validation folds (cv).

In [None]:
# Let's Fit the final optimized model
final_clf_xgb.fit(X_train, y_train)

In [None]:
# Check coeffeficient eg.(how the independent variables (X_train) contributes to predict target variable (y))
final_clf_xgb.feature_importances_

In [None]:
# Here we will plot the F-Score with Features using XGboost built-in function
plot_importance(final_clf_xgb, max_num_features=20);

In [None]:
# Helper function for plotting feature importance of our XGboost Classifier
# We would only plot first top 20 features
def plot_features(columns, importances, n=20):
    """
    Plot the `n` most important features as a horizontal bar chart.

    columns     : feature names
    importances : importance value of each feature, in the same order as `columns`
    n           : number of top features to draw (default 20)

    Returns nothing; the chart is drawn on a new 12x7 figure.
    """
    df_feat = (pd.DataFrame({"features": columns,
                             "features_importances": importances})
               .sort_values("features_importances", ascending=False)
               .reset_index(drop=True))

    # Take the same top-n rows for labels and bar widths. Slicing the widths with a
    # hard-coded 20 made the two lengths disagree whenever n != 20.
    top_feat = df_feat[:n]

    # Plot the dataframe we created
    fig, ax = plt.subplots(figsize=(12,7))
    ax.barh(top_feat["features"], top_feat["features_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Features Importance")
    # Largest importance at the top of the chart
    ax.invert_yaxis()

In [None]:
plot_features(X_train.columns, final_clf_xgb.feature_importances_)

* We tried several algorithms, and so far XGBoost has performed pretty well on this dataset, even though the data is imbalanced (in the target feature only).
