# null values check


In [None]:
# getting null values
def nullvalues(data):
    """Report which columns of ``data`` contain null values.

    Prints the list of affected columns, then one line per affected column
    with its percentage of nulls and the Python type of its values.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A list of ``[column_name, null_count]`` pairs when at least one column
        contains nulls; ``None`` (after printing 'No Null values') otherwise.
    """
    ls = []
    for col in data.columns:
        null_count = data[col].isnull().sum()
        if null_count > 0:
            ls.append([col, null_count])

    if not ls:
        print('No Null values')
        return None

    print('Null values are present in ', ls)
    total_rows = data.shape[0]
    for col, null_count in ls:
        # Round the percentage itself so no floating-point noise is printed.
        nullpercent = round(100 * null_count / total_rows, 4)
        # Take the type from the first non-null value by position: a label
        # lookup breaks on non-default indexes, and a null cell would report
        # float. Fall back to the column dtype when every value is null.
        non_null = data[col].dropna()
        dt = type(non_null.iloc[0]) if len(non_null) else data[col].dtype
        print(f'Null percentage in {col} is {nullpercent} % and datatype {dt}')
    # Return only after every affected column has been reported.
    return ls

# Box Cox Transformation

In [None]:
from scipy import stats
# Names of the numerical columns to normalise; fill this list in before running.
cols = []
for col in cols:
    # Shift every value by one so zeros are removed before the transform;
    # if negative data is present, proceed accordingly (a different shift is needed).
    df_train[col] = df_train[col] + 1
    df_test[col] = df_test[col] + 1
    # Fit lambda on the training column only ...
    transformed_train, fit_lambda = stats.boxcox(df_train[col])
    df_train[col] = transformed_train
    # ... and transform the test column with that same fitted lambda.
    df_test[col] = stats.boxcox(df_test[col], fit_lambda)

# Analysis of regression models

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
def analyse(model, x, y, label):
    """Print RMSE / R^2 of a fitted regression model and plot its errors.

    Draws three side-by-side panels: the error distribution, actual vs
    predicted values, and errors against the actual values.

    Args:
        model: fitted estimator exposing ``predict``.
        x: feature matrix passed to ``model.predict``.
        y: true target values; a single-column DataFrame, a Series or a 1-D
            array (assumes a single target -- values are flattened to 1-D).
        label: name of the data split (e.g. 'train'), used in the figure title.

    Returns:
        None. Metrics are printed and the figure is drawn on the current
        matplotlib backend.
    """
    import numpy as np  # local import so the block does not rely on a file-level alias

    # Flatten both sides to 1-D so the subtraction and the plots work whether
    # predict() returns shape (n,) or (n, 1) and whether y is a DataFrame,
    # a Series or an array.
    y_true = np.ravel(y)
    y_pred = np.ravel(model.predict(x))
    residuals = y_pred - y_true  # predicted minus actual

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print('Model RMSE:', round(rmse, 4))

    r2 = r2_score(y_true, y_pred)
    print('Model r2_score:', round(r2, 4))

    # Axis label for the target: first column name for a DataFrame, the
    # Series name if any, otherwise a generic fallback.
    if hasattr(y, 'columns'):
        target_name = y.columns[0]
    else:
        target_name = getattr(y, 'name', None) or 'actual'

    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    plt.suptitle(f'Model analysis for {label} data', fontsize=30)

    sns.histplot(residuals, kde=True, bins=50, ax=ax[0])
    ax[0].set_title('Error Distribution')
    ax[0].set_xlabel('Error Values')

    # x / y must be passed by keyword: current seaborn releases reject
    # positional data vectors.
    sns.scatterplot(x=y_true, y=y_pred, ax=ax[1])
    ax[1].set_title('actual vs predictions')
    ax[1].set_xlabel('actual')
    ax[1].set_ylabel('predictions')

    sns.scatterplot(x=y_true, y=residuals, ax=ax[2])
    ax[2].set_title('Residual plot')
    ax[2].set_xlabel(target_name)
    ax[2].set_ylabel('residuals')

    plt.tight_layout()

# Analysis of binary Classification models

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Function For analysing model
def analyse(model, x_data, y_data, data_title):
    """Print binary-classification metrics and plot confusion matrix + ROC.

    Args:
        model: fitted binary classifier exposing ``predict`` and
            ``predict_proba``.
        x_data: feature matrix to evaluate on.
        y_data: true labels; assumes exactly two classes, with the negative
            class sorting before the positive one (e.g. 0 / 1).
        data_title: name of the data split (e.g. 'test'), used in the header
            line of the printed report.

    Returns:
        None. Metrics are printed and the figure is drawn on the current
        matplotlib backend.
    """
    def safe_ratio(numerator, denominator):
        # A metric with an empty denominator is reported as 0.0 instead of
        # producing nan plus a RuntimeWarning.
        return float(numerator) / float(denominator) if denominator else 0.0

    pred = model.predict(x_data)
    cm = confusion_matrix(y_data, pred)
    # confusion_matrix layout: rows are actual classes, columns are predicted
    # classes, both in sorted label order.
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    accuracy = accuracy_score(y_data, pred)
    precision = safe_ratio(tp, tp + fp)
    sensitivity = safe_ratio(tp, tp + fn)
    specificity = safe_ratio(tn, tn + fp)
    # Harmonic mean of precision and sensitivity, written on the raw counts.
    f1 = safe_ratio(2 * tp, 2 * tp + fp + fn)

    print(f'For {data_title} data')
    print('Correct Predictions:', tp + tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('F1:', f1)

    # Probability of the positive class (second column) drives the ROC curve.
    y_score = model.predict_proba(x_data)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_data, y_score)
    print('ROC AUC score', roc_auc_score(y_data, y_score))

    mat = pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'],
                       columns=['Predict Negative', 'Predict Positive'])

    fig, axs = plt.subplots(1, 2, figsize=(14, 5))
    axs[0].set_title('Confusion Matrix')
    sns.heatmap(mat, annot=True, fmt='d', ax=axs[0])

    axs[1].set_title('Roc Auc Curve')
    # Plot the raw points with matplotlib: a seaborn lineplot would average
    # and sort repeated x values, which distorts a ROC curve.
    axs[1].plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    axs[1].plot(fpr, tpr)
    axs[1].set_xlabel('False Positive Rate')
    axs[1].set_ylabel('True Positive Rate')
    plt.tight_layout()

# Feature importance scoring

In [None]:
# Feature importance scoring
import xgboost
import xgboost as xgb
from sklearn import metrics
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
def modelfit(alg, x, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on (x, y), print a training report and plot feature scores.

    For an ``XGBClassifier`` with ``useTrainCV`` enabled, the number of
    boosting rounds is first tuned with ``xgb.cv`` (AUC, early stopping) and
    written back to ``alg`` via ``set_params`` before fitting.

    Feature scores are plotted for LogisticRegression / LinearRegression
    (coefficients) and for DecisionTreeClassifier / RandomForestClassifier /
    XGBClassifier (``feature_importances_``); other estimators are fitted and
    reported without a plot.

    Args:
        alg: unfitted estimator; it is fitted in place.
        x: pandas DataFrame of features (``.columns`` and ``.values`` are used).
        y: pandas Series / DataFrame with the target (``.values`` is used).
            The classification report assumes a binary target.
        useTrainCV: whether to tune ``n_estimators`` for XGBClassifier.
        cv_folds: number of folds for ``xgb.cv``.
        early_stopping_rounds: early-stopping patience for ``xgb.cv``.

    Returns:
        None.
    """
    from sklearn.base import is_regressor  # local import; sklearn is already a file dependency

    if isinstance(alg, xgb.XGBClassifier) and useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x.values, label=y.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult)
        # One row per boosting round kept after early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(x, y)

    # Predict training set
    dtrain_predictions = alg.predict(x)

    # Print model report
    print("\nModel Report")
    if is_regressor(alg):
        # Regressors have no predict_proba and accuracy / AUC do not apply,
        # so report R^2 instead of failing before the plot is reached.
        print("R2 Score (Train): %f" % metrics.r2_score(y, dtrain_predictions))
    else:
        dtrain_predprob = alg.predict_proba(x)[:, 1]
        print("Accuracy : %.4g" % metrics.accuracy_score(y.values, dtrain_predictions))
        print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

    if isinstance(alg, (LogisticRegression, LinearRegression)):
        # Coefficients serve as the importance score. coef_ is 2-D for
        # LogisticRegression (row 0 for a binary target) and may be 2-D for
        # LinearRegression fitted on a 2-D single-column y.
        if isinstance(alg, LogisticRegression):
            importance = alg.coef_[0]
        else:
            importance = alg.coef_.ravel()
        feat_imp = pd.DataFrame({'columns': x.columns, 'importance': importance})
        # sort_values returns a new frame; keep it so the bars are ordered.
        feat_imp = feat_imp.sort_values(by='importance', ascending=False)
        _plot_feature_importance(feat_imp, 'columns', 'importance', (20, 10))
    elif isinstance(alg, (DecisionTreeClassifier, RandomForestClassifier,
                          xgb.XGBClassifier)):
        # Column positions ordered from most to least important.
        order = alg.feature_importances_.argsort()[::-1]
        feat_imp = pd.DataFrame({'cols': x.columns[order],
                                 'imps': alg.feature_importances_[order]})
        _plot_feature_importance(feat_imp, 'cols', 'imps', (50, 20))


def _plot_feature_importance(feat_imp, name_col, score_col, figsize):
    """Draw one bar per row of ``feat_imp`` with rotated feature labels."""
    plt.figure(figsize=figsize)
    sns.barplot(x=name_col, y=score_col, data=feat_imp)
    plt.xticks(rotation=90)
    

# hyper parameter tuning

In [1]:
# Function For hyper parameter tuning of the model
from sklearn.model_selection import GridSearchCV   #Performing grid search
def bestvalues(model, x, y, parameters, cvrounds=5, scoring='roc_auc'):
    """Run an exhaustive grid search and return the best estimator found.

    Args:
        model: estimator to tune.
        x: training features.
        y: training target.
        parameters: parameter grid handed to ``GridSearchCV`` as ``param_grid``.
        cvrounds: value passed as the ``cv`` argument of the search.
        scoring: scoring metric used to rank parameter combinations.

    Returns:
        ``best_estimator_`` of the fitted search. The best score, the best
        parameters and the chosen estimator are printed along the way.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        scoring=scoring,
        cv=cvrounds,
        verbose=0,
    )
    search.fit(x, y)

    top_score = search.best_score_
    top_params = search.best_params_
    top_model = search.best_estimator_

    # Best score achieved during the search
    print('GridSearch CV best score : {:.4f}\n\n'.format(top_score))
    # Parameter combination that produced it
    print('Parameters that give the best results :', '\n\n', top_params)
    # Estimator configured with those parameters
    print('\n\nEstimator that was chosen by the search :', '\n\n', top_model)
    return top_model

# VIF analysis

In [None]:
# VIF analysis
from statsmodels.stats.outliers_influence  import  variance_inflation_factor 
def get_VIF(X_train):
    """Compute the variance inflation factor of every column of ``X_train``.

    Args:
        X_train: pandas DataFrame of feature variables.

    Returns:
        DataFrame with columns 'Features' and 'VIF', one row per feature,
        sorted by VIF in descending order.
    """
    design = X_train.values
    scores = [
        variance_inflation_factor(design, position)
        for position in range(X_train.shape[1])
    ]
    table = pd.DataFrame({'Features': X_train.columns, 'VIF': scores})
    return table.sort_values(by="VIF", ascending=False)

# Template call: get_VIF takes the training feature DataFrame as its single
# argument. As written (no argument) this line raises a TypeError until the
# training data is filled in.
vif_ = pd.DataFrame(get_VIF( )) # put training data here
# Bare expression: presumably displays the VIF table as the notebook cell output.
vif_