# Library imports

I highly recommend you get <a href="https://github.com/VolkiTheDreamer/PythonRocks/tree/master/mypyextj">my custom package</a> (it is not pip-installable yet): just download the folder via https://minhaskamal.github.io/DownGit/#/home to your local machine and read the README file.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mypyext import dataanalysis as da
from dataprep.eda import plot, plot_correlation, plot_missing, create_report
import sweetviz as sv

In [None]:
#preprocessors
from mypyext import ml
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict,StratifiedKFold,RepeatedKFold,RepeatedStratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer,KNNImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from category_encoders import OrdinalEncoder as COE
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold,SelectKBest, chi2, f_classif, mutual_info_classif,RFE,RFECV


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier, AdaBoostClassifier,BaggingClassifier,VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import auc,roc_auc_score,precision_recall_curve,roc_curve,brier_score_loss

# Reading data

In [None]:
# Load the raw dataset. Replace "url" with the real path or URL of the CSV file.
# (Fixed typo: pandas has no `reaqd_csv`; the reader is `read_csv`.)
df=pd.read_csv("url")
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

# EDA

First of all, let's explore our dataset a little, let's see what's up and what's correlated with each other, etc.

## General

In [None]:
# NOTE(review): super_info_ is not a pandas method; presumably it is attached to
# DataFrame by the mypyext package imported above — confirm it is loaded first.
df.super_info_() #my extension method, information about which can be found at https://mvolkanyurtseven.medium.com/top-n-useful-python-tips-tricks-e3a163e56749

You can check <a href="https://mvolkanyurtseven.medium.com/top-n-useful-python-tips-tricks-e3a163e56749">this link</a> in order to learn how to use extension methods and what they are.

**Comments**

- check for datatypes
- comment on unnecessary columns
- insert line in the pipeline for scaling if normal distribution assumption is sustained
- insert line in the pipeline for discretization if necessary

In [None]:
# Drop columns that carry no usable signal: 1-cardinality (constant) columns,
# full-cardinality columns (e.g. row identifiers), and any others you decide to remove.
# Fill in the list below; with an empty list nothing is dropped.
df.drop([],axis=1,inplace=True)

Let's see the unique values of features with low <a href="https://en.wikipedia.org/wiki/Cardinality">cardinality</a>

In [None]:
# Show the unique values of the low-cardinality columns (mypyext helper;
# the cardinality threshold is whatever that helper defaults to).
da.getColumnsInLowCardinality(df)

**Comments**

- comment on ordinals
- comment on numerics that are actually to be taken as categoric

Let's determine our feature types.

In [None]:
# Feature-type bookkeeping used by the plots and the pipeline further down.
# Fill in the three empty lists by hand; the other two are derived from them.
# NOTE(review): removeItemsFromList_ is not a built-in list method — presumably
# patched onto list by mypyext; the meaning of the False flag is defined there.
target=[]  # target column name(s)
nums=[]  # numeric feature names
cats=list(df.columns).removeItemsFromList_(nums+target,False) #extension method
ords=[]  # ordinal categorical feature names
noms=cats.removeItemsFromList_(ords,False)  # categoricals that are not in ords

insert line in the pipeline for encoding(ordinal+onehot) operations

## Some Visuals

### Correlations

In [None]:
# Pairwise plots of the numeric features; the trailing ';' suppresses the
# text repr of the returned grid in the notebook output.
sns.pairplot(df[nums],height=1, aspect=1.2);

**Comments**

- diagonal histogram for skewness, multicollinearity?
- insert line in the pipeline for log transformation if necessary

Now let's look at the correlations closer. We will use the <a href="https://github.com/shakedzy/dython">dython</a> library for this. Because, with this library, both numeric-numeric, numeric-categorical and categorical-categorical correlations can be obtained with a single function.

In [None]:
from dython.nominal import associations
# Association matrix across all columns, with categorical and numeric columns
# declared explicitly. The result is indexed with "corr" in the next cell.
corrdict=associations(df,nominal_columns=cats,numerical_columns=nums,figsize=(10,10))

Let's see the top N features that correlate with target.

In [None]:
corr_results=corrdict["corr"] #dataframe
# The original `target?` / `N?` markers were "fill me in" notes that do not parse.
# They are replaced by real names so the cell is valid Python.
# NOTE(review): `target` is the list defined with the feature types above —
# confirm whether the helper expects the list or a single column name.
top_n=10  # TODO: choose how many of the strongest correlations to list
da.getHighestPairsOfCorrelation(corr_results,target,top_n)

We can use these correlation values in the feature selection stage. By the way, we said in the above charts that x,y etc. would be of high importance. Indeed, they are also highly correlated.

### Outliers

In [None]:
# One box plot per numeric feature, each drawn in its own subplot.
df[nums].plot(kind="box", subplots = True,figsize=(8,5))
plt.tight_layout();

In [None]:
# Violin plot of every numeric feature, side by side in a single row.
plt.figure(figsize=(6,4))
panel_count=len(nums)
for position,column in enumerate(nums,start=1):
    plt.subplot(1,panel_count,position)
    ch=sns.violinplot(y=column, data=df)
plt.tight_layout()
plt.show();

In [None]:
# Outlier report for the numeric features (mypyext helper).
# NOTE(review): imputestrategy is the string "None", not the None object —
# presumably "report only, do not impute"; confirm against mypyext.
da.outlierinfo(df,nums,imputestrategy="None",thresh=0.25)

In [None]:
# IQR-based outlier helper from mypyext with thresh=0.25; the exact meaning of
# `thresh` is defined by that helper — TODO confirm.
da.outliers_IQR(df,nums,imputestrategy="None",thresh=0.25)

In [None]:
# Same helper again with thresh=0.1, to compare against the 0.25 run.
da.outliers_IQR(df,nums,imputestrategy="None",thresh=0.1)

**comments**

- .....
- insert line in the pipeline for outlier-handling

### Checking for Nulls

In [None]:
import missingno as msno
# Per-column bar chart of data completeness.
msno.bar(df, figsize=(8,5));

In [None]:
# mypyext's own missing-value plot — presumably a second view of the same
# nulls; see the package for what exactly is drawn.
da.nullPlot(df)

Let's check for null-like values

In [30]:
# Look for values that act as nulls without being NaN; what counts as
# "null-like" is decided inside the mypyext helper.
da.findNullLikeValues(df)

There are no null-like values


**comments**

- .....
- insert line  in the pipeline for null-handling

## Target-based analysis

Let's first take a look at the number of instances for the different values of the target.

In [None]:
# Class balance of the target.
# NOTE(review): "targetcolumn" is a placeholder; the later cells use the
# placeholder "target" — replace both with the same real column name.
sns.countplot(x=df["targetcolumn"]);

insert line in the pipeline for oversampling/undersampling in case of imbalance

Let's now take a look at the average values of numeric features on the basis of target.

In [None]:
# Numeric features broken down by target value (mypyext helper).
# "target" is a placeholder column name; layout=(1,5) presumably leaves room
# for up to five numeric features — adjust to len(nums).
da.plotNumericsByTarget(df,"target",nums=nums,layout=(1,5),figsize=(12, 4))

**Comments**

- Outliers can mislead the interpretation of the results, if any.
- feature importance?

For the categoric features, lets check the probabilities of each value of the feature on the positive target.??????????

In [None]:
# Target broken down by each categorical feature (mypyext helper).
# subplot_tpl=(2,4) presumably is the subplot grid, i.e. room for eight
# categorical features — adjust to len(cats). "target" is a placeholder.
plt.figure(figsize=(14,8))
da.plotTargetByCats(df, cats, "target", subplot_tpl=(2,4));

**Comments**

- ....

Lastly, lets look at the distribution of the positive target on the categoric features.

In [None]:
# Distribution of the positive class over each categorical feature (mypyext helper).
# Same placeholder column name and (2,4) grid assumption as the previous plot.
plt.figure(figsize=(14,8))
da.plotPositiveTargetByCats(df, cats, "target", subplot_tpl=(2,4));

**Comments**

Most of the "positive class" are those with:

- ...
- ...

In [None]:
# da.plotTargetForNumCatsPairs(dfheart2,nums,cats,"AHD",2.4,0.9)
# da.plotCategoricForNumTargetPairs(dfheart2,nums,cats,"AHD",2.4,0.9)

## Checking for cleaning

In [None]:
# Numeric borders: compare each column's min/max with what is plausible for
# that feature to spot impossible or mis-entered values.
df.describe()

Check the unique values in low-cardinality features again to see if there are any abnormal values that are not supposed to be there.

## Others

In [None]:
#duplicate check for rows
# df.duplicated() returns one boolean per row, so its length always equals
# len(df) and the original difference was always 0. Summing the flags gives the
# number of rows that repeat an earlier row (first occurrences are not counted).
df.duplicated().sum()

In [None]:
#duplicate check for columns
# Number of column labels that repeat an earlier label. The original
# expression gave the same magnitude as a negative number.
# NOTE(review): this only detects repeated *names*; columns with identical
# content under different names would need e.g. df.T.duplicated().
df.columns.duplicated().sum()

In [None]:
#multicollinearity check->remove one feature of each pair if r>0.9


In [None]:
#data quality checks peculiar to this specific data


In [None]:
# any need for conversion, 100 USD-->100


In [None]:
#check for feature extraction


insert line in the pipeline for necessary processes

# Checking for assumptions

Whatever algorithm you will use, check its assumptions.

# Preparing X,y and train-test splits

In [None]:
# Features are every column but the last; the target is the last column.
# NOTE(review): assumes the target really is the last column of df — confirm,
# or select it by name.
X=df.iloc[:,:-1]
# Keep y one-dimensional, shape (n_samples,). Reshaping it to a column vector
# makes scikit-learn classifiers emit a DataConversionWarning on every fit
# (once per CV split during the search) before flattening it again.
y=df.iloc[:,-1].values

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
#if imbalanced add stratify=y
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)

In [None]:
# Shapes of the four split parts, in the order train/test X then train/test y.
[np.shape(part) for part in (X_train, X_test, y_train, y_test)]

In [None]:
# Container type of each split part (DataFrame vs ndarray).
[type(part) for part in (X_train, X_test, y_train, y_test)]

# Modelling

## Model Selection

If necessary, create your custom functions and classes.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

def mycustomFunc(X):
    """Template for a custom transformation: returns its input unchanged.

    Replace the body with the real logic.
    """
    return X #or X.values is to be passed dataframe
    
class OutlierHandler(BaseEstimator, TransformerMixin):
    """Cap outliers in selected columns at the IQR fences learned during fit.

    For each selected column the fences are ``Q1 - 1.5*IQR`` and
    ``Q3 + 1.5*IQR``; values outside them are replaced by the nearest fence.

    Parameters
    ----------
    featureindices : positional column indices to process; all other columns
        are returned untouched.

    Attributes set by ``fit``
    -------------------------
    top, bottom : arrays with the upper / lower fence of each selected column.
    """

    def __init__(self, featureindices): #if only specific columns to be processed
        self.featureindices = featureindices

    def fit(self, X:np.ndarray, y = None):
        """Learn the per-column fences from X; y is ignored. Returns self."""
        # np.asarray lets array-likes (e.g. a DataFrame handed over by a
        # previous pipeline step) be indexed positionally.
        selected = np.asarray(X)[:,self.featureindices]
        Q1s = np.quantile(selected,0.25,axis=0)
        Q3s = np.quantile(selected,0.75,axis=0)
        IQRs = Q3s-Q1s
        self.top=(Q3s + 1.5 * IQRs)
        self.bottom=(Q1s - 1.5 * IQRs)
        return self

    def transform(self, X:np.ndarray, y = None ):
        """Return a capped copy of X; the caller's array is left unmodified."""
        # Work on a copy: the previous version wrote into X itself, silently
        # altering the data the caller (or the cross-validation loop) still holds.
        capped = np.array(X)
        selected = capped[:,self.featureindices]
        selected = np.where(selected>self.top,self.top,selected)
        selected = np.where(selected<self.bottom,self.bottom,selected)
        capped[:,self.featureindices] = selected
        return capped

Let's build our pipeline. This is only a template; we need to enhance it. First we can try out only the numeric-ranged parameters in a very large space, and then narrow it down on the second run.

In [None]:
# Wide first-pass search space and halving settings.
c_range=np.logspace(4,-5, num=10)  # 10 values for C from 1e4 down to 1e-5
weights = np.arange(2,11,2)  # positive-class weights 2, 4, 6, 8, 10
min_res=10  # passed as min_resources to the first halving search
fact=3  # passed as factor to the first halving search

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV,RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

# Seed NumPy's global RNG; the searches below are created without a
# random_state, so they draw from this global state.
np.random.seed(42)

class DummyTransformer(TransformerMixin,BaseEstimator):
    """Placeholder pipeline step that passes the data through unchanged.

    It only reserves a named slot in the pipeline; the parameter grids assign
    a real transformer (or None) to that slot during the search.
    """

    def fit(self,X,y=None):
        # Return self: TransformerMixin.fit_transform chains
        # fit(...).transform(...), which fails if fit returns None.
        return self

    def transform(self,X,y=None):
        # Identity, so a pipeline still works if the placeholder is ever left in.
        return X
    
class DummyEstimator(BaseEstimator):
    """Placeholder for the final 'clf' step; the parameter grids supply the real classifier."""

    def fit(self,X,y=None):
        # Follow the estimator convention of returning self from fit.
        return self

    def score(self,X,y=None):
        # No model behind the placeholder, hence no score (unchanged behaviour).
        return None
    
# Nominal branch: fill missing values with the most frequent category, then one-hot encode.
# NOTE(review): combining drop="first" with handle_unknown='ignore' is rejected
# by older scikit-learn releases — confirm against the installed version.
cat_pipe=Pipeline([ 
                   ("csi", SimpleImputer(strategy="most_frequent")),
                   ("ohe", OneHotEncoder(drop="first",handle_unknown='ignore'))     
                  ])

# Numeric branch: imputation, then two placeholder slots ("ouh" outlier handling,
# "scl" scaling) that the parameter grids fill in.
# NOTE(review): numericImputer is not defined anywhere in the visible notebook —
# define it (or swap in an imputer class) before running this cell.
num_pipe=Pipeline([  
                   ("nsi", FunctionTransformer(numericImputer)), 
                   ("ouh", DummyTransformer()), 
                   ("scl", DummyTransformer())
                  ])

# Route each feature group through its own preprocessing branch; columns not
# named in any branch are passed through unchanged (remainder="passthrough").
# The ordinal branch now uses the `ords` list declared with the feature types
# instead of the hard-coded "ChestPain" column left over from another dataset.
# TODO: give OrdinalEncoder one ordered category list per column in `ords`.
coltrans = ColumnTransformer([
                                ('nominals',  cat_pipe, noms),
                                ('ordinals',  OrdinalEncoder(categories=[[]]), ords),
                                ('numerics',  num_pipe, nums)
                                 ],n_jobs=-1,remainder="passthrough")

# Full modelling pipeline: preprocessing -> feature selection -> classifier.
# 'clf' is a placeholder that every parameter grid overrides.
# NOTE(review): k=10 presumably requires at least 10 columns after encoding — adjust if fewer.
pipe = Pipeline(steps=[('ct', coltrans),
                       ('fs', SelectKBest(score_func=mutual_info_classif,k=10)), 
                       ('clf', DummyEstimator()) 
                       ])

# One grid per candidate model. Keys follow the <step>__<param> convention:
# 'clf' swaps the classifier itself, 'ct__numerics__ouh' / 'ct__numerics__scl'
# fill the placeholder slots of the numeric branch (None disables the step).
# Fix: the 'clf__class_weight' entries were missing their trailing comma,
# which made the whole cell a SyntaxError.
# NOTE(review): the "No"/"Yes" class labels are dataset-specific — adapt them.
params = [
          {
           'clf'         : [LogisticRegression(max_iter=1000,random_state=42)],
           'clf__C'      : c_range,
           'clf__penalty': ['l2'],
           'clf__solver' : ['newton-cg', 'lbfgs'],
           'clf__class_weight': [{"No":1, "Yes":x} for x in weights] + ['balanced'],
           'ct__numerics__ouh': [OutlierHandler(featureindices=[1]),None],
           'ct__numerics__scl': [StandardScaler(),RobustScaler()]
          } ,

          {
           'clf'         : [DecisionTreeClassifier(random_state=42)],
           'clf__criterion': ['gini', 'entropy'],
           'clf__max_depth': [2,3],
           'clf__min_samples_split':[2,4],
           'clf__class_weight': [{"No":1, "Yes":x} for x in weights] + ['balanced'],
           'ct__numerics__ouh': [None],
           'ct__numerics__scl': [None]
          }
         ]

# 5-fold CV repeated 10 times (50 fits per candidate and halving iteration).
mycv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
# First, wide search. error_score='raise' surfaces any failing candidate
# immediately instead of scoring it as NaN.
hrs1 = HalvingRandomSearchCV(estimator = pipe, param_distributions = params, cv = mycv, n_jobs=-1, verbose = 1, 
                           scoring = 'accuracy',error_score='raise',min_resources=min_res,factor=fact) 

hrs1.fit(X_train, y_train)

In [None]:
# Second, narrowed search.
# Fixes: (1) the narrowed range was assigned to `c_range` but read as
# `c_range2` (NameError) — it is now consistently `c_range2`; (2) the
# 'clf__class_weight' entries were missing their trailing comma (SyntaxError).
#new c_range
c_range2=[]  # TODO: fill with the narrowed C values suggested by the first run

params = [
          {
           'clf'         : [LogisticRegression(max_iter=1000,random_state=42)],
           'clf__C'      : c_range2,
           'clf__penalty': ['l2'],
           'clf__solver' : ['newton-cg', 'lbfgs'],
           'clf__class_weight': [{"No":1, "Yes":x} for x in weights] + ['balanced'],
           'ct__numerics__ouh': [OutlierHandler(featureindices=[1]),None],
           'ct__numerics__scl': [StandardScaler(),RobustScaler()]
          } ,

          {
           'clf'         : [DecisionTreeClassifier(random_state=42)],
           'clf__criterion': ['gini', 'entropy'],
           'clf__max_depth': [2,3],
           'clf__min_samples_split':[2,4],
           'clf__class_weight': [{"No":1, "Yes":x} for x in weights] + ['balanced'],
           'ct__numerics__ouh': [None],
           'ct__numerics__scl': [None]
          }
         ]

hrs = HalvingRandomSearchCV(estimator = pipe, param_distributions = params, cv = mycv, n_jobs=-1, verbose = 1,
                           scoring = 'accuracy',error_score='raise',min_resources=60,factor=3)

hrs.fit(X_train, y_train)

In [None]:
# Tabulate the CV results of the second search. `hrs2` was never defined;
# the fitted search object from the previous cell is `hrs`.
ml.gridsearch_to_df(hrs)

In [None]:
# Compare the candidate estimators as a table. `gs4` is a leftover name from
# another project and is undefined here; use the fitted search `hrs`.
ml.compareEstimatorsInGridSearch(hrs,tableorplot='table')

In [None]:
# Same comparison as a plot. `gs4` is undefined in this notebook; use the
# fitted search `hrs`.
ml.compareEstimatorsInGridSearch(hrs,tableorplot='plot',figsize=(4,4))

## Post-Check for assumptions

If your algorithms require post-check, do it

## Model Evaluation

In [None]:
# Learning curve of the refitted best pipeline. A bare `best_estimator_` is not
# a defined name — it is an attribute of the fitted search object. The plot
# title typo ("Learnig") is fixed as well.
ml.plot_learning_curve(hrs.best_estimator_,"Learning curve",X_train,y_train,cv=mycv)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Scratch cell: evaluates to the class object only; nothing is plotted here.
ConfusionMatrixDisplay

In [None]:
from sklearn.metrics import confusion_matrix
# y_pred was used without ever being computed: predict on the hold-out set
# with the refitted best pipeline of the search.
y_pred = hrs.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
# TODO: pass the class labels in display order instead of the empty list.
ml.plot_confusion_matrix(cm,classes=[])

In [None]:
from sklearn.metrics import classification_report
# Predict inside this cell so it does not depend on a `y_pred` variable that
# no cell defines.
print(classification_report(y_test,hrs.predict(X_test)))

In [None]:
#if not imbalanced
# `gs4` is undefined in this notebook; use the fitted search `hrs`.
# NOTE(review): pos_label="Yes" is dataset-specific — adapt to your labels.
ml.plotROC(y_test, X_test, hrs, pos_label="Yes")

In [None]:
# `gs4` is undefined in this notebook; use the fitted search `hrs`.
# NOTE(review): y_test_le (presumably a label-encoded y_test) is not created
# anywhere in the visible notebook — define it before running this cell.
ml.plot_precision_recall_curve(y_test_le,X_test,hrs)

In [None]:
from sklearn.metrics import plot_precision_recall_curve

In [None]:
# `gs4` is undefined in this notebook; use the fitted search `hrs`.
# NOTE(review): pos_label="Yes" is dataset-specific — adapt to your labels.
ml.plot_gain_and_lift(hrs,X_test,y_test,pos_label="Yes")

let's run the gridsearch with log loss optimization

In [None]:
# Re-run the search optimising log loss instead of accuracy, reusing the same
# pipeline, grids and CV splitter. This overwrites the previous `hrs`.
hrs = HalvingRandomSearchCV(estimator = pipe, param_distributions = params, cv = mycv, n_jobs=-1, verbose = 1, 
                           scoring = 'neg_log_loss',error_score='raise',min_resources=60,factor=3) 

hrs.fit(X_train, y_train)

In [None]:
# Tabulate the CV results of the log-loss search.
ml.gridsearch_to_df(hrs)

In [None]:
# Search for the best probability cut-off with the mypyext helper.
# NOTE(review): y_test_le is not defined in the visible notebook, and the
# meaning of [0,5,100,60] is set by the helper (the leading "0,5" may have
# been meant as 0.5) — confirm both before running.
ml.find_best_cutoff_for_classification(hrs, y_test_le, X_test,[0,5,100,60])    

## Feature Importance

In [None]:
#if linear model
# `gs4` is undefined in this notebook; use the fitted search `hrs`. "fs" and
# "clf" are the names of the selector and classifier steps of `pipe`.
ml.linear_model_feature_importance(hrs,coltrans,"fs","clf")

In [None]:
#if tree based
#ml.get_feature_names_from_columntransformer
# Fixed the helper name (the leading "f" was missing).
# NOTE(review): feature_importance_array and feature_names are not defined in
# the visible notebook — presumably the fitted classifier's
# feature_importances_ and the encoded column names; build them first.
ml.featureImportanceEncoded(feature_importance_array,feature_names)

## Interpretability

# Deployment