In [None]:
import warnings

# Silence library warnings up front so that import-time warnings are suppressed too.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as se
from catboost import CatBoostClassifier
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectKBest,f_regression,f_classif
from sklearn.metrics import ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name available on older versions.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


### Data Fetch
 Pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data manipulation and data analysis tools.

In [None]:
# Data Fetch: read the dataset from a CSV file into a DataFrame
# file=''
df = pd.read_csv('encoded_df.csv')
# Preview the first rows
df.head()


### Feature Selection
 Feature selection is the process of reducing the number of input variables when developing a predictive model. It is used to reduce the computational cost of modelling and, in some cases, to improve the performance of the model.

In [None]:
# Selected Columns (placeholders: replace with the real column names)
features = ['LIST OF FEATURES/COLUMN NAMES']
target = 'TARGET COLUMN NAME'

# Split the frame into the feature matrix X and the target Y
X, Y = df[features], df[target]


### Data Encoding
 Converting the string-valued class data in the dataset to integers by encoding it, using either OneHotEncoding or LabelEncoding.

In [None]:
# Handling AlphaNumeric Features: one-hot encode them with pandas
X = pd.get_dummies(data=X)


### Correlation Matrix
 In order to check the correlation between the features, we will plot a correlation matrix. It is effective in summarizing a large amount of data where the goal is to see patterns.

In [None]:
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = X.corr()
# Boolean upper-triangle mask (diagonal included). Building it from ones rather than
# from the correlation values keeps cells whose correlation is exactly 0 hidden as well.
matrix = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(18, 18))
se.heatmap(corr, annot=True, linewidths=.5, fmt='.1f', ax=ax, mask=matrix)
plt.show()


### Multicollinearity Test
 Dropping highly correlated features due to their similar feature distributions.


In [None]:
def dropHighCorrelationFeatures(X, threshold=0.95):
    """Drop one feature from every pair whose correlation exceeds ``threshold``.

    Only the strict upper triangle of the correlation matrix is inspected, so for
    each highly correlated pair the later column is dropped and the earlier kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; correlations are computed with ``X.corr()``.
    threshold : float, default 0.95
        A column is removed when its correlation with any earlier column is
        strictly greater than this value. Negative correlations are not considered.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the redundant columns (``X`` itself when nothing is dropped).
    """
    cor_matrix = X.corr()
    # The built-in ``bool`` replaces the ``np.bool`` alias, which was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
    upper_tri = cor_matrix.where(upper_mask)
    # NaN entries (lower triangle / diagonal) compare False and are therefore ignored.
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > threshold).any()]
    if not to_drop:
        return X
    return X.drop(columns=to_drop)
# Remove the redundant (highly correlated) columns and preview the result
X = dropHighCorrelationFeatures(X)
X.head()

### Best Feature Selection
 Selecting the 'n' best features on the basis of the ANOVA or Univariate Linear Regression test, where ANOVA is used for classification problems and Univariate Linear Regression for regression problems.


In [None]:

def get_feature_importance(X, Y, score_func, threshold=0.01):
    """Score every feature, plot the scaled scores and drop the weakest features.

    Each column of ``X`` is scored against ``Y`` with ``score_func`` through
    ``SelectKBest``. The raw scores are min-max scaled to [0, 1], shown as a
    horizontal bar chart, and every feature whose scaled score is below
    ``threshold`` is removed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    Y : array-like
        Target values aligned with the rows of ``X``.
    score_func : callable
        Univariate scoring function accepted by ``SelectKBest``
        (for example ``f_classif`` or ``f_regression``).
    threshold : float, default 0.01
        Minimum scaled score a feature needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the features that scored below ``threshold``.
    """
    # k='all' keeps every feature; only the fitted scores are used here.
    selector = SelectKBest(score_func=score_func, k='all').fit(X, Y)
    raw_scores = np.asarray(selector.scores_, dtype=float).reshape(-1, 1)
    # Rescale so the cut-off is relative to the best and worst scoring features.
    scaled_scores = MinMaxScaler().fit_transform(raw_scores).ravel()
    # Stable ascending sort so tied features keep their column order in the chart.
    importance = pd.Series(scaled_scores, index=X.columns, name='Score').sort_values(kind='mergesort')

    weak_features = list(importance.index[importance < threshold])
    X = X.drop(columns=weak_features)

    positions = range(len(importance))
    plt.figure(figsize=(12, 6))
    plt.barh(positions, list(importance.values), align='center')
    plt.yticks(positions, list(importance.index))
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
    return X
# Classification target, so features are scored with the ANOVA F-test (f_classif)
X = get_feature_importance(X, Y, score_func=f_classif)
        

### Data Rescaling
 Feature scaling or Data scaling is a method used to normalize the range of independent variables or features of data. In data processing, it is also known as data normalization.

In [None]:
# Remember the column labels: the scaler returns a bare array without them
columns = X.columns
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down, so test rows contribute to the scaling statistics.
X = pd.DataFrame(data=RobustScaler().fit_transform(X), columns=columns)
X.head()


### Train & Test
 The train-test split is a procedure for evaluating the performance of an algorithm. The procedure involves taking a dataset and dividing it into two subsets. The first subset is utilized to fit/train the model. The second subset is used for prediction. The main motive is to estimate the performance of the model on new data.

In [None]:
# Data split for training and testing: 80 % train, 20 % test, fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)


### Target Balancing
 The SMOTEENN method combines SMOTE's ability to generate synthetic examples for the minority class with ENN's ability to delete observations from both classes whose class differs from the majority class of their K nearest neighbours.

In [None]:
# resampling target: only the training split is balanced.
# The resampler is seeded with the same value as the train/test split so that
# re-running the notebook yields the same resampled training set.
resample = SMOTEENN(random_state=123)
X_train, Y_train = resample.fit_resample(X_train, Y_train)

### Feature Transformation
  Feature transformation is a mathematical transformation in which we apply a mathematical formula to data and transform the values which are useful for our further analysis.

In [None]:
# Fit the power transform on the training data only, then reuse it for the test data
powertransformer = PowerTransformer()
X_train = powertransformer.fit_transform(X_train)
X_test = powertransformer.transform(X_test)

### Model
            
CatBoost is an algorithm for gradient boosting on decision trees. Developed by Yandex researchers and engineers, it is the successor of the MatrixNet algorithm that is widely used within the company for ranking tasks, forecasting and making recommendations

#### Tuning parameters

1. **learning_rate**: The learning rate. Used for reducing the gradient step.

2. **l2_leaf_reg**: Coefficient at the L2 regularization term of the cost function. Any positive value is allowed.

3. **bootstrap_type**: Bootstrap type. Defines the method for sampling the weights of objects.
    
4. **subsample**: Sample rate for bagging. This parameter can be used if one of the following bootstrap types is selected: Poisson, Bernoulli, MVS.

For more information refer: [API](https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier)

In [None]:
# Model Initialization: CatBoost classifier with its default hyper-parameters
model = CatBoostClassifier()
# Train on the resampled, transformed training split
model.fit(X=X_train, y=Y_train)


### Accuracy Metrics
 Performance metrics are a part of every machine learning pipeline. They tell you if you're making progress, and put a number on it. All machine learning models, whether it's linear regression or a SOTA technique like BERT, need a metric to judge performance.

In [None]:
# Predict once and reuse the predictions for both the matrix and the report.
# np.ravel flattens a possible column-vector output into a 1-D label array.
Y_pred = np.ravel(model.predict(X_test))
# Confusion Matrix
if hasattr(ConfusionMatrixDisplay, 'from_predictions'):
    # plot_confusion_matrix was removed in scikit-learn 1.2; this is its replacement.
    ConfusionMatrixDisplay.from_predictions(Y_test, Y_pred, cmap=plt.cm.Blues)
else:
    # scikit-learn < 1.0 only ships the legacy helper.
    plot_confusion_matrix(model, X_test, Y_test, cmap=plt.cm.Blues)
plt.show()
# Classification Report
print(classification_report(Y_test, Y_pred))
