In [1]:
# 20-11-2023, Oren - Classification Project Notebook
# Re-Starting using existing sklearn pre-processing classes

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from pandas import read_csv
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, \
    AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore")
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, recall_score, precision_score, f1_score
from xgboost import XGBClassifier

def csv_drive_path_generatoer(url):
 '''
 Help in read csv file directly from google drive.
 Make sure the csv format is standard.
 url:str - path to csv file example:
   url = 'https://drive.google.com/file/d/126JPZ3lYwdLyJ2d_7jxM9jMtZaOlF-Ld/view?usp=sharing'
 return : str
 '''
 path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
 return path

In [3]:
link_X_train = 'https://drive.google.com/file/d/11Dgctv-N6z3ugQOpRrFNKVP5NNqPPWxK/view?usp=drive_link'
link_y_train = 'https://drive.google.com/file/d/1NDF2aCymR4zjYK9mLYNRwIdB_EyE5ZHl/view?usp=drive_link'

link_X_test = 'https://drive.google.com/file/d/1ZRg80tYdBO1pcd_6R1m_tiijZSfDXMZ6/view?usp=drive_link'

path_X_train = csv_drive_path_generatoer(link_X_train)
path_y_train = csv_drive_path_generatoer(link_y_train)

path_X_test = csv_drive_path_generatoer(link_X_test)

X_train = read_csv(path_X_train, index_col = 0)
y_train = read_csv(path_y_train, index_col = 0)

X_test = read_csv(path_X_test, index_col = 0)

# Drop the extra 'id' column
X_train = X_train.drop(['id'], axis = 1)
X_test = X_test.drop(['id'], axis = 1)

# Dropping null values
train_data = X_train.join(y_train).dropna()
X_test = X_test.dropna()

# X_train, y_train:
X_train = train_data.drop(['LeaveOrNot'], axis = 1)
y_train = train_data['LeaveOrNot']

# y_test_dummy = pd.DataFrame(data = np.zeros(len(X_test.index)), index = X_test.index, dtype = 'int64', columns = ['y_test_dummy'])

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3720 entries, 1847 to 2575
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Education                  3720 non-null   object 
 1   JoiningYear                3720 non-null   int64  
 2   City                       3720 non-null   object 
 3   PaymentTier                3720 non-null   int64  
 4   Age                        3720 non-null   float64
 5   Gender                     3720 non-null   object 
 6   EverBenched                3720 non-null   object 
 7   ExperienceInCurrentDomain  3720 non-null   int64  
 8   Race                       3720 non-null   object 
 9   LeaveOrNot                 3720 non-null   int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 319.7+ KB


In [5]:
# Defines custom transformer for feature engineering 

class FeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y = None):
        self.y = y
        return self

    def transform(self, X, column = 'JoiningYear', new_feature = 'Tenure', current_year = 2023, index = 'id'):
        # Check if 'JoiningYear' is present in the input data
        if column not in X.columns:
            raise ValueError(f"Column {column} not found in input data.")
        
        X[new_feature] = current_year - X[column]
        
        X = X.drop([column], axis = 1)
        return X
    
# Defines a custom transformer to drop rows with null values based on a column_name

class RemoveNullRowsTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, null_val_columns = ['Age']):
        self.null_val_columns = null_val_columns
        # self.index = index

    def fit(self, X, y = None):
        self.y = y
        return self

    def transform(self, X):
    # joining X and y before dropping rows with null values
        if self.y is not None:
            y = self.y
            # X = X.set_index(self.index)
            # y = y.set_index(self.index)
            X = X.join(y)
            
        return X.dropna(subset= self.null_val_columns)

In [6]:
# fe = FeatureEngineerAndNullsDropper()
# # print(X_train.info())
# fe.fit_transform(X_train.join(y_train)).info()

In [7]:
# Mastery code...
# define model
# model = LogisticRegression()
# model = DecisionTreeClassifier()

# Specify the order of categories for the 'Education' feature
Education_order = ['Bachelors', 'Masters', 'PHD']

# Specify the target columns per Transformer
null_val_columns = ['Age','Gender']
column = ['JoiningYear']
ordinal_category_features = ['Education']
non_binary_categorical_features = ['City', 'Race']
binary_categorical_features = ['Gender','EverBenched'] 

# Define different types of transformer classes
remove_null_rows_transformer = RemoveNullRowsTransformer()
feature_engineer_transformer = FeatureEngineer()
binary_categorical_transformer = OneHotEncoder(drop='if_binary')
non_binary_categorical_transformer = OneHotEncoder()
ordinal_transformer = OrdinalEncoder(categories=[Education_order])

# Define a null rows Transformer
remove_null_rows_transformer = RemoveNullRowsTransformer()

# Define a ColumnTransformer (for the rest of the pre-processing steps)
column_transformer = ColumnTransformer(
        transformers= [
            # ('remove_null_rows_transformer',remove_null_rows_transformer, null_val_columns),
        ('nulls_droper_engineer',feature_engineer_transformer, column),
        ('binary_cat', binary_categorical_transformer, binary_categorical_features),
        ('non_binary_cat', non_binary_categorical_transformer, non_binary_categorical_features),
        ('ordinal_cat', ordinal_transformer, ordinal_category_features)], 
        remainder='passthrough') # passthrough all columns not explicitly transformed

In [8]:
# Define a model
model = GradientBoostingClassifier()

# Create a pipeline
pipeline = Pipeline([
    ('null_remover', remove_null_rows_transformer),  # First transformer step
    ('column_transformer',column_transformer),  # Second transformer step
    ('model',model) 
])

# Fit the pipeline on the training data
trained_pipe = pipeline.fit(X_train, y_train)

# transform the test data
transformed_X_test = trained_pipe.transform(X_test)


AttributeError: This 'Pipeline' has no attribute 'transform'

In [9]:
dir(trained_pipe)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_is_fitted__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_can_fit_transform',
 '_can_inverse_transform',
 '_can_transform',
 '_check_feature_names',
 '_check_fit_params',
 '_check_n_features',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_iter',
 '_log_message',
 '_more_tags',
 '_parameter_constraints',
 '_replace_estimator',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_m

In [None]:
# GPT

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model (replace with your actual evaluation metric)
# Here, I'm using cross_val_score as an example
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_score = -cv_scores.mean()

print(f'Mean Cross-Validated MSE: {mean_cv_score}')

In [None]:
def plot_roc_curve(X_train, X_test, y_train, y_test, ensemble_classifier):
    # generate a no skill prediction (majority class)
    ns_probs = [0 for _ in range(len(y_test))]

    # clf = DecisionTreeClassifier()
    # model_a_bagging = BaggingClassifier(estimator = base_model, n_estimators= 100)

    # fitting the models
    ensemble_classifier.fit(X_train, y_train)

    # predict probabilities
    probas = ensemble_classifier.predict_proba(X_test)

    # keep probabilities for the positive outcome only
    probas = probas[:,1]

    # calculate rocs auc scores
    auc = roc_auc_score(y_test, probas)

    # calculate roc curves
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs, pos_label = 1)
    fpr, tpr, _ = roc_curve(y_test, probas, pos_label = 1)

    # plot the roc curve for the model

    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label=f"{ensemble_classifier}, auc = {auc:.3f}")

    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # show the legend
    plt.legend(fontsize = 8)


In [None]:
plot_roc_curve(X_train, X_test, y_train, y_test, BaggingClassifier(estimator = DecisionTreeClassifier(max_depth = 7), n_estimators = 100))

In [None]:
clf_adaboost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 7), n_estimators = 100,
                                  learning_rate=0.01)
clf_adaboost.fit(X_train, y_train)
print(f"DT ADA boosting classifier:\n \
    \ttrain accuracy: {clf_adaboost.score(X_train, y_train):.2f}\n \
    \ttest accuracy: {clf_adaboost.score(X_test, y_test):.2f}")

In [None]:
# pip install xgboost

In [None]:
clf_xgb = XGBClassifier()

In [None]:
clf_xgb.get_params()

In [None]:
# rfc = RandomForestClassifier()
# rfc_params = {'max_depth': [5,10,15,20,40,100], 'max_features': [2,3,4,5,6,7,8,9]}
# gs_rfc = GridSearchCV(rfc, rfc_params, cv = 5, scoring='accuracy')
# gs_rfc.fit(X_train, y_train)
# print(gs_rfc.best_params_)

In [None]:
# gs_rfc.best_score_

In [None]:
bagging_clf = BaggingClassifier(estimator = DecisionTreeClassifier(max_depth = 7))
bag_params = {'bootstrap': [True,False], 'bootstrap_features': [False,True],\
              'max_features': [6,7,8],
                'oob_score': [True,False],
                 'warm_start': [True]}


gs_bag = GridSearchCV(bagging_clf, bag_params, cv = 5, scoring = 'f1')
gs_bag.fit(X_train, y_train)
gs_bag.best_score_

# bagging_clf.get_params()
# bagging_params = {'max_depth': [5,10,15,20,40,100], 'max_features': [2,3,4,5,6,7,8,9]}
# gs_rfc = GridSearchCV(rfc, rfc_params, cv = 5, scoring='f1')
# gs_rfc.fit(X_train, y_train)
# print(gs_rfc.best_params_)

In [None]:
# gs_bag.best_params_

In [None]:
# BaggingClassifier(estimator = DecisionTreeClassifier(max_depth = 7), n_estimators = 100).get_params()

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# import scipy.stats as stats

# Define the hyperparameter distributions
param_dist = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.00001, 0.01),
    'subsample': stats.uniform(0.5, 0.5),
    'n_estimators':stats.randint(50, 200)
}

# Create the XGBoost model object
xgb_model = XGBClassifier()

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy')

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from pandas import read_csv
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, \
    AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

f1s, precisions, recalls, accuracies, classifiers_as_str = [],[],[],[], []
features, i_s = [], []
# classifiers = [XGBClassifier(learning_rate = 0.009956053586134729, max_depth = 6, n_estimators = 76, subsample = 0.6838981373746974), AdaBoostClassifier,\
#             KNeighborsClassifier(), LogisticRegression(), DecisionTreeClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]

classifiers = [DecisionTreeClassifier(),KNeighborsClassifier(), LogisticRegression(),GradientBoostingClassifier(), RandomForestClassifier(),\
               AdaBoostClassifier(),XGBClassifier(learning_rate = 0.01, max_depth = 7, n_estimators = 100, subsample = 0.786)]
              
for classifier in classifiers:
    # for i in range(1,len(feature_imp)+1):
    #     X_train = X_train[feature_imp[:i]]
    #     X_train.columns = [feature_imp[:i]
    #     # X_test = X_test[feature_imp[:i]]
    #     print(X_train)
    #     # print(X_test)
        
    model = classifier
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_test_pred)
    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    accuracy = accuracy_score(y_test, y_test_pred)
    f1s.append(f1.round(3))
    precisions.append(precision.round(3))
    recalls.append(recall.round(3))
    accuracies.append(accuracy.round(3))
    # i_s.append(i)
    classifiers_as_str.append(str(classifier).split('(')[0])

report = pd.DataFrame({'classifier': classifiers_as_str,'accuracy': accuracies, 'recall':recalls,'precision':precisions,'f1_score': f1s})
print(report.sort_values(by = 'f1_score', ascending=False))

In [None]:
# Inspect the XGBClassifer confusion matrix:
model = XGBClassifier()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
xgb_cm = pd.DataFrame(data = confusion_matrix(y_test, y_test_pred), index = target_names, columns = target_names)
print(xgb_cm)

In [None]:
# To do 17-11-2023 (Raz S):
# 1. cm of the different models
# 2. Actual vs. predicted probas
# 3. Look at features of false positive vs. true positive
# 4. Balance_classes (bool)
# 5. Class weights (Loss handling)