In [1]:
from sklearn.metrics import classification_report, plot_roc_curve

def print_roc(clf, X_test, y_test):
    """
    Print a classification report for ``clf`` and plot its ROC curve.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``; it is also handed to
        ``plot_roc_curve``.
    X_test, y_test : evaluation features and true labels.

    Returns
    -------
    None. The report goes to stdout and the figure is shown with ``plt.show()``.

    NOTE: ``plot_roc_curve`` was removed in scikit-learn 1.2; on newer
    versions use ``RocCurveDisplay.from_estimator(clf, X_test, y_test)``.
    """
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    plot_roc_curve(clf, X_test, y_test)
    # Chance diagonal from (0, 0) to (1, 1). x and y are passed explicitly:
    # a single list of points would be read as a 2-column y array and drawn
    # as two lines against the row index.
    plt.plot([0, 1], [0, 1], '--y')
    plt.title('ROC curve')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.show()

In [None]:
import itertools
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
def plot_confusion_matrix(model, X_test, y_test, normalize=False, cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of ``model`` on ``(X_test, y_test)``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``classes_``.
    X_test, y_test : evaluation features and true labels.
    normalize : when True every row is divided by its total, so each cell
        shows the fraction of that true class. Rows with no samples stay 0.
    cmap : matplotlib colormap passed to ``imshow``.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    y_predict = model.predict(X_test)
    # Pin the row/column order to the model's own class order so the tick
    # labels below always match the matrix, even when a class is missing
    # from y_test or from the predictions.
    classes = model.classes_
    cm = confusion_matrix(y_test, y_predict, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # An all-zero row divided by 1 stays all-zero instead of becoming nan.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        title = 'Normalized confusion matrix'
    else:
        title = 'Confusion matrix, without normalization'
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # NOTE(review): this changes numpy's global print precision although
    # nothing here prints an array; kept so existing behaviour is unchanged.
    np.set_printoptions(precision=2)
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out only after the axis labels exist so they are not clipped.
    plt.tight_layout()
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
def data_split(data, features, target, split_type='random', test_size=0.2, random_state=42):
    """
    Split ``data`` into X_train, X_test, y_train and y_test.

    Parameters
    ----------
    data : table indexable by column label(s), e.g. a pandas DataFrame.
    features : list of column labels used as model inputs.
    target : column label of the value to predict.
    split_type : splitting strategy; only ``'random'`` is implemented.
    test_size : forwarded to ``train_test_split``.
    random_state : seed forwarded to ``train_test_split`` (default 42).

    Returns
    -------
    (X_train, X_test, y_train, y_test)

    Raises
    ------
    ValueError
        If ``split_type`` is not one of the supported strategies.
    """
    split_types_available = ['random']
    # Validate before touching the data so a bad argument fails fast.
    if split_type not in split_types_available:
        raise ValueError(
            f"split_type {split_type!r} is not supported; "
            f"expected one of {split_types_available}"
        )
    X = data[features]
    y = data[target]
    # 'random' is the only strategy that passes the validation above.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# categorical feature selection: keep the k best features (chi-square)

from sklearn.feature_selection import SelectKBest, chi2

k = 15
original_features = set(X_train.columns)
fs = SelectKBest(score_func=chi2, k=k)
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# Ask the fitted selector which columns it kept instead of re-deriving them
# from the scores: an ascending argsort sliced with [k:] names the wrong
# columns. get_support() also covers k='all' (nothing is dropped).
dropped_features = list(X_train.columns[~fs.get_support()])
print(f'The features that were dropped with the Chi-square method were: {dropped_features}')


In [None]:
# categorical feature selection by percentile

from sklearn.feature_selection import SelectPercentile, chi2

percentile = 10
original_features = set(X_train.columns)
fs = SelectPercentile(score_func=chi2, percentile=percentile)
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# The selector's own mask is the source of truth. Recomputing the cut-off
# with np.percentile can disagree with it when scores tie at the threshold.
kept_mask = fs.get_support()
dropped_features = list(X_train.columns[~kept_mask])
kept_features = list(X_train.columns[kept_mask])
print(f'The features that were dropped are: {dropped_features}')
print(f'The features that were kept are: {kept_features}')

In [None]:
# feature selection using DecisionTreeClassifier

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the recursive elimination picks the same features on
# every run (42 matches the seed used elsewhere in this notebook).
rfe = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=10)
rfe.fit(X_train, y_train)
X_train_fs = rfe.transform(X_train)
X_test_fs = rfe.transform(X_test)

# rfe.support_ is a boolean mask over the training columns.
dropped_features = list(X_train.columns[~rfe.support_])
kept_features = list(X_train.columns[rfe.support_])

print(f'The features that were dropped are: {dropped_features}')
print(f'The features that were kept are: {kept_features}')

In [None]:
# simple Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
# Seeded so the fitted forest, and every metric computed from it further
# down, is reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# plotting ROC curve and metrics

from sklearn.metrics import classification_report, plot_roc_curve
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# Use the print_roc helper defined at the top of the notebook instead of
# repeating its body. It prints the classification report and draws the
# ROC curve with the chance diagonal, title and axis labels.
print_roc(clf, X_test, y_test)

In [None]:
# fitting and printing regression models

from sklearn.metrics import mean_squared_error, r2_score

def train_print_metrics(reg, X_train, y_train, X_test, y_test):
    """
    Fit ``reg`` on the training data and print its test-set metrics.

    Prints, in order: the estimator's class name, the mean squared error and
    the R^2 score of its predictions on ``X_test``. ``reg`` is fitted in
    place; nothing is returned.
    """
    model_name = type(reg).__name__
    print(model_name)

    reg.fit(X_train, y_train)
    predictions = reg.predict(X_test)

    # Each metric is computed and printed in turn, MSE first.
    reports = (
        ('mean squared error', mean_squared_error),
        ('R squard score', r2_score),
    )
    for label, metric in reports:
        print(f'{label} {metric(y_test, predictions)}')

In [None]:
# Plotting precision and recall based on threshold
from sklearn.metrics import precision_recall_curve

# Score the test set and keep the second predict_proba column
# (presumably the positive class of a binary classifier — TODO confirm).
class_probas = clf.predict_proba(X_test)
y_probs = class_probas[:, 1]
# Precision and recall for the thresholds derived from those scores.
(precision, recall, thresholds) = precision_recall_curve(y_test, y_probs)

def linear_inter(x1, y1, x2, y2, x):
    """
    Linearly interpolate the line through (x1, y1) and (x2, y2) at ``x``.

    Returns y1 at x == x1 and y2 at x == x2. ``x1`` and ``x2`` must differ,
    otherwise the slope is undefined (division by zero).
    """
    m = (y2 - y1) / (x2 - x1)
    # Offset is measured from the anchor point (x1, y1).
    y = y1 + (x - x1) * m
    return y


recall_ = 0.7
# precision_recall_curve returns recall in decreasing order, so the first
# point that falls below the target closes the segment that brackets it.
index = next((i for i, rec in enumerate(recall) if rec < recall_), None)
if not index:
    # None: the curve never drops below the target.
    # 0: even the first point is below it, so there is no left neighbour.
    raise ValueError(f'recall={recall_} is not bracketed by the precision-recall curve')

precision_ = linear_inter(recall[index-1], precision[index-1], recall[index],
             precision[index], recall_)
if index < len(thresholds):
    threshold_ = linear_inter(recall[index-1], thresholds[index-1], recall[index],
                 thresholds[index], recall_)
else:
    # thresholds is one element shorter than recall: the final curve point
    # has no threshold of its own, so report the last one that still
    # reaches the target recall.
    threshold_ = thresholds[index-1]
print(f'The precision on (recall={recall_}) is {precision_}')
print(f'The threshold on (recall={recall_}) is {threshold_}')

# Drop the final precision/recall point so both series line up with thresholds.
plt.plot(thresholds, precision[:-1], label='precision')
plt.plot(thresholds, recall[:-1], label='recall')
plt.xlabel('threshold')
plt.ylabel('score')
plt.title('Precision and recall vs. threshold')
plt.legend()
plt.show()

In [None]:
# cross validation k-Fold

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_score iterates over kf.split itself, so no manual fold loop is
# needed. The previous loop referenced an undefined `y` and threw its
# slices away.
final_score = cross_val_score(reg, X_train, y_train, cv=kf, scoring="accuracy")
print(f'Scores for each fold: {final_score}')
print('Final Model Score: %.2f' % final_score.mean())

In [None]:
# grid search CV

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LogisticRegression

cls = LogisticRegression()

# LogisticRegression rejects most penalty/solver pairings, so the grid is
# split per penalty and lists only the solvers that support it. A single
# cross-product dict spends most fits on combinations that fail.
tol_grid = [10**-5, 10**-4, 10**-3]
C_grid = [1.5, 1, 0.7]
params_dict = [
    {'penalty': ['l2'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'penalty': ['l1'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['liblinear', 'saga']},
    # elasticnet is only available with saga and needs an explicit l1_ratio.
    {'penalty': ['elasticnet'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['saga'], 'l1_ratio': [0.5]},
]

# Report the estimator that is actually being tuned in this cell.
print(f'{type(cls).__name__} Tuning hyper-parameters with grid')

# Seeded so the validation splits, and therefore the winner, are reproducible.
ss_cv = ShuffleSplit(n_splits=5, random_state=42)

clf_forest = GridSearchCV(cls, params_dict, cv=ss_cv, verbose=10, n_jobs=-1)

clf_forest.fit(X_train, y_train)

print("Best parameters set found on validation set:")
print(clf_forest.best_params_, '\n')

In [None]:
# Random search CV

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()

# LogisticRegression rejects most penalty/solver pairings, so candidates are
# grouped per penalty and list only the solvers that support it.
tol_grid = [10**-6, 10**-5, 10**-4, 10**-3]
C_grid = [2, 1.5, 1, 0.7]
params_dict = [
    {'penalty': ['l2'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'penalty': ['l1'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['liblinear', 'saga']},
    # elasticnet is only available with saga and needs an explicit l1_ratio.
    {'penalty': ['elasticnet'], 'tol': tol_grid, 'C': C_grid,
     'solver': ['saga'], 'l1_ratio': [0.5]},
]

print(f'{type(reg).__name__} Tuning hyper-parameters with randomized search')

# Seeded so the validation splits are reproducible.
ss_cv = ShuffleSplit(n_splits=5, random_state=42)

# Only 128 valid combinations remain (80 + 32 + 16). n_iter is lowered from
# 200 to 100 so the search stays a random sample rather than exhaustive.
clf_forest = RandomizedSearchCV(reg, params_dict, random_state=42,
                                cv=ss_cv, verbose=10, n_iter=100, n_jobs=-1)

clf_forest.fit(X_train, y_train)

print("Best parameters set found on validation set:")
print(clf_forest.best_params_, '\n')

In [None]:
# Dimensionality Reduction with PCA

from sklearn.decomposition import PCA

def plot_PCA_curve(X_train):
    """
    Plot cumulative explained variance against the number of PCA components.

    A full PCA (all components) is fitted on ``X_train``. The curve shows how
    much variance is retained when keeping the first 1, 2, ... components.
    """
    pca = PCA()
    pca.fit(X_train)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # Entry i of the cumulative sum covers i + 1 components, so the x axis
    # starts at 1 rather than at the array index 0.
    component_counts = np.arange(1, len(cumulative_variance) + 1)
    plt.figure(figsize=(7, 7))
    plt.plot(component_counts, cumulative_variance)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')


def get_n_pca_components(n, X_train, X_test):
    """
    Project both splits onto the first ``n`` principal components.

    The PCA is fitted on ``X_train`` only and then applied to both splits.
    Returns the tuple ``(X_train_t, X_test_t)``.
    """
    reducer = PCA(n_components=n)
    reducer.fit(X_train)
    # Train split first, then test split, both through the same projection.
    projected = [reducer.transform(split) for split in (X_train, X_test)]
    return projected[0], projected[1]