<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/18_Okt_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('train.csv').drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
df.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25


In [4]:
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

In [6]:
X = df.drop(columns=['Survived'])
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

model = RandomForestClassifier(n_estimators=100,
                               max_depth=5,
                               random_state=1)

roc_cros = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

model.fit(X, y)

roc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
roc_train = roc_auc_score(y, model.predict_proba(X)[:, 1])

print("ROC AUC Test: ", roc_test)
print("ROC AUC Train: ", roc_train)
print("ROC AUC Cross: ", roc_cros.mean())

ROC AUC Test:  0.8969371930731456
ROC AUC Train:  0.9098760106094015
ROC AUC Cross:  0.8669429981079716


In [20]:
param_grid = {
    'random_state': [215, 225, 220]
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'random_state': 225}
Best score: 0.8572959383246642


In [24]:
param_grid = {
        'n_estimators': [50, 90, 100, 110, 200, 300]
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters: {'n_estimators': 100}
Best score: 0.8572959383246642


In [28]:
param_grid = {
        'max_depth': [None,1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30],
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 13 candidates, totalling 65 fits
Best parameters: {'max_depth': 5}
Best score: 0.866403568224111


In [32]:
param_grid = {
      'min_samples_split': [2, 5, 10],
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'min_samples_split': 2}
Best score: 0.866403568224111


In [33]:
param_grid = {
    'min_samples_leaf': [1, 2, 4],
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'min_samples_leaf': 1}
Best score: 0.866403568224111


In [34]:
param_grid = {
    'bootstrap': [True, False]
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'bootstrap': True}
Best score: 0.866403568224111


In [35]:
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            bootstrap=True)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'criterion': 'entropy'}
Best score: 0.8681211673301454


In [43]:
param_grid = {
    'min_impurity_decrease': [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.01, 0.02]
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            bootstrap=True,
                            criterion='entropy')

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'min_impurity_decrease': 0.005}
Best score: 0.8689202015919208


In [44]:
param_grid = {
    'max_features': ['sqrt', 'log2']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            bootstrap=True,
                            criterion='entropy',
                            max_features='sqrt')

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'max_features': 'sqrt'}
Best score: 0.8681211673301454


In [46]:
param_grid = {
    'class_weight': ['balanced', 'balanced_subsample']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            bootstrap=True,
                            criterion='entropy',
                            min_impurity_decrease=0.005,
                            max_features='sqrt')

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='roc_auc', cv=skf,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'class_weight': 'balanced'}
Best score: 0.8702343598874762


In [48]:
param_grid = {
    'class_weight': ['balanced', 'balanced_subsample']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=225,
                            n_estimators=100,
                            max_depth=5,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            bootstrap=True,
                            criterion='entropy',
                            min_impurity_decrease=0.005,
                            max_features='sqrt',
                            class_weight='balanced')



roc_cros = cross_val_score(rf, X, y, cv=5, scoring='roc_auc')

rf.fit(X, y)

roc_test = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
roc_train = roc_auc_score(y, rf.predict_proba(X)[:, 1])

print("ROC AUC Test: ", roc_test)
print("ROC AUC Train: ", roc_train)
print("ROC AUC Cross: ", roc_cros.mean())

ROC AUC Test:  0.8847893512535538
ROC AUC Train:  0.8997299715591346
ROC AUC Cross:  0.8635592266786688
