# Day 5

Random Forest

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.2f}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)


Accuracy: 0.96
Confusion Matrix:
 [[40  3]
 [ 1 70]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [2]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
from sklearn.model_selection import train_test_split

data = load_iris()
X = data.data
y = data.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42 ,stratify=y)


rf = RandomForestClassifier(n_estimators= 100 , random_state=42)
rf.fit(X,y)

y_pred = rf.predict(X_test)

acc = accuracy_score(y_test , y_pred)
cm  = confusion_matrix(y_test , y_pred)
report = classification_report(y_test , y_pred)

print(f"Accuracy Score : {acc :.2f}")
print("Confusion Matrix : \n",cm)
print("Classification Report: \n",report )

Accuracy Score : 1.00
Confusion Matrix : 
 [[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Gradient Boosting

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
X = data.data
y = data.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

# Train Gradient Boosting model
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)

# Predict
y_pred = gb.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.2f}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)


Accuracy: 1.00
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data
y = data.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

# Train Gradient Boosting model
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

gb.fit(X_train, y_train)

# Predict
y_pred = gb.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.2f}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)


Accuracy: 0.96
Confusion Matrix:
 [[40  3]
 [ 2 69]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

models = {
    "Model A": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "Model B": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
    "Model C": GradientBoostingClassifier(n_estimators=300, learning_rate=0.01, max_depth=5, random_state=42),
}

for model_name in models:
    print(f"\n===== {model_name} =====")
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test , y_pred)           # ✅ use y_test
    cm = confusion_matrix(y_test , y_pred)
    report = classification_report(y_test , y_pred)


    print(f"Accuracy score: {acc:.2f}")
    print("Confusion Matrix :",cm)
    print("Classification Report :",report)


===== Model A =====
Accuracy score: 0.96
Confusion Matrix : [[40  3]
 [ 2 69]]
Classification Report :               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


===== Model B =====
Accuracy score: 0.96
Confusion Matrix : [[40  3]
 [ 2 69]]
Classification Report :               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


===== Model C =====
Accuracy score: 0.95
Confusion Matrix : [[40  3]
 [ 3 68]]
Classification Report :               precision    recall 

XGBoost (Extreme Gradient Boost)

In [15]:
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
data = load_breast_cancer()
X, y = data.data, data.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost
model = XGBClassifier(objective='reg:logistic',n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)
print("result:",y_pred)
# Metrics
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.2f}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)


result: [1 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0
 1 1 0]
Accuracy: 0.96
Confusion Matrix:
 [[40  3]
 [ 2 69]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [20]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data
y = data.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state =42)

models = {
    "Model A": XGBClassifier(objective='binary:logistic',n_estimators = 100,learning_rate= 0.05,max_depth = 3,eval_metric = 'logloss',random_state =42),
    "Model B": XGBClassifier(objective='binary:logistic',n_estimators = 200,learning_rate= 0.5,max_depth = 4,eval_metric = 'error',random_state =42),
    "Model C": XGBClassifier(objective='binary:logistic',n_estimators = 300,learning_rate= 0.1,max_depth = 5,eval_metric = 'auc',random_state =42),
}

for model_name in models:
    print(f"\n===== {model_name} =====")
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test,y_pred)
    cm = confusion_matrix(y_test,y_pred)
    report = classification_report(y_test,y_pred)

    print(f"Accuracy Score : {acc:.2f}")
    print("Confusion Matrix:",cm)
    print("Classification Report :",report)


===== Model A =====
Accuracy Score : 0.96
Confusion Matrix: [[40  3]
 [ 2 69]]
Classification Report :               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


===== Model B =====
Accuracy Score : 0.96
Confusion Matrix: [[41  2]
 [ 2 69]]
Classification Report :               precision    recall  f1-score   support

           0       0.95      0.95      0.95        43
           1       0.97      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114


===== Model C =====
Accuracy Score : 0.96
Confusion Matrix: [[40  3]
 [ 2 69]]
Classification Report :               precision    recall 

In [24]:
from sklearn.datasets import load_iris
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

data = load_iris()
X,y = data.data , data.target

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

models={
    "Model A" : XGBClassifier(objective='multi:softmax',n_estimators=100,learning_rate=0.1,eval_metric='mlogloss',random_state=42,max_depth=3),
    "Model B" : XGBClassifier(objective='multi:softprob',n_estimators=200,learning_rate=0.05,eval_metric='merror',random_state=42,max_depth=4)
}

for model_name in models:
    print(f"===={model_name}=====\n")
    model=models[model_name]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print("Results:",y_pred)

    acc = accuracy_score(y_test,y_pred)
    cm = confusion_matrix(y_test,y_pred)
    report = classification_report(y_test,y_pred)

    print(f"Accuracy score : {acc:.2f}")
    print("Confusion MAtrix :",cm)
    print("Classification Report :",report)

====Model A=====

Results: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy score : 1.00
Confusion MAtrix : [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

====Model B=====

Results: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy score : 1.00
Confusion MAtrix : [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy       

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Load Titanic dataset from seaborn or local CSV
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preprocessing
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Convert categorical columns
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

# Drop unnecessary columns
df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1, inplace=True)

# Features and Target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Parameter grid
params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Step 3: Base model
xgb = XGBClassifier( eval_metric='logloss', random_state=42)

# Step 4: RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    n_iter=10,  # Try 10 combinations
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Step 5: Fit & Evaluate
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\n🔥 Best Parameters:", random_search.best_params_)
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧾 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits

🔥 Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

✅ Accuracy: 0.8156424581005587

🧾 Confusion Matrix:
 [[96  9]
 [24 50]]

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       105
           1       0.85      0.68      0.75        74

    accuracy                           0.82       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



In [4]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import pandas as pd


# Load Titanic dataset from seaborn or local CSV
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

#preprocessing
df['Age']=df['Age'].fillna(df['Age'].mean())
df['Embarked']=df['Embarked'].fillna(df['Embarked'].mode()[0])

#drop unnecessary columns
df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'],axis=1,inplace=True)

le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])


#Feature & Target
X = df.drop('Survived',axis=1)
y = df['Survived']

#split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#parameter grids
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

#Basic Model
xgbb = XGBClassifier(objective='binary:logistic', random_state=42)

#RandomizedCV
random_search = RandomizedSearchCV(
    estimator =xgbb,
    param_distributions=param_grid,
    n_iter = 20,
    scoring= 'accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=1
)

#training the Model
random_search.fit(X_train, y_train)

best_model= random_search.best_estimator_
y_pred= best_model.predict(X_test)

print("\nBest Parameters :",random_search.best_params_)
print("\nAccuracy Score:",accuracy_score(y_test,y_pred))
print("\nConfusion MAtrix:",confusion_matrix(y_test,y_pred))
print("\nClassification Report : ",classification_report(y_test,y_pred))



Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best Parameters : {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

Accuracy Score: 0.7988826815642458

Confusion MAtrix: [[91 14]
 [22 52]]

Classification Report :                precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

