# Import Libraries and Data Set

In [37]:
import pandas as pd
import numpy as np
from cenfo.obliging import information, report_regression

In [38]:
df0 = pd.read_csv('../../Machine_Learning_Algorithms/C_Datasets/Co2_Emission.csv')
df0.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


# Preprocessing

- drop columns: 'MODELYEAR', 'MAKE', 'MODEL'
- Seperate TRANSMISSION to two different columns with by numbers and letters

### Import Libraries

In [39]:
def preprocessing(X):
    X['TRANSMISSION'] = X.TRANSMISSION.replace('AV', 'AV0')
    return (
                X
                .assign(TRANSMISSION_TYPE = X.TRANSMISSION.apply(lambda x: x[:-1]),
                        TRANSMISSION_NUM = X.TRANSMISSION.apply(lambda x: x[-1]).astype('int'))
                .drop(['MODELYEAR', 'MAKE', 'MODEL', 'TRANSMISSION'], axis=1)
            )

In [40]:
df = preprocessing(df0)

In [41]:
df.head()

Unnamed: 0,VEHICLECLASS,ENGINESIZE,CYLINDERS,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS,TRANSMISSION_TYPE,TRANSMISSION_NUM
0,COMPACT,2.0,4,Z,9.9,6.7,8.5,33,196,AS,5
1,COMPACT,2.4,4,Z,11.2,7.7,9.6,29,221,M,6
2,COMPACT,1.5,4,Z,6.0,5.8,5.9,48,136,AV,7
3,SUV - SMALL,3.5,6,Z,12.7,9.1,11.1,25,255,AS,6
4,SUV - SMALL,3.5,6,Z,12.1,8.7,10.6,27,244,AS,6


In [42]:
df.dtypes

VEHICLECLASS                 object
ENGINESIZE                  float64
CYLINDERS                     int64
FUELTYPE                     object
FUELCONSUMPTION_CITY        float64
FUELCONSUMPTION_HWY         float64
FUELCONSUMPTION_COMB        float64
FUELCONSUMPTION_COMB_MPG      int64
CO2EMISSIONS                  int64
TRANSMISSION_TYPE            object
TRANSMISSION_NUM              int32
dtype: object

# Split Data Set

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X = df.drop('CO2EMISSIONS', axis=1)
y = df[['CO2EMISSIONS']]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define Dtypes

In [46]:
numerical_cols = X.select_dtypes('number').columns
categorical_cols = X.select_dtypes('object').columns

In [47]:
from sklearn.ensemble import GradientBoostingClassifier as boosting

# Create Pipelines for ColumnTransformer

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [49]:
numerical_pipe = Pipeline([('scaler', StandardScaler())])
categorical_pipe = Pipeline([('encode', OneHotEncoder(handle_unknown='ignore'))])

# Create ColumnTransformer

In [50]:
from sklearn.compose import ColumnTransformer

In [51]:
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipe, numerical_cols),
    ('categorical', categorical_pipe, categorical_cols)
])

# Create Pipeline for Models

In [52]:
def pipe(model):
    return Pipeline([
                ('preprocessor', preprocessor),
                ('model', model)
            ])

In [None]:
def model_report(X):
    model = X
    pipe_model = pipe(model)
    pipe_model.fit(X_train, y_train)
    y_pred = pipe_model.predict(X_test)
    return report_regression(y_test, y_pred, model)

# Regression Models

## LinearRegression()

In [53]:
from sklearn.linear_model import LinearRegression as LinReg
from cenfo.obliging import report_regression

In [65]:
model_report(LinReg())

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,LinearRegression,3.404693,36.880378,6.072922,0.991109


## DecisionTreeRegression()

In [61]:
from sklearn.tree import DecisionTreeRegressor as tree

In [66]:
model_report(tree())

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,DecisionTreeRegressor,1.108614,25.7603,5.075461,0.99379


## RandomForestRegression()

In [67]:
from sklearn.ensemble import RandomForestRegressor as forest

In [68]:
model_report(forest())

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,RandomForestRegressor,1.715169,36.076767,6.006394,0.991303


## SVR()

In [69]:
from sklearn.svm import SVR

In [70]:
model_report(SVR())

  y = column_or_1d(y, warn=True)


Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,SVR,19.920307,1215.11441,34.858491,0.707058


## GradientBoostingRegression()

In [71]:
from sklearn.ensemble import GradientBoostingRegressor as boosting

In [72]:
model_report(boosting())

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,GradientBoostingRegressor,2.337043,13.38608,3.658699,0.996773


### Create a Pipeline

In [None]:
pipe_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('boosting', boosting())
])

### Create a Parameter Grid

In [None]:
param_grid = {
    'boosting__learning_rate': [0.001, 0.01, 0.1],
    'boosting__n_estimators': [10, 30, 50, 60, 100],
    'boosting__max_leaf_nodes': [5,  15, 25, 35, None],
    'boosting__ccp_alpha': [0.01, 0.1, 0.0, 1.0, 10.0],
    'boosting__max_depth': [2, 4, 6, 8, 10]
}

### Create and Fit the Grid Search

In [None]:
grid_search = GridSearchCV(pipe_boosting, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

### Show the Result of Grid Search

In [None]:
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_:.2f}')

### Apply Results to Model

In [None]:
# create a pipeline
pipe_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('boosting', boosting(ccp_alpha=0.0, learning_rate=0.1, max_depth=4, max_leaf_nodes=15, n_estimators=50))
])

# Fit the pipeline
pipe_boosting.fit(X_train, y_train)

# Make predictions
y_pred = pipe_boosting.predict(X_test)

# Show report
boosting_report = report_classifier(y_test, y_pred, 'boosting_model')
boosting_report

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
titanic = pd.read_csv('../../Machine_Learning_Algorithms/C_Datasets/titanic_train_original.csv')

# Separate features and target
X = titanic.drop(columns='Survived')
y = titanic['Survived']

# Identify numerical and categorical features
numeric_features = ['Age', 'Fare', 'Parch', 'SibSp']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Create the full pipeline with RandomForestClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best parameters and estimator
best_model = grid_search.best_estimator_
print(f'Best parameters found: {grid_search.best_params_}')

# Evaluate the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy}')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin

# Load the dataset
titanic = pd.read_csv('../../Machine_Learning_Algorithms/C_Datasets/titanic_train_original.csv')

# Feature Engineering Functions
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

def extract_deck(cabin):
    return cabin[0] if pd.notna(cabin) else 'M'

# Feature Engineering
titanic['Title'] = titanic['Name'].apply(extract_title)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic['IsAlone'] = (titanic['FamilySize'] == 1).astype(int)
titanic['Deck'] = titanic['Cabin'].apply(extract_deck)

# Drop unnecessary columns
titanic.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# Separate features and target
X = titanic.drop(columns='Survived')
y = titanic['Survived']

# Identify numerical and categorical features
numeric_features = ['Age', 'Fare', 'Parch', 'SibSp', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'IsAlone']

# Numeric pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Create the full pipeline with RandomForestClassifier and GradientBoostingClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define the parameter grid for RandomForestClassifier
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Optionally, define the parameter grid for GradientBoostingClassifier
param_grid_gb = {
    'classifier': [GradientBoostingClassifier(random_state=42)],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Combine parameter grids
param_grid = [
    param_grid_rf,
    param_grid_gb
]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best parameters and estimator
best_model = grid_search.best_estimator_
print(f'Best parameters found: {grid_search.best_params_}')

# Evaluate the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy}')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Load the dataset
titanic = pd.read_csv('../../Machine_Learning_Algorithms/C_Datasets/titanic_train_original.csv')

# Feature Engineering Functions
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

def extract_deck(cabin):
    return cabin[0] if pd.notna(cabin) else 'M'

# Feature Engineering
titanic['Title'] = titanic['Name'].apply(extract_title)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic['IsAlone'] = (titanic['FamilySize'] == 1).astype(int)
titanic['Deck'] = titanic['Cabin'].apply(extract_deck)

# Drop unnecessary columns
titanic.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# Separate features and target
X = titanic.drop(columns='Survived')
y = titanic['Survived']

# Identify numerical and categorical features
numeric_features = ['Age', 'Fare', 'Parch', 'SibSp', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'IsAlone']

# Numeric pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('pca', PCA(n_components=10))
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Define base models for stacking
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

# Create the full pipeline with StackingClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5))
])

# Define the parameter grid
param_grid = {
    'classifier__rf__n_estimators': [100, 200],
    'classifier__rf__max_depth': [None, 10],
    'classifier__gb__n_estimators': [100, 200],
    'classifier__gb__learning_rate': [0.01, 0.1]
}

# Split data with Stratified K-Fold Cross-Validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search
grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best parameters and estimator
best_model = grid_search.best_estimator_
print(f'Best parameters found: {grid_search.best_params_}')

# Evaluate the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

print(f'Test accuracy: {accuracy}')
print(f'Test F1 score: {f1}')
print(f'Test ROC AUC score: {roc_auc}')
