In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
# load the train and test set with the engineered variables

# we built and saved these datasets in the previous lecture.
# If you haven't done so, go ahead and check the previous notebook
# to find out how to create these datasets

X_train = pd.read_csv('xtrain.csv')
X_test = pd.read_csv('xtest.csv')

X_train.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0.16,0.555556,0.16,0.785714,0.498514,0.5,1.0,0.0,0.46748,0.503322,0.288218,0.0
1,0.44,0.666667,0.44,0.357143,0.307159,0.5,0.666667,0.0,0.365854,0.336379,0.470302,0.0
2,0.48,0.444444,0.48,0.5,0.530408,0.5,0.666667,1.0,0.311484,0.335548,0.630964,0.0
3,0.08,0.333333,0.08,0.714286,0.722864,0.0,0.666667,0.0,0.457825,0.243355,0.288705,0.0
4,0.48,0.444444,0.48,0.785714,0.314858,0.5,1.0,0.0,0.494919,0.668605,0.634859,0.0


In [3]:
# load the target (remember that the target is log transformed)
y_train = pd.read_csv('ytrain.csv')
y_test = pd.read_csv('ytest.csv')

y_train.head()

Unnamed: 0,selling_price
0,11.95118
1,13.07107
2,13.091904
3,12.310433
4,12.560244


In [4]:
# We will do the model fitting and feature selection
# altogether in a few lines of code

# first, we specify the Lasso Regression model, and we
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Then we use the selectFromModel object from sklearn, which
# will select automatically the features which coefficients are non-zero

# remember to set the seed, the random state in this function
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))

# train Lasso model and select features
sel_.fit(X_train, y_train)

0,1,2
,"estimator  estimator: object The base estimator from which the transformer is built. This can be both a fitted (if ``prefit`` is set to True) or a non-fitted estimator. The estimator should have a ``feature_importances_`` or ``coef_`` attribute after fitting. Otherwise, the ``importance_getter`` parameter should be used.",Lasso(alpha=0...andom_state=0)
,"threshold  threshold: str or float, default=None The threshold value to use for feature selection. Features whose absolute importance value is greater or equal are kept while the others are discarded. If ""median"" (resp. ""mean""), then the ``threshold`` value is the median (resp. the mean) of the feature importances. A scaling factor (e.g., ""1.25*mean"") may also be used. If None and if the estimator has a parameter penalty set to l1, either explicitly or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, ""mean"" is used by default.",
,"prefit  prefit: bool, default=False Whether a prefit model is expected to be passed into the constructor directly or not. If `True`, `estimator` must be a fitted estimator. If `False`, `estimator` is fitted and updated by calling `fit` and `partial_fit`, respectively.",False
,"norm_order  norm_order: non-zero int, inf, -inf, default=1 Order of the norm used to filter the vectors of coefficients below ``threshold`` in the case where the ``coef_`` attribute of the estimator is of dimension 2.",1
,"max_features  max_features: int, callable, default=None The maximum number of features to select. - If an integer, then it specifies the maximum number of features to  allow. - If a callable, then it specifies how to calculate the maximum number of  features allowed. The callable will receive `X` as input: `max_features(X)`. - If `None`, then all features are kept. To only select based on ``max_features``, set ``threshold=-np.inf``. .. versionadded:: 0.20 .. versionchanged:: 1.1  `max_features` accepts a callable.",
,"importance_getter  importance_getter: str or callable, default='auto' If 'auto', uses the feature importance either through a ``coef_`` attribute or ``feature_importances_`` attribute of estimator. Also accepts a string that specifies an attribute name/path for extracting feature importance (implemented with `attrgetter`). For example, give `regressor_.coef_` in case of :class:`~sklearn.compose.TransformedTargetRegressor` or `named_steps.clf.feature_importances_` in case of :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. If `callable`, overrides the default feature importance getter. The callable is passed with the fitted estimator and it should return importance for each feature. .. versionadded:: 0.24",'auto'

0,1,2
,"alpha  alpha: float, default=1.0 Constant that multiplies the L1 term, controlling regularization strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. When `alpha = 0`, the objective is equivalent to ordinary least squares, solved by the :class:`LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Lasso` object is not advised. Instead, you should use the :class:`LinearRegression` object.",0.001
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"precompute  precompute: bool or array-like of shape (n_features, n_features), default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity.",False
,"copy_X  copy_X: bool, default=True If ``True``, X will be copied; else, it may be overwritten.",True
,"max_iter  max_iter: int, default=1000 The maximum number of iterations.",1000
,"tol  tol: float, default=1e-4 The tolerance for the optimization: if the updates are smaller or equal to ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller or equal to ``tol``, see Notes below.",0.0001
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.",False
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive.",False
,"random_state  random_state: int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",0
,"selection  selection: {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4.",'cyclic'


In [5]:
sel_.get_support().sum()


10

In [6]:
# let's visualise those features that were selected.
# (selected features marked with True)

sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False])

In [7]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features
selected_feats = X_train.columns[(sel_.get_support())]

# let's print some stats
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feats)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 12
selected features: 10
features with coefficients shrank to zero: 2


In [8]:
# print the selected features
selected_feats

Index(['car_name', 'brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type', 'engine', 'max_power'],
      dtype='object')

In [9]:
pd.Series(selected_feats).to_csv('selected_features.csv', index=False)

### End of Feature Selection

In [12]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB 325.1 kB/s eta 0:03:42
   ---------------------------------------- 0.1/72.0 MB 544.7 kB/s eta 0:02:13
   ---------------------------------------- 0.3/72.0 MB 2.0 MB/s eta 0:00:37
   ---------------------------------------- 0.8/72.0 MB 3.7 MB/s eta 0:00:20
    --------------------------------------- 1.1/72.0 MB 4.3 MB/s eta 0:00:17
    --------------------------------------- 1.5/72.0 MB 5.2 MB/s eta 0:00:14
   - -------------------------------------- 1.9/72.0 MB 5.4 MB/s eta 0:00:13
   - -------------------------------------- 2.3/72.0 MB 5.8 MB/s eta 0:00:12
   - ----------------------

In [13]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split

# Assume X and y are your features and target variable
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Initialize XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# 2. Define hyperparameter distributions
param_distributions = {
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.15),
    'n_estimators': randint(100, 1000),
    'subsample': uniform(0.6, 0.4), # uniform(loc, scale) where loc is start, scale is range
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': [0, 0.5, 1, 2, 5],
    'min_child_weight': [1, 5, 10]
}

# 3. Initialize RandomizedSearchCV
# n_iter: Number of parameter settings that are sampled
# cv: Number of cross-validation folds
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=50, # Adjust the number of iterations as needed
    scoring='roc_auc', # Choose an appropriate scoring metric
    cv=3,
    verbose=1,
    n_jobs=-1, # Use all available cores
    random_state=42
)

# 4. Fit the model
# Replace X_train and y_train with your actual training data
# random_search.fit(X_train, y_train)

# 5. Get the best hyperparameters
# print("Best hyperparameters found: ", random_search.best_params_)
# print("Best AUC score found: ", random_search.best_score_)


In [15]:
# 4. Fit the model
#Replace X_train and y_train with your actual training data
random_search.fit(X_train, y_train)

# 5. Get the best hyperparameters
print("Best hyperparameters found: ", random_search.best_params_)
print("Best AUC score found: ", random_search.best_score_)


ValueError: Found input variables with inconsistent numbers of samples: [13741, 13869]

In [16]:
X_train.shape

(13741, 12)

In [17]:
y_train.shape

(13869, 1)