In [54]:
pip install xgboost scikit-learn pandas numpy joblib


Note: you may need to restart the kernel to use updated packages.


    extract-msg (<=0.29.*)
                 ~~~~~~~^


In [55]:
import xgboost
print(xgboost.__version__)


2.1.4


In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [74]:
train_df = pd.read_csv("train.csv")
train_df.head()


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [75]:
# Preprocessing
# Separate features and target
X = train_df.drop(['id', 'Response'], axis=1)
y = train_df['Response']

# Identify column types
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessor
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [76]:
# Train-Test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [77]:
# Logistic Regression Pipeline and Evaluation
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

logreg_pipeline.fit(X_train, y_train)
y_pred_logreg = logreg_pipeline.predict(X_val)

print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_logreg))
print(classification_report(y_val, y_pred_logreg))


Logistic Regression Accuracy: 0.8750623179659416
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.00      0.00      0.00      9523

    accuracy                           0.88     76222
   macro avg       0.44      0.50      0.47     76222
weighted avg       0.77      0.88      0.82     76222



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
# XGBoost Pipeline and Evaluation
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_val)

print("XGBoost Accuracy:", accuracy_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.8748130461021753
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.48      0.03      0.05      9523

    accuracy                           0.87     76222
   macro avg       0.68      0.51      0.49     76222
weighted avg       0.83      0.87      0.82     76222



In [79]:
joblib.dump(xgb_pipeline, "model.pkl")  # Save the full pipeline (preprocessing + model)
print("Full pipeline saved successfully.")


Full pipeline saved successfully.


In [80]:
# Load test data
test_df = pd.read_csv("test.csv")

# Define columns exactly as training
numerical_cols = ['Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
all_features = numerical_cols + categorical_cols

# Drop id if present
if 'id' in test_df.columns:
    ids = test_df['id']
    X_test = test_df.drop('id', axis=1)
else:
    ids = pd.Series(range(len(test_df)))
    X_test = test_df.copy()

# Ensure all columns are present (add missing with default if needed)
for col in all_features:
    if col not in X_test.columns:
        if col in numerical_cols:
            X_test[col] = 0
        else:
            X_test[col] = 'Unknown'

# Reorder columns to match training
X_test = X_test[all_features]

# Convert categorical columns to string dtype (important!)
for col in categorical_cols:
    X_test[col] = X_test[col].astype(str)

# Now load model and predict
import joblib
model = joblib.load("model.pkl")

predictions = model.predict(X_test)

print("Predictions done successfully.")


Predictions done successfully.


In [81]:
print(pd.Series(predicted_labels).value_counts())


Not Interested    126065
Interested           972
Name: count, dtype: int64


In [82]:
print(model.named_steps)  # This should include 'preprocessor' and 'classifier'


{'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['Age', 'Driving_License', 'Region_Code',
                                  'Previously_Insured', 'Annual_Premium',
                                  'Policy_Sales_Channel', 'Vintage']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['Gender', 'Vehicle_Age', 'Vehicle_Damage'])]), 'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_de

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
import joblib

# Example preprocessing
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
numerical_cols = ['Age', 'Annual_Premium', 'Vintage']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
])

# Classifier
clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Train the model
pipeline.fit(X_train, y_train)

# Save full pipeline
joblib.dump(pipeline, 'model.pkl')


Parameters: { "use_label_encoder" } are not used.



['model.pkl']

In [84]:
X.columns

Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage'],
      dtype='object')

In [85]:
print(model)  # Should show Pipeline(steps=[('preprocessor', ...), ('classifier', ...)])
print(model.named_steps)  # Should have keys 'preprocessor' and 'classifier'


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'Driving_License',
                                                   'Region_Code',
                                                   'Previously_Insured',
                                                   'Annual_Premium',
                                                   'Policy_Sales_Channel',
                                                   'Vintage']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Gender', 'Vehicle_Age',
                                                   'Vehicle_Damage'])])),
                ('classifier',
                 XGBClassifier(base_score=None, booster=None, c...
                               feature_types=None, gamma=None, grow_polic

Parameters: { "use_label_encoder" } are not used.



['model.pkl']