Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,4 @@ jupyterlite_contents

# file recognised by vscode IDEs containing env variables
.env
scikit-learn/
66 changes: 33 additions & 33 deletions build_tools/codespell_ignore_words.txt
Original file line number Diff line number Diff line change
@@ -1,63 +1,63 @@
achin
aggresive
aggressive
aline
ba
basf
boun
base
bound
bre
bu
cach
cant
chanel
can't
channel
complies
coo
copys
datas
deine
didi
feld
coup
copies
data
define
did
field
fo
fpr
fro
fwe
gool
few
ghoul
hart
heping
helping
hist
ines
lines
inout
ist
jaques
lene
lamas
jacques
lens
llamas
linke
lod
mange
mape
manage
map
mis
mor
more
nd
nmae
ocur
name
occur
pullrequest
repid
rapid
ro
ser
set
soler
staps
suh
suprised
such
surprised
te
technic
teh
technique
the
theis
thi
usal
vie
vor
usual
via
for
wan
whis
wil
winn
win
whis
yau
9 changes: 9 additions & 0 deletions doc/modules/compose.rst
Original file line number Diff line number Diff line change
Expand Up @@ -613,3 +613,12 @@ As an alternative, the HTML can be written to a file using

* :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py`
* :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`

.. _column_transformer_passthrough:

ColumnTransformer with ``remainder='passthrough'`` and Pandas
-------------------------------------------------------------

.. literalinclude:: ../../examples/compose/plot_column_transformer_passthrough.py
   :language: python
   :lines: 14-
82 changes: 82 additions & 0 deletions examples/compose/plot_column_transformer_passthrough.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# examples/compose/plot_column_transformer_passthrough.py
# -*- coding: utf-8 -*-

"""
=============================================================
ColumnTransformer with remainder='passthrough' and Pandas
=============================================================

This example shows how to keep columns untouched with
``remainder='passthrough'`` while transforming others.
The input is a pandas DataFrame – the most common real-world case.
"""

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# -------------------------------------------------
# 1. Build a small pandas DataFrame with mixed column types
# -------------------------------------------------
# Two numeric columns (`age`, `salary`), one categorical column (`city`),
# one binary flag (`is_senior`) and the binary label (`target`).
data = pd.DataFrame(
    dict(
        age=[25, 30, 35, 40, 45, 50, 22, 33],
        salary=[50000, 60000, 70000, 80000, 90000, 100000, 45000, 55000],
        city=["NY", "LA", "NY", "SF", "LA", "NY", "SF", "LA"],
        is_senior=[0, 0, 0, 1, 1, 1, 0, 0],
        target=[0, 1, 0, 1, 1, 0, 0, 1],
    )
)

# Separate the label from the feature columns.
y = data["target"]
X = data.drop(columns="target")

# -------------------------------------------------
# 2. ColumnTransformer: standardise the numeric columns,
#    one-hot encode `city`, and pass every other column through
# -------------------------------------------------
ct = ColumnTransformer(
    transformers=[
        # Zero-mean / unit-variance scaling for the numeric features.
        ("scale", StandardScaler(), ["age", "salary"]),
        # Dense one-hot encoding; the first category is dropped.
        (
            "encode",
            OneHotEncoder(sparse_output=False, drop="first"),
            ["city"],
        ),
    ],
    # Columns not named above are kept as they are instead of being dropped.
    remainder="passthrough",
)

# -------------------------------------------------
# 3. Chain the preprocessing step and the classifier
# -------------------------------------------------
# The column transformer runs first; its output feeds LogisticRegression.
pipe = Pipeline(
    steps=[
        ("transform", ct),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

# -------------------------------------------------
# 4. Hold out a test set, fit the pipeline and report metrics
# -------------------------------------------------
# Stratify on `y` and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# `fit` returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))

# -------------------------------------------------
# 5. Visualise the transformed feature matrix
# -------------------------------------------------
# Fit the transformer again, this time on the full dataset, so that every
# row can be displayed.
transformed = ct.fit_transform(X)

# The fitted encoder already reports names prefixed with the input column
# (e.g. ``city_NY``). Use them as-is: prepending another "city_" would give
# labels such as "city_city_NY".
encoded_cols = list(ct.named_transformers_["encode"].get_feature_names_out())

# Output order follows the transformer list, then the passthrough remainder.
cols = ["age_scaled", "salary_scaled"] + encoded_cols + ["is_senior"]

print("\nTransformed features (first 5 rows):")
print(pd.DataFrame(transformed, columns=cols).head())
8 changes: 4 additions & 4 deletions sklearn/feature_extraction/_stop_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"am",
"among",
"amongst",
"amoungst",
"amongst",
"amount",
"an",
"and",
Expand Down Expand Up @@ -69,7 +69,7 @@
"co",
"con",
"could",
"couldnt",
"couldn't",
"cry",
"de",
"describe",
Expand Down Expand Up @@ -119,7 +119,7 @@
"go",
"had",
"has",
"hasnt",
"hasn't",
"have",
"he",
"hence",
Expand Down Expand Up @@ -183,7 +183,7 @@
"no",
"nobody",
"none",
"noone",
"no one",
"nor",
"not",
"nothing",
Expand Down
Loading