# Titanic survival prediction: scikit-learn pipelines, grid search and a custom scaler

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep only the columns used for modelling, plus the 'Survived' column.
data = df[
    ['Age', 'Fare', 'Embarked', 'Sex', 'Pclass', 'Survived']
]

In [4]:
# Split the frame into the feature matrix and the 'Survived' target.
X = data.drop(columns=['Survived'])
y = data['Survived']

In [5]:
# An integer test_size is an absolute row count, so 300 rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=300,
    random_state=123,
)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PowerTransformer

In [7]:
# Column groups: each list is handled by its own preprocessing branch.
cat_columns = ['Embarked', 'Sex']
num_columns = ['Age', 'Fare', 'Pclass']

In [8]:
X_train.head()

Unnamed: 0,Age,Fare,Embarked,Sex,Pclass
369,24.0,69.3,C,female,1
597,49.0,0.0,S,male,3
21,34.0,13.0,S,male,2
706,45.0,13.5,S,female,2
387,36.0,13.0,S,female,2


In [9]:
imputer = SimpleImputer()

imputer.fit(X_train[num_columns])

In [10]:
imputer.transform(X_train[num_columns])

array([[24.   , 69.3  ,  1.   ],
       [49.   ,  0.   ,  3.   ],
       [34.   , 13.   ,  2.   ],
       ...,
       [32.   ,  7.925,  3.   ],
       [30.   ,  7.25 ,  3.   ],
       [29.   ,  7.75 ,  3.   ]])

In [11]:
encoder = OneHotEncoder(drop='first', sparse_output=False)

encoder.fit(X_train[cat_columns])

In [12]:
encoder.transform(X_train[cat_columns])

array([[0., 0., 0., 0.],
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       ...,
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 0., 1.]])

In [13]:
encoder.get_feature_names_out()

array(['Embarked_Q', 'Embarked_S', 'Embarked_nan', 'Sex_male'],
      dtype=object)

In [14]:
# Numeric branch: impute missing values, apply a power transform, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('power_transformer', PowerTransformer()),
    ('scaler', StandardScaler()),
])

num_pipe.fit_transform(X_train[num_columns])

array([[-0.42552101,  1.28163762, -1.47054633],
       [ 1.45458609, -3.47717209,  0.8509416 ],
       [ 0.35971996, -0.26846798, -0.62449275],
       ...,
       [ 0.20692583, -0.7522234 ,  0.8509416 ],
       [ 0.05217142, -0.83920875,  0.8509416 ],
       [-0.02598824, -0.77405592,  0.8509416 ]])

In [15]:
# Categorical branch: impute with the most frequent value, then one-hot
# encode while dropping the first level of every feature.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

In [16]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_columns),
    ('cat', cat_pipe, cat_columns),
])

In [17]:
preprocessor.fit(X_train)

In [18]:
preprocessor.transform(X_train).round(2)

array([[-0.43,  1.28, -1.47,  0.  ,  0.  ,  0.  ],
       [ 1.45, -3.48,  0.85,  0.  ,  1.  ,  1.  ],
       [ 0.36, -0.27, -0.62,  0.  ,  1.  ,  1.  ],
       ...,
       [ 0.21, -0.75,  0.85,  0.  ,  1.  ,  1.  ],
       [ 0.05, -0.84,  0.85,  0.  ,  1.  ,  1.  ],
       [-0.03, -0.77,  0.85,  1.  ,  0.  ,  1.  ]])

In [19]:
# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression()),
])

pipeline

In [20]:
pipeline.fit(X_train, y_train)

In [21]:
pipeline.predict(X_test)

array([1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1], d

In [22]:
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.8033333333333333

In [23]:
from sklearn.model_selection import GridSearchCV, KFold

In [24]:
# Each key is the double-underscore path to a parameter (or a whole step)
# inside the nested pipeline; 'passthrough' disables that step.
param_grid = {
    'model__C': [0.1, 1, 2, 10],
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__num__power_transformer': [PowerTransformer(), 'passthrough'],
    'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler()],
}

In [25]:
# Seed the shuffle so the fold assignment is reproducible across runs;
# without random_state every run draws different folds.
cv = KFold(n_splits=10, shuffle=True, random_state=123)
optimizer = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
optimizer.fit(X_train, y_train)

In [26]:
optimizer.best_params_

{'model__C': 0.1,
 'preprocessor__num__imputer__strategy': 'mean',
 'preprocessor__num__power_transformer': 'passthrough',
 'preprocessor__num__scaler': StandardScaler()}

In [27]:
accuracy_score(y_test, optimizer.best_estimator_.predict(X_test))

0.81

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin

In [29]:
import numpy as np

A = np.array([[1, 13],
              [2,  7]])


In [30]:
from sklearn.base import OneToOneFeatureMixin

In [75]:
class MyStandardScaler(OneToOneFeatureMixin, BaseEstimator, TransformerMixin):
    """Column-wise scaler: optionally centre on the mean, then divide by the std.

    Parameters
    ----------
    with_mean : bool, default=True
        When true, the per-column mean is subtracted before dividing by the
        per-column standard deviation; when false, columns are only divided.

    Attributes
    ----------
    n_features_in_ : int
        Number of columns seen in ``fit``.
    feature_names_in_ : ndarray of object
        Column labels seen in ``fit``; only set when ``X`` was a DataFrame.
    means_ : ndarray
        Per-column means; only computed when ``with_mean`` is true.
    stds_ : ndarray
        Per-column population standard deviations (ddof=0).

    Notes
    -----
    A constant column has a standard deviation of 0, so ``transform`` divides
    by zero for that column.
    """

    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):  # gather information about the training data
        """Record the input layout and the per-column statistics of ``X``.

        ``y`` is accepted for pipeline compatibility and is not used for the
        statistics. Returns ``self``.
        """
        self._save_info_about_input(X, y)
        self._caluculate_stats(X)
        return self

    def _save_info_about_input(self, X, y=None):
        """Store the number of columns and, for DataFrames, the column labels."""
        self.n_features_in_ = X.shape[1]
        if isinstance(X, pd.DataFrame):
            # Stored as an object ndarray rather than a pandas Index.
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Refit on unlabeled data: drop labels left over from an earlier fit.
            del self.feature_names_in_

    def _caluculate_stats(self, X):
        """Compute ``means_`` (if requested) and ``stds_`` column by column."""
        # Work on a plain float array so ndarray and DataFrame inputs yield the
        # same statistics: DataFrame.std defaults to ddof=1 while ndarray.std
        # uses ddof=0. The nan-aware reductions ignore missing values.
        values = np.asarray(X, dtype=float)
        if self.with_mean:
            self.means_ = np.nanmean(values, axis=0)
        self.stds_ = np.nanstd(values, axis=0)

    def transform(self, X):  # apply the transformation
        """Scale ``X`` with the statistics learned in ``fit``.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of columns seen in ``fit``.
        """
        if X.shape[1] != self.n_features_in_:
            raise ValueError("Wrong number of columns")
        if self.with_mean:
            return (X - self.means_) / self.stds_
        return X / self.stds_

In [44]:
scaler = MyStandardScaler()


In [45]:
scaler.fit(A)

In [46]:
scaler.stds_, scaler.means_

(array([0.5, 3. ]), array([ 1.5, 10. ]))

In [47]:
scaler.transform(A)

array([[-1.,  1.],
       [ 1., -1.]])

In [48]:
scaler = MyStandardScaler(with_mean=True)
scaler.fit(A)
scaler.transform(A)

array([[-1.,  1.],
       [ 1., -1.]])

In [60]:
scaler = MyStandardScaler(with_mean=False)
scaler.fit(A)
scaler.fit_transform(A)

array([[2.        , 4.33333333],
       [4.        , 2.33333333]])

In [61]:
scaler.get_params()

{'with_mean': False}

In [73]:
scaler = MyStandardScaler(with_mean=False)
scaler.fit(A)
# get_feature_names_out expects optional feature *names*, not the data matrix.
# With no argument and no names seen during fit, it generates x0, x1, ...
scaler.get_feature_names_out()

array(['x0', 'x1'], dtype=object)

In [63]:
# Call without arguments: the parameter is for feature names, not for data.
scaler.get_feature_names_out()

array(['x0', 'x1'], dtype=object)

In [64]:
A.shape[1]

2

In [65]:
A

array([[ 1, 13],
       [ 2,  7]])

In [77]:
B = np.array([[1], [2]])
B

array([[1],
       [2]])

In [78]:
scaler = MyStandardScaler(with_mean=False)
scaler.fit(B)
scaler.transform(B)

array([[2.],
       [4.]])

In [79]:
scaler.transform(A)

ValueError: Wrong number of columns