In [2]:
# Objective programming 

In [1]:
# Remove warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("students.csv")
df.head()

Unnamed: 0,sex,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,target
0,female,group B,bachelor's degree,standard,none,72,72,74,0
1,female,group C,some college,standard,completed,69,90,88,1
2,female,group B,master's degree,standard,none,90,95,93,0
3,male,group A,associate's degree,free/reduced,none,47,57,44,1
4,male,group C,some college,standard,none,76,78,75,0


In [7]:
len(df), list(df.columns)

(99,
 ['sex',
  'race/ethnicity',
  'parental level of education',
  'lunch',
  'test preparation course',
  'math score',
  'reading score',
  'writing score',
  'target'])

In [8]:
X = df.drop(columns=['target'])
y = df['target']

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numeric_features = ['math score','reading score','writing score']
categorical_features = ['sex','race/ethnicity','parental level of education','lunch','test preparation course']

In [10]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [11]:
preprocessor = ColumnTransformer(transformers=[
    ("num_trans", numeric_transformer, numeric_features),
    ("cat_trans", categorical_transformer, categorical_features)
])

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ("preproc", preprocessor),
    ("model", LogisticRegression())
])

In [12]:
from sklearn import set_config
set_config(display='diagram')
pipeline

In [13]:
from sklearn.model_selection import train_test_split
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

pipeline.fit(X_tr, y_tr)

score = pipeline.score(X_test, y_test)
print(score)

0.45


In [14]:
import joblib
joblib.dump(pipeline, 'your_pipeline.pkl')

['your_pipeline.pkl']

In [15]:
param_grid = [
              {"preproc__num_trans__imputer__strategy":
              ["mean","median"],
               "model__n_estimators":[2,5,10,100,500],
               "model__min_samples_leaf": [1, 0.1],
               "model":[RandomForestClassifier()]},
              {"preproc__num_trans__imputer__strategy":
                ["mean","median"],
               "model__C":[0.1,1.0,10.0,100.0,1000],
                "model":[LogisticRegression()]}
]

from sklearn.model_selection import GridSearchCV


grid_search = GridSearchCV(pipeline, param_grid,
cv=2, verbose=1, n_jobs=-1)


grid_search.fit(X_tr, y_tr)

grid_search.best_params_

Fitting 2 folds for each of 30 candidates, totalling 60 fits


{'model': RandomForestClassifier(min_samples_leaf=0.1, n_estimators=5),
 'model__min_samples_leaf': 0.1,
 'model__n_estimators': 5,
 'preproc__num_trans__imputer__strategy': 'mean'}

In [16]:
grid_search.score(X_test, y_test), grid_search.score(X_tr, y_tr)

(0.4, 0.620253164556962)

In [17]:
## add one feature, one value
df['bad_feature'] = 1

In [18]:
X = df.drop(columns=['target'])
y = df['target']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

In [19]:
numeric_features = ['math score','reading score','writing score', 'bad_feature']

In [20]:
grid_search = GridSearchCV(pipeline, param_grid,
cv=2, verbose=1, n_jobs=-1)

grid_search.fit(X_tr, y_tr)

grid_search.best_params_

Fitting 2 folds for each of 30 candidates, totalling 60 fits


{'model': RandomForestClassifier(min_samples_leaf=0.1, n_estimators=2),
 'model__min_samples_leaf': 0.1,
 'model__n_estimators': 2,
 'preproc__num_trans__imputer__strategy': 'median'}

In [None]:
grid_search.score(X_tr, y_tr), grid_search.score(X_test, y_test)

In [21]:
## 创建一个class来 transfer--------理解这个class，这个class可以根据自己的需求来改变
# your own transformator class

from sklearn.base import BaseEstimator, TransformerMixin

class DelOneValueFeature(BaseEstimator, TransformerMixin):
    """Description"""
    def __init__(self):
        self.one_value_features = []
        
    def fit(self, X, y=None):
        for feature in X.columns:
            unique = X[feature].unique()
            if len(unique)==1:
                self.one_value_features.append(feature)
        return self
    def transform(self, X, y=None):
        if not self.one_value_features:
            return X
        return X.drop(axis='columns', columns=self.one_value_features)

In [22]:
pipeline2 = Pipeline([
    ("moja_transformacja",DelOneValueFeature()),
    ("preprocesser", preprocessor),
    ("classifier", LogisticRegression())])
    
pipeline2.fit(X_tr, y_tr)
score2 = pipeline2.score(X_test, y_test)

In [23]:
score2

0.45

In [None]:
# function definition 
def moja_funkcja():
    pass

In [None]:
# class definition
class Nazwa(object):
    pass

In [None]:
a = Nazwa()
a.__dir__()

In [None]:
[Nazwa() for _ in range(10)]

In [None]:
# just for reminde
x = [1,2,3,4,5]
[t**2 for t in x]

In [None]:
from random import randint
class Dice():
    """class description"""
    def __init__(self, wall=6):
        """ ops metody """
        self.wall = wall
        
    def roll(self):
        """opis metody """
        return randint(1,self.wall)

In [None]:
a = Dice()
[a.roll() for _ in range(10)]

In [None]:
# Random Walk

from random import choice

class RandomWalk():
    def __init__(self, num_points=5000):
        self.num_points = num_points
        self.x_values = [0]
        self.y_values = [0]
    
    def fill_walk(self):
        while len(self.x_values) < self.num_points:
            x_direction = choice([-1,1])
            x_distance = choice([0,1,2,3,4])
            x_step = x_direction*x_distance
            
            y_direction = choice([-1,1])
            y_distance = choice([0,1,2,3,4])
            y_step = y_direction*y_distance
            
            if x_step == 0 and y_step == 0:
                continue
            
            next_x = self.x_values[-1] + x_step
            next_y = self.y_values[-1] + y_step
            
            self.x_values.append(next_x)
            self.y_values.append(next_y)

In [None]:
rw = RandomWalk(10000)
rw.x_values

In [None]:
rw.fill_walk()

In [None]:
import matplotlib.pyplot as  plt

point_number = list(range(rw.num_points))

plt.scatter(rw.x_values, rw.y_values, c=point_number, cmap=plt.cm.Blues, edgecolor='none', s=15)
plt.scatter(0,0,c='green', edgecolor='none', s=100)
plt.scatter(rw.x_values[-1], rw.y_values[-1],c='red', edgecolor='none', s=100)
plt.axis('off')
plt.show()