Good notebooks
https://github.com/bertcarremans/TwitterUSAirlineSentiment/tree/master/source
https://towardsdatascience.com/sentiment-analysis-with-text-mining-13dd2b33de27

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [2]:
# Toy data: salaries stored as text (some rows carry a '$' or a ','),
# plus a categorical country column.
a = pd.Series(['1', '2', '3.5', '347', '$123', '34.5', '345,6'])
df = a.to_frame(name='rawSalary')
df['type'] = ['US', 'China', 'China', 'England', 'US', 'US', 'US']
df

Unnamed: 0,rawSalary,type
0,1,US
1,2,China
2,3.5,China
3,347,England
4,$123,US
5,34.5,US
6,3456,US


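- method 1: use FunctionTransformer and ColumnTransformer
- method 2: use FunctionTransformer and FeatureUnion
- method 3: use a user-defined transformer and FeatureUnion
- method 4 (alternative): use a user-defined transformer that selects its own column, and FeatureUnion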

> method 1: use FunctionTransformer and ColumnTransformer

In [14]:
def clean_string(column):
    """Convert a one-column DataFrame of salary strings into floats.

    Parameters
    ----------
    column : pandas.DataFrame
        Only the first column is read; its values must be strings.

    Returns
    -------
    pandas.DataFrame
        A single float column (same name and index as the input column) with
        every ',', '$' and '£' character removed before conversion.

    Raises
    ------
    ValueError
        If a value is still not a valid number once the symbols are removed.
    """
    # the .str accessor works on a Series, so take the first column out of the frame
    series = column.iloc[:, 0]
    # Inside a character class '$' is a literal dollar sign, and regex=True is
    # spelled out so the result does not depend on the pandas default for
    # `regex` (a bare '$' pattern is the end-of-string anchor when read as a regex).
    stripped = series.str.replace(r'[,$£]', '', regex=True)
    # hand back a 2-D DataFrame rather than a 1-D Series
    return stripped.astype('float').to_frame()


In [15]:
from sklearn.preprocessing import FunctionTransformer
# Wrap clean_string as a transformer step. clean_string calls .iloc on its
# argument, so validation is switched off to pass the DataFrame through as-is.
get_salary = FunctionTransformer(func=clean_string, validate=False)
get_salary.fit_transform(df[['rawSalary']])

Unnamed: 0,rawSalary
0,1.0
1,2.0
2,3.5
3,347.0
4,123.0
5,34.5
6,3456.0


In [16]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# Column groups routed to each branch of the ColumnTransformer
category_lst = ['type']
num_feature_lst = ['rawSalary']

# Categorical branch: one-hot encode, naming the dummy columns after the category values.
# (A ('scaler', StandardScaler()) step could be appended here; it is left out.)
cat_steps = [
    ('ohe', ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)),
]
cat_pipe = Pipeline(steps=cat_steps)

# Numeric branch: turn the raw salary strings into floats
num_steps = [
    ('get_salary', FunctionTransformer(func=clean_string, validate=False)),
]
num_pipe = Pipeline(steps=num_steps)

# Use ColumnTransformer to run each pipeline on its own columns only;
# edit this list to add or remove branches.
transformers = [
    ('cat', cat_pipe, category_lst),
    ('num', num_pipe, num_feature_lst),
]
ct = ColumnTransformer(transformers=transformers)


In [1]:
# NOTE: disabled scratch example kept for reference only; nothing in this cell runs.
# RandomForestClassifier is not imported in the cells shown here, so add that
# import before re-enabling the code below.
# # Define a feature extractor to flag very large values
# def more_than_average(X, multiplier=1.0):
#     Z = X.copy()
#     Z[:,1] = Z[:,1] > multiplier*np.mean(Z[:,1])
#     return Z

# # Convert your function so that it can be used in a pipeline
# pipe = Pipeline([
#   ('ft', FunctionTransformer(more_than_average)),
#   ('clf', RandomForestClassifier(random_state=2))])

In [17]:
# Fit both branches on df and show the combined result: the one-hot columns
# from the 'cat' branch come first, followed by the cleaned salary.
ct.fit_transform(df)

array([[1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00, 3.500e+00],
       [0.000e+00, 0.000e+00, 1.000e+00, 3.470e+02],
       [1.000e+00, 0.000e+00, 0.000e+00, 1.230e+02],
       [1.000e+00, 0.000e+00, 0.000e+00, 3.450e+01],
       [1.000e+00, 0.000e+00, 0.000e+00, 3.456e+03]])

In [18]:
# The transformed output is a bare array, so recover the one-hot column names by
# fitting an identically configured encoder on the categorical columns alone
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_dummy_category = ohe.fit_transform(df[category_lst])
dummy_column_lst = list(X_dummy_category.columns)

# one-hot names first, then the numeric names (same order as the transformers list)
final_column_lst = [*dummy_column_lst, *num_feature_lst]
final_column_lst

['type_US', 'type_China', 'type_England', 'rawSalary']

In [19]:
# Re-attach the column names to the transformed array
df_transformed = pd.DataFrame(data=ct.fit_transform(df), columns=final_column_lst)
df_transformed

Unnamed: 0,type_US,type_China,type_England,rawSalary
0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,2.0
2,0.0,1.0,0.0,3.5
3,0.0,0.0,1.0,347.0
4,1.0,0.0,0.0,123.0
5,1.0,0.0,0.0,34.5
6,1.0,0.0,0.0,3456.0


> method 2: use FunctionTransformer and FeatureUnion

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler


In [10]:
def clean_string(column):
    """Convert a one-column DataFrame of salary strings into floats.

    Parameters
    ----------
    column : pandas.DataFrame
        Only the first column is read; its values must be strings.

    Returns
    -------
    pandas.DataFrame
        A single float column (same name and index as the input column) with
        every ',', '$' and '£' character removed before conversion.

    Raises
    ------
    ValueError
        If a value is still not a valid number once the symbols are removed.
    """
    # the .str accessor works on a Series, so take the first column out of the frame
    series = column.iloc[:, 0]
    # Inside a character class '$' is a literal dollar sign, and regex=True is
    # spelled out so the result does not depend on the pandas default for
    # `regex` (a bare '$' pattern is the end-of-string anchor when read as a regex).
    stripped = series.str.replace(r'[,$£]', '', regex=True)
    # hand back a 2-D DataFrame rather than a 1-D Series
    return stripped.astype('float').to_frame()


In [11]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that keeps only the configured columns of X.

    ``cols`` is used directly as ``X[cols]``; this notebook passes a list of
    column names, so the selection is itself a DataFrame.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return the ``cols`` selection of X."""
        selected = X[self.cols]
        return selected

In [12]:
# Column groups handled by each branch
category_lst = ['type']
num_feature_lst = ['rawSalary']

# Categorical branch: select the columns, then one-hot encode them.
# (A ('scaler', StandardScaler()) step could be appended here; it is left out.)
cat_steps = [
    ('cat_selector', ColumnExtractor(cols=category_lst)),
    ('ohe', ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)),
]
cat_pipe = Pipeline(steps=cat_steps)

# Numeric branch: select the salary column, then convert its strings to floats
num_steps = [
    ('num_selector', ColumnExtractor(cols=num_feature_lst)),
    ('get_salary', FunctionTransformer(func=clean_string, validate=False)),
]
num_pipe = Pipeline(steps=num_steps)

# Both branches receive the full frame; their outputs are placed side by side
features = FeatureUnion(transformer_list=[
    ('cat', cat_pipe),
    ('num', num_pipe),
])

ct = Pipeline(steps=[('features', features)])


In [13]:
# Column names for the transformed output: dummy names taken from an identically
# configured encoder fitted on the categorical columns, then the numeric names
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_dummy_category = ohe.fit_transform(df[category_lst])
dummy_column_lst = list(X_dummy_category.columns)
final_column_lst = [*dummy_column_lst, *num_feature_lst]

# Run the pipeline and label its output
df_transformed = pd.DataFrame(data=ct.fit_transform(df), columns=final_column_lst)
df_transformed

Unnamed: 0,type_US,type_China,type_England,rawSalary
0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,2.0
2,0.0,1.0,0.0,3.5
3,0.0,0.0,1.0,347.0
4,1.0,0.0,0.0,123.0
5,1.0,0.0,0.0,34.5
6,1.0,0.0,0.0,3456.0


> method 3: use user defined transformer and FeatureUnion

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler


In [4]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that keeps only the configured columns of X.

    ``cols`` is used directly as ``X[cols]``; this notebook passes a list of
    column names, so the selection is itself a DataFrame.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return the ``cols`` selection of X."""
        selected = X[self.cols]
        return selected

In [5]:
class CleanString(BaseEstimator, TransformerMixin):
    """Transformer that converts the first column of X from salary strings to floats."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def clean_string(self, value):
        """Strip ',', '$' and '£' from one string and parse what is left as a float."""
        for symbol in (',', '$', '£'):
            value = value.replace(symbol, '')
        return float(value)

    def transform(self, X, **transform_params):
        """Clean every value of X's first column and return it as a one-column DataFrame."""
        # work on the first column as a Series so the cleaner is applied per value
        first_column = X.iloc[:, 0]
        cleaned = first_column.apply(self.clean_string)
        # the result must be 2-D (a DataFrame), not a 1-D Series
        return cleaned.to_frame()

In [6]:
# Column groups handled by each branch
category_lst = ['type']
num_feature_lst = ['rawSalary']

# Categorical branch: select the columns, then one-hot encode them.
# (A ('scaler', StandardScaler()) step could be appended here; it is left out.)
cat_steps = [
    ('cat_selector', ColumnExtractor(cols=category_lst)),
    ('ohe', ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)),
]
cat_pipe = Pipeline(steps=cat_steps)

# Numeric branch: select the salary column, then clean it with the custom transformer
num_steps = [
    ('num_selector', ColumnExtractor(cols=num_feature_lst)),
    ('get_salary', CleanString()),
]
num_pipe = Pipeline(steps=num_steps)

# Both branches receive the full frame; their outputs are placed side by side
features = FeatureUnion(transformer_list=[
    ('cat', cat_pipe),
    ('num', num_pipe),
])

ct = Pipeline(steps=[('features', features)])


In [7]:
# Column names for the transformed output: dummy names taken from an identically
# configured encoder fitted on the categorical columns, then the numeric names
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_dummy_category = ohe.fit_transform(df[category_lst])
dummy_column_lst = list(X_dummy_category.columns)
final_column_lst = [*dummy_column_lst, *num_feature_lst]

# Run the pipeline and label its output
df_transformed = pd.DataFrame(data=ct.fit_transform(df), columns=final_column_lst)
df_transformed

Unnamed: 0,type_US,type_China,type_England,rawSalary
0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,2.0
2,0.0,1.0,0.0,3.5
3,0.0,0.0,1.0,347.0
4,1.0,0.0,0.0,123.0
5,1.0,0.0,0.0,34.5
6,1.0,0.0,0.0,3456.0


In [None]:
# method 4 (alternative): use a user-defined transformer that selects its own column, and FeatureUnion

In [8]:
"""
Alternatively, we can use this method
"""
class CleanStringNew(BaseEstimator, TransformerMixin):
    """Transformer that pulls one salary column out of X and converts it to floats.

    This step does its own column selection, so it can be placed directly in a
    FeatureUnion that receives the full DataFrame.

    Parameters
    ----------
    col : column label, default 'rawSalary'
        Column of X that holds the salary strings.
    """

    def __init__(self, col='rawSalary'):
        # stored unchanged under the parameter's own name (scikit-learn estimator convention)
        self.col = col

    def clean_string(self, value):
        """Strip ',', '$' and '£' from one string and parse what is left as a float."""
        return float(value.replace(',','').replace('$','').replace('£',''))

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return the cleaned ``col`` column of X as a one-column DataFrame."""
        clean_X = X[self.col].apply(self.clean_string)
        # the result must be 2-D (a DataFrame), not a 1-D Series
        return clean_X.to_frame()
    
category_lst = ['type']
# Defined in this cell instead of being inherited from an earlier one: it is
# needed below to label the salary column of the final DataFrame.
num_feature_lst = ['rawSalary']

# Categorical branch: select the columns, then one-hot encode them.
# (A ('scaler', StandardScaler()) step could be appended here; it is left out.)
cat_steps = [
    ('cat_selector', ColumnExtractor(cols=category_lst)),
    ('ohe', ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)),
]
cat_pipe = Pipeline(cat_steps)

# Numeric branch: no selector or sub-pipeline is needed here, because
# CleanStringNew reads the salary column from the full DataFrame itself.
features = FeatureUnion([
    ('cat', cat_pipe),
    ('num', CleanStringNew()),
])

ct = Pipeline([
    ('features', features),
])

# Column names for the transformed output: dummy names taken from an identically
# configured encoder fitted on the categorical columns, then the numeric names
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_dummy_category = ohe.fit_transform(df[category_lst])
dummy_column_lst = X_dummy_category.columns.tolist()
final_column_lst = dummy_column_lst + num_feature_lst

df_transformed = pd.DataFrame(ct.fit_transform(df), columns=final_column_lst)
df_transformed


Unnamed: 0,type_US,type_China,type_England,rawSalary
0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,2.0
2,0.0,1.0,0.0,3.5
3,0.0,0.0,1.0,347.0
4,1.0,0.0,0.0,123.0
5,1.0,0.0,0.0,34.5
6,1.0,0.0,0.0,3456.0


In [43]:
import sklearn
# Show the installed scikit-learn version, for reproducibility of the cells above
sklearn.__version__

'0.20.2'