<a href="https://colab.research.google.com/github/PeterPirog/github_stack_scripts/blob/main/01_featureengine_equalfrequency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [191]:
!pip install feature-engine



In [192]:
import numpy as np
import pandas as pd

# Create random dataframe column "T" is our target
df = pd.DataFrame(np.random.randn(100, 5), columns=list('ABCDT'))
df.head()


Unnamed: 0,A,B,C,D,T
0,1.496724,-0.366085,0.099949,-0.265747,-0.694801
1,0.736543,-1.679624,-0.882359,-1.606463,1.167537
2,0.730867,1.351126,1.305285,-1.278935,1.210292
3,0.489433,0.067887,-0.780055,-0.315993,0.457183
4,-3.698852,1.053313,-0.318687,-0.441441,0.147744


In [193]:
features=list('ABCD')
target=['T']

X=df[features]
y=df[target]

In [194]:
from feature_engine.discretisation import EqualFrequencyDiscretiser

# Use discretizer to split data into bins

efd = EqualFrequencyDiscretiser(variables=None, q=5, return_object=True, return_boundaries=False)
X=efd.fit_transform(X)
X.head()

Unnamed: 0,A,B,C,D
0,4,1,2,2
1,3,0,0,0
2,3,4,4,0
3,3,2,1,2
4,0,4,2,1


In [195]:
# Raplace integer 
val_dict={0:'very_low',1:'low',2:'medium',3:'high',4:'very_high'}

X=X.replace(val_dict)
X.head()


Unnamed: 0,A,B,C,D
0,very_high,low,medium,medium
1,high,very_low,very_low,very_low
2,high,very_high,very_high,very_low
3,high,medium,low,medium
4,very_low,very_high,medium,low


In [196]:
X=X.astype(str).apply(lambda x : x.name+ '_' + x)
df[features]=X
df.head()

Unnamed: 0,A,B,C,D,T
0,A_very_high,B_low,C_medium,D_medium,-0.694801
1,A_high,B_very_low,C_very_low,D_very_low,1.167537
2,A_high,B_very_high,C_very_high,D_very_low,1.210292
3,A_high,B_medium,C_low,D_medium,0.457183
4,A_very_low,B_very_high,C_medium,D_low,0.147744


In [197]:
df['Text'] = '' 
for feature in features:
  df['Text']=df['Text'].map(str) + ' '+ df[feature].map(str)

df.head()

Unnamed: 0,A,B,C,D,T,Text
0,A_very_high,B_low,C_medium,D_medium,-0.694801,A_very_high B_low C_medium D_medium
1,A_high,B_very_low,C_very_low,D_very_low,1.167537,A_high B_very_low C_very_low D_very_low
2,A_high,B_very_high,C_very_high,D_very_low,1.210292,A_high B_very_high C_very_high D_very_low
3,A_high,B_medium,C_low,D_medium,0.457183,A_high B_medium C_low D_medium
4,A_very_low,B_very_high,C_medium,D_low,0.147744,A_very_low B_very_high C_medium D_low


In [198]:
dataset=df[['Text','T']].copy()
dataset.head()

Unnamed: 0,Text,T
0,A_very_high B_low C_medium D_medium,-0.694801
1,A_high B_very_low C_very_low D_very_low,1.167537
2,A_high B_very_high C_very_high D_very_low,1.210292
3,A_high B_medium C_low D_medium,0.457183
4,A_very_low B_very_high C_medium D_low,0.147744


In [199]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["Text"].to_numpy(),
                                                                            df["T"].to_numpy(),
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

In [200]:
from pandas.core.dtypes.cast import maybe_infer_to_datetimelike
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model= Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", ElasticNet()) # model the text
])

# Fit the pipeline to the training data
model.fit(train_sentences, train_labels)


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', ElasticNet())])

In [201]:
out=model.predict(train_sentences)
print(out)

[0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924
 0.19315924 0.19315924 0.19315924 0.19315924 0.19315924 0.1931

In [202]:
model.score(val_sentences,val_labels)

-0.11462844013906581

In [203]:
model['tfidf'].vocabulary_
#embedings for words can be visualized on https://projector.tensorflow.org/
# the goal is to find relationship and similarity between categorical values

{'a_high': 0,
 'a_low': 1,
 'a_medium': 2,
 'a_very_high': 3,
 'a_very_low': 4,
 'b_high': 5,
 'b_low': 6,
 'b_medium': 7,
 'b_very_high': 8,
 'b_very_low': 9,
 'c_high': 10,
 'c_low': 11,
 'c_medium': 12,
 'c_very_high': 13,
 'c_very_low': 14,
 'd_high': 15,
 'd_low': 16,
 'd_medium': 17,
 'd_very_high': 18,
 'd_very_low': 19}

In [204]:
out=model['tfidf'].transform(train_sentences)
out


<90x20 sparse matrix of type '<class 'numpy.float64'>'
	with 360 stored elements in Compressed Sparse Row format>