In [1]:
import os
import sys
import pandas as pd
import numpy as np

In [38]:
# Two small free-text columns used to exercise per-column text vectorization.
x_sentences = [
    "Today is Friday",
    "I can't wait for the weekend",
    "another sentence",
    "tomorrow is saturday",
    "notastopword weekend",
]
y_sentences = [
    "First, I have to work today",
    "Tomorrow will be fun though",
    "why isn't this working?",
    "there appears to be an issue",
    "notastopword issue",
]
df = pd.DataFrame({"x": x_sentences, "y": y_sentences})
df

Unnamed: 0,x,y
0,Today is Friday,"First, I have to work today"
1,I can't wait for the weekend,Tomorrow will be fun though
2,another sentence,why isn't this working?
3,tomorrow is saturday,there appears to be an issue
4,notastopword weekend,notastopword issue


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit an independent TF-IDF model on each text column. Only the last
# expression (the matrix produced from column "y") is displayed by the cell.
tfidf1, tfidf2 = TfidfVectorizer(), TfidfVectorizer()
tfidf1.fit_transform(df["x"])
tfidf2.fit_transform(df["y"])

<5x19 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [47]:
# Inspect the term -> column-index mapping learned by the vectorizer fitted on column "y".
tfidf2.vocabulary_

{'first': 3,
 'have': 5,
 'to': 12,
 'work': 17,
 'today': 13,
 'tomorrow': 14,
 'will': 16,
 'be': 2,
 'fun': 4,
 'though': 11,
 'why': 15,
 'isn': 6,
 'this': 10,
 'working': 18,
 'there': 9,
 'appears': 1,
 'an': 0,
 'issue': 7,
 'notastopword': 8}

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

In [41]:
# Build the selector first, then call it on the frame to see which column names it picks.
select_y = make_column_selector(pattern="y")
select_y(df)

['y']

In [55]:
# One VectorizeText step per text column, each fed by a regex column selector.
# sparse_threshold=0.0 asks the ColumnTransformer for a dense stacked result.
# NOTE: VectorizeText is defined in a later cell of this notebook (In [54]);
# that cell has to be executed before this one.
text_features = ColumnTransformer(
    [
        ("xtfidf", VectorizeText(), make_column_selector(pattern="x")),
        ("ytfidf", VectorizeText(), make_column_selector(pattern="y")),
    ],
    sparse_threshold=0.0,
)
p = Pipeline([("tfidf", text_features)])

In [67]:
# Fit the pipeline on the two text columns and show the combined feature array.
text_columns = df[["x", "y"]]
p.fit_transform(text_columns)

array([[0.        , 0.        , 0.        , 0.61418897, 0.49552379,
        0.        , 0.        , 0.        , 0.        , 0.61418897,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46369322, 0.        , 0.46369322, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37410477, 0.46369322, 0.        , 0.        , 0.        ,
        0.46369322, 0.        ],
       [0.        , 0.46369322, 0.46369322, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.46369322, 0.        ,
        0.        , 0.46369322, 0.37410477, 0.        , 0.        ,
        0.37410477, 0.        , 0.46369322, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.46369322,
        0.        , 0.        , 0.46369322, 0.        , 0.46369322,
        0.        , 0.        ],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.    

In [66]:
# Look the fitted step up by its name ("xtfidf") rather than by position, so the
# cell keeps working if the transformer list is reordered.
x_vectorizer = p["tfidf"].named_transformers_["xtfidf"].vectorizer

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when present and fall back to the legacy method on older releases.
if hasattr(x_vectorizer, "get_feature_names_out"):
    x_terms = x_vectorizer.get_feature_names_out().tolist()
else:
    x_terms = x_vectorizer.get_feature_names()
x_terms

['another',
 'can',
 'for',
 'friday',
 'is',
 'notastopword',
 'saturday',
 'sentence',
 'the',
 'today',
 'tomorrow',
 'wait',
 'weekend']

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

class VectorizeText(BaseEstimator, TransformerMixin):
    """Vectorize the first column of a DataFrame into a labelled dense DataFrame.

    Wraps a text vectorizer so that it can be applied to a single-column
    DataFrame selection (for example as a ``ColumnTransformer`` step). The
    output columns are named ``"<input column>_<term>"``.

    Parameters
    ----------
    vectorizer : object or None, default=None
        Vectorizer exposing ``fit``, ``transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.
        When ``None``, a fresh ``TfidfVectorizer()`` is created at fit time,
        so separate instances never share one default vectorizer.
    """

    def __init__(self, vectorizer=None):
        # A ``TfidfVectorizer()`` default would be evaluated once, when the
        # class is defined, and then shared (and re-fitted) by every instance
        # built without an explicit vectorizer. A None sentinel avoids that.
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on the first column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the text to learn from.
        y : optional
            Passed through to the vectorizer's ``fit``.

        Returns
        -------
        self
        """
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer()
        # The fitted vectorizer stays reachable as ``self.vectorizer``.
        self.vectorizer = self.vectorizer.fit(X.iloc[:, 0], y)
        return self

    def transform(self, X):
        """Transform the first column of ``X`` with the fitted vectorizer.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the text to vectorize.

        Returns
        -------
        pandas.DataFrame
            Dense features, one column per term, prefixed with the name of the
            input column and indexed like ``X``.
        """
        column = X.columns[0]
        matrix = self.vectorizer.transform(X.iloc[:, 0])

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and only fall back to the legacy method on old releases.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            terms = self.vectorizer.get_feature_names_out()
        else:
            terms = self.vectorizer.get_feature_names()

        return pd.DataFrame(
            matrix.toarray(),  # plain ndarray rather than the legacy np.matrix
            index=X.index,  # keep row labels aligned with the input frame
            columns=[f"{column}_{term}" for term in terms],
        )

In [3]:
# Number of rows to generate for the synthetic DataFrame built below.
N=1000

In [4]:
# Synthetic frame for the groupby/transform experiments. A locally seeded
# generator keeps the values identical across kernel restarts and cell re-runs
# without touching NumPy's global random state.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "letters": rng.choice(list("ABCD"), size=N),
        "ints": rng.integers(0, 100, size=N),  # upper bound is exclusive
        "nums": rng.lognormal(size=N),
    }
)

In [5]:
# Broadcast each letter group's mean of "nums" back onto its rows as a new
# column. The built-in "mean" aggregation replaces a per-group Python lambda.
letter_mean = df.groupby("letters")["nums"].transform("mean")
df.assign(whatever=letter_mean)

Unnamed: 0,letters,ints,nums,whatever
0,C,53,0.798684,1.580267
1,D,18,1.170807,1.688552
2,B,1,0.635620,1.725104
3,B,0,0.533419,1.725104
4,C,46,1.853820,1.580267
...,...,...,...,...
995,A,23,3.397311,1.532860
996,A,99,2.950617,1.532860
997,D,33,0.830190,1.688552
998,D,53,8.235098,1.688552


In [17]:
# Per-row mean of "nums" within each "letters" group, using the built-in
# "mean" aggregation instead of a Python lambda around np.mean.
df.groupby("letters")["nums"].transform("mean")

0      1.564087
1      1.564087
2      1.564087
3      1.615334
4      1.728989
         ...   
995    1.703393
996    1.728989
997    1.703393
998    1.564087
999    1.615334
Name: nums, Length: 1000, dtype: float64