In [None]:
FEATURE EXTRACTION
- CORE PART OF DATA PREPROCESSING OR ML PIPELINE

1. GENERATE SYNTHETIC DATA
2. EXTRACT/ENCODING
    A) STATISTICAL FEATURES[MEAN, STD....]
    B) DATA/TIME FEATURES
    C) ENCODING CATEGORICAL FEATURES
    D) TEXT FEATURE EXTRACTION USING TF-IDF

In [1]:
import numpy as np # numerical operations
import pandas as pd # habdling tabular data

In [2]:
np.random.seed(42)

In [5]:
data = {
    'id': range(1,101),
    'age': np.random.randint(18,60,100),
    'income': np.random.normal(50000, 15000, 100),
    'signup_date': pd.date_range(start = '2002-01-01', periods = 100, freq='D'),
    'category':  np.random.choice(['A','B','C'],100),
    'feedback':  np.random.choice(['Great Product','Not Good','Average Experience','Loved it','Bad Service'],100)
    
}
data

{'id': range(1, 101),
 'age': array([56, 46, 32, 25, 38, 56, 36, 40, 28, 28, 41, 53, 57, 41, 20, 39, 19,
        41, 47, 55, 19, 38, 50, 29, 39, 42, 44, 59, 45, 33, 32, 20, 54, 24,
        38, 26, 56, 35, 21, 42, 31, 26, 43, 19, 37, 45, 24, 25, 52, 31, 34,
        53, 57, 21, 19, 23, 59, 21, 46, 35, 43, 51, 27, 53, 31, 48, 32, 25,
        31, 40, 57, 38, 33, 35, 41, 43, 42, 58, 46, 32, 18, 42, 24, 26, 41,
        18, 25, 41, 28, 34, 25, 52, 52, 50, 22, 59, 56, 58, 45, 24]),
 'income': array([47476.92367851, 67471.52968257, 46313.78468859, 38454.98339501,
        68182.585132  , 68502.45206815, 26387.67069584, 59569.29414837,
        19333.6546727 , 66042.21329295, 35993.55738632, 61763.12281481,
        40776.86360874, 54993.38660733, 29289.24463592, 45787.9056271 ,
        49102.74104507, 64417.60649628, 76914.21264513, 58710.34312595,
        54464.75672763, 34578.26344119, 28721.05305477, 52855.05477448,
        52036.30744215, 59121.34493001, 60574.71966999, 55413.85066387,
       

In [7]:
df = pd.DataFrame(data)
df

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,56,47476.923679,2002-01-01,C,Great Product
1,2,46,67471.529683,2002-01-02,C,Great Product
2,3,32,46313.784689,2002-01-03,A,Great Product
3,4,25,38454.983395,2002-01-04,A,Average Experience
4,5,38,68182.585132,2002-01-05,C,Bad Service
...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service
96,97,56,50195.508496,2002-04-07,C,Loved it
97,98,58,55666.543880,2002-04-08,B,Not Good
98,99,45,50942.368996,2002-04-09,B,Loved it


In [9]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,56,47476.923679,2002-01-01,C,Great Product
1,2,46,67471.529683,2002-01-02,C,Great Product
2,3,32,46313.784689,2002-01-03,A,Great Product
3,4,25,38454.983395,2002-01-04,A,Average Experience
4,5,38,68182.585132,2002-01-05,C,Bad Service


In [11]:
df.tail()

Unnamed: 0,id,age,income,signup_date,category,feedback
95,96,59,42163.077807,2002-04-06,C,Bad Service
96,97,56,50195.508496,2002-04-07,C,Loved it
97,98,58,55666.54388,2002-04-08,B,Not Good
98,99,45,50942.368996,2002-04-09,B,Loved it
99,100,24,57523.94548,2002-04-10,B,Not Good


In [15]:
# Statistical Feature Extraction - insight -> how much income per unit age
df['income_per_age'] = df['income']/df['age']
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556
...,...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service,714.628437
96,97,56,50195.508496,2002-04-07,C,Loved it,896.348366
97,98,58,55666.543880,2002-04-08,B,Not Good,959.767998
98,99,45,50942.368996,2002-04-09,B,Loved it,1132.052644


In [17]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556


In [19]:
# Compute Z-score of Income -> how far each income value is from the mean in terms of SD
df['income_zscore'] = (df['income']-df['income'].mean())/df['income'].std()
df


Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677
...,...,...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service,714.628437,-0.542227
96,97,56,50195.508496,2002-04-07,C,Loved it,896.348366,0.004774
97,98,58,55666.543880,2002-04-08,B,Not Good,959.767998,0.377346
98,99,45,50942.368996,2002-04-09,B,Loved it,1132.052644,0.055634


In [21]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677


In [23]:
df.describe()

Unnamed: 0,id,age,income,signup_date,income_per_age,income_zscore
count,100.0,100.0,100.0,100,100.0,100.0
mean,50.5,37.91,50125.408702,2002-02-19 12:00:00,1457.780667,-1.003642e-15
min,1.0,18.0,12518.612875,2002-01-01 00:00:00,315.96383,-2.560987
25%,25.75,26.75,40689.408589,2002-01-25 18:00:00,1026.038951,-0.6425827
50%,50.5,38.0,49719.768392,2002-02-19 12:00:00,1364.627055,-0.02762372
75%,75.25,46.25,60624.020802,2002-03-16 06:00:00,1687.170875,0.7149456
max,100.0,59.0,82380.92785,2002-04-10 00:00:00,3691.551825,2.196571
std,29.011492,12.219454,14684.490603,,639.850775,1.0


In [27]:
 df[['age','income','income_per_age','income_zscore']].describe() # df[...] df[[...,...,...,..]]

Unnamed: 0,age,income,income_per_age,income_zscore
count,100.0,100.0,100.0,100.0
mean,37.91,50125.408702,1457.780667,-1.003642e-15
std,12.219454,14684.490603,639.850775,1.0
min,18.0,12518.612875,315.96383,-2.560987
25%,26.75,40689.408589,1026.038951,-0.6425827
50%,38.0,49719.768392,1364.627055,-0.02762372
75%,46.25,60624.020802,1687.170875,0.7149456
max,59.0,82380.92785,3691.551825,2.196571


In [29]:
# Date/Time Feature Extraction
df['signup_day'] = df['signup_date'].dt.day 

In [31]:
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5
...,...,...,...,...,...,...,...,...,...
95,96,59,42163.077807,2002-04-06,C,Bad Service,714.628437,-0.542227,6
96,97,56,50195.508496,2002-04-07,C,Loved it,896.348366,0.004774,7
97,98,58,55666.543880,2002-04-08,B,Not Good,959.767998,0.377346,8
98,99,45,50942.368996,2002-04-09,B,Loved it,1132.052644,0.055634,9


In [33]:
df['signup_month'] = df['signup_date'].dt.month 

In [35]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4,1
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1


In [37]:
df['signup_weekday'] = df['signup_date'].dt.weekday

In [39]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4,1,4
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5


In [43]:
df['days_from_signup'] =   (pd.Timestamp.today()- df['signup_date']).dt.days
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1,8499
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2,8498
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3,8497
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4,1,4,8496
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5,8495


In [47]:
df['years_from_signup'] =   ((pd.Timestamp.today()- df['signup_date']).dt.days)/365
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,1,8499,23.284932
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,2,8498,23.282192
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,3,8497,23.279452
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4,1,4,8496,23.276712
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,5,8495,23.273973


In [51]:
#Ecode Categorical Variables -> one-hot encoding
df_encoded=pd.get_dummies(df, columns = ['category'], prefix='cat')

In [55]:
df_encoded.head()

Unnamed: 0,id,age,income,signup_date,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup,cat_A,cat_B,cat_C
0,1,56,47476.923679,2002-01-01,Great Product,847.802209,-0.180359,1,1,1,8499,23.284932,False,False,True
1,2,46,67471.529683,2002-01-02,Great Product,1466.772384,1.181255,2,1,2,8498,23.282192,False,False,True
2,3,32,46313.784689,2002-01-03,Great Product,1447.305772,-0.259568,3,1,3,8497,23.279452,True,False,False
3,4,25,38454.983395,2002-01-04,Average Experience,1538.199336,-0.794745,4,1,4,8496,23.276712,True,False,False
4,5,38,68182.585132,2002-01-05,Bad Service,1794.278556,1.229677,5,1,5,8495,23.273973,False,False,True


In [59]:
#Text Feature Extraction(TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['feedback'])

In [65]:
print(tfidf_matrix)

  (0, 4)	0.7071067811865476
  (0, 8)	0.7071067811865476
  (1, 4)	0.7071067811865476
  (1, 8)	0.7071067811865476
  (2, 4)	0.7071067811865476
  (2, 8)	0.7071067811865476
  (3, 0)	0.7071067811865475
  (3, 2)	0.7071067811865475
  (4, 1)	0.7071067811865476
  (4, 9)	0.7071067811865476
  (5, 7)	0.7071067811865476
  (5, 3)	0.7071067811865476
  (6, 7)	0.7071067811865476
  (6, 3)	0.7071067811865476
  (7, 0)	0.7071067811865475
  (7, 2)	0.7071067811865475
  (8, 7)	0.7071067811865476
  (8, 3)	0.7071067811865476
  (9, 4)	0.7071067811865476
  (9, 8)	0.7071067811865476
  (10, 1)	0.7071067811865476
  (10, 9)	0.7071067811865476
  (11, 6)	0.7071067811865476
  (11, 5)	0.7071067811865476
  (12, 7)	0.7071067811865476
  :	:
  (87, 3)	0.7071067811865476
  (88, 7)	0.7071067811865476
  (88, 3)	0.7071067811865476
  (89, 1)	0.7071067811865476
  (89, 9)	0.7071067811865476
  (90, 6)	0.7071067811865476
  (90, 5)	0.7071067811865476
  (91, 4)	0.7071067811865476
  (91, 8)	0.7071067811865476
  (92, 1)	0.7071067811865476

In [67]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),columns=vectorizer.get_feature_names_out())

In [69]:
tfidf_df

Unnamed: 0,average,bad,experience,good,great,it,loved,not,product,service
0,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000
3,0.707107,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107
...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107
96,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107,0.000000,0.000000,0.000000
97,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000,0.000000
98,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107,0.000000,0.000000,0.000000


In [71]:
df_combined = pd.concat([df,tfidf_df], axis = 1)

In [73]:
df_combined.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,...,average,bad,experience,good,great,it,loved,not,product,service
0,1,56,47476.923679,2002-01-01,C,Great Product,847.802209,-0.180359,1,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
1,2,46,67471.529683,2002-01-02,C,Great Product,1466.772384,1.181255,2,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
2,3,32,46313.784689,2002-01-03,A,Great Product,1447.305772,-0.259568,3,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
3,4,25,38454.983395,2002-01-04,A,Average Experience,1538.199336,-0.794745,4,1,...,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,38,68182.585132,2002-01-05,C,Bad Service,1794.278556,1.229677,5,1,...,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107


In [79]:
print(df_combined[['feedback']+ list(tfidf_df.columns)].head())

             feedback   average       bad  experience  good     great   it  \
0       Great Product  0.000000  0.000000    0.000000   0.0  0.707107  0.0   
1       Great Product  0.000000  0.000000    0.000000   0.0  0.707107  0.0   
2       Great Product  0.000000  0.000000    0.000000   0.0  0.707107  0.0   
3  Average Experience  0.707107  0.000000    0.707107   0.0  0.000000  0.0   
4         Bad Service  0.000000  0.707107    0.000000   0.0  0.000000  0.0   

   loved  not   product   service  
0    0.0  0.0  0.707107  0.000000  
1    0.0  0.0  0.707107  0.000000  
2    0.0  0.0  0.707107  0.000000  
3    0.0  0.0  0.000000  0.000000  
4    0.0  0.0  0.000000  0.707107  
