In [1]:
# Importing Feature Engineering Functions

from Feature_Functions import (
    calculate_helpful_ratio,
    count_pos_tags,
    word_count,
    sentence_count,
    average_words_per_sentence,
    title_length,
    calculate_flesch_kincaid,
    calculate_review_extremity,
    calculate_elapsed_time,
    image_check,
    extract_timestamp,
    feature_building
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulahofmann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import pandas as pd
# Read the hedonic review data from Meta_senti_hedonic.csv and discard the
# 'Unnamed: 0' column (presumably an index written by an earlier export --
# TODO confirm).
data_hedonic_total = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/DataBase/WithMeta/Meta_senti_hedonic.csv'
).drop(columns=['Unnamed: 0'])



## 1. Building Features 

Features are built for each product category and product using the automated `feature_building` function from the `Feature_Functions` module, which adds the 12 features required for model training to the data.

These are the added features:
* Helpful Ratio (HR):
Calculates the ratio of helpful votes for each review relative to the total helpful votes across all reviews.
* POS Tag Counts:
Counts the number of adverbs, adjectives, and nouns in each review text.
* Word Count:
Calculates the total number of words in each review text.
* Sentence Count:
Counts the total number of sentences in each review text.
* Average Words per Sentence:
Calculates the average number of words per sentence in each review text.
* Title Length (TL):
Counts the number of characters in the title of each review. If the title is empty or consists only of special characters, it sets the length to 1.
* Flesch-Kincaid Readability Score:
Calculates the Flesch-Kincaid readability score for each review text.
* Review Extremity:
Calculates the difference between the review rating and the average product rating.
* Elapsed Time:
Calculates the elapsed time (in days) since each review was posted.
* Image Check:
Checks whether each review contains images and assigns a binary value (0 for no images, 1 for images).

In [5]:
# Remove every review whose 'text' or 'title_x' value is NaN; a row is
# dropped as soon as either of the two columns is missing.
data_hedonic_total = data_hedonic_total.dropna(subset=['text', 'title_x'])

#data_utilitarian_razor = data_utilitarian_razor.dropna(subset=['text'])
#data_utilitarian_razor = data_utilitarian_razor.dropna(subset=['title_x'])

#data_utilitarian_filter = data_utilitarian_filter.dropna(subset=['text'])
#data_utilitarian_filter = data_utilitarian_filter.dropna(subset=['title_x'])

In [6]:
# Convert the 'timestamp' column to pandas datetime values.
data_hedonic_total['timestamp'] = data_hedonic_total['timestamp'].pipe(pd.to_datetime)

In [7]:
# Run the feature-building pipeline on the complete hedonic data set; the
# returned frame is displayed as the cell output.
# NOTE(review): the recorded output shows title_length == 1 for multi-word
# titles (e.g. "Helps those pesky hang nails") and an empty F-K_score for most
# displayed rows -- verify the corresponding helpers in Feature_Functions.
feature_building(data_hedonic_total)

Unnamed: 0,parent_asin,rating,title_x,text,images,asin,user_id,timestamp,helpful_vote,verified_purchase,...,avg_words_per_sentence,title_length,F-K_score,review_extremity,elapsed_time_days,image,year,month,day,hour
0,B09GWLJPTH,5.0,Wow,I’ve never used cuticle oil before so I don’t ...,[],B00F644L32,AEYORY2AVPMCPDV57CE337YU5LXA,2020-06-07 23:20:39.290,0,True,...,11.250000,3,,0.3,1186,0,2020,6,7,23
1,B09GWLJPTH,5.0,Helps those pesky hang nails,I’ve got this oil in dropper bottles scattered...,[],B00F644LTQ,AEYS7TG2WOZAB7NLRVGTM5XZIG3Q,2019-10-19 02:49:44.538,0,True,...,14.500000,1,,0.3,1419,0,2019,10,19,2
2,B09GWLJPTH,4.0,I so see improvement,"I have seen a difference in my cuticles, the l...",[],B00F644LTQ,AGBALMWEDCZRIX2QENGWDZET3EFQ,2022-06-21 13:49:36.220,0,True,...,39.000000,1,,-0.7,442,0,2022,6,21,13
3,B09GWLJPTH,5.0,Great price for a big bottle.,I was surprised when I got this bottle didn't ...,[],B00F644LTQ,AE7ZR5DNB5MPPJ3KNECRHSDEDPUQ,2020-03-26 16:13:00.395,0,True,...,13.666667,1,,0.3,1259,0,2020,3,26,16
4,B09GWLJPTH,4.0,Softens,Smells great and does soften the cuticles but ...,[],B00F644LTQ,AFV22L7AEKI2LW6HMLRLUKNYVBGQ,2020-09-11 13:35:18.780,0,True,...,22.000000,7,,-0.7,1090,0,2020,9,11,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30490,B0BLGN9N39,5.0,Smell like a goddess,This is a game changer Sarah Moe sent you’ll b...,[],B00EJDG7XI,AFQEYDFPFWJJ45A3IYJL5GY64GGA,2023-05-24 14:31:14.382,0,True,...,21.000000,1,,1.1,105,0,2023,5,24,14
30491,B0BLGN9N39,1.0,It stinks.,I got this product to see if my husband would ...,[],B00EJDG7XI,AEAVEGPFLZXERXUCOWT3Z24BRUVQ,2023-04-26 03:31:41.502,1,True,...,16.285714,1,6.122571,-2.9,134,0,2023,4,26,3
30492,B0BLGN9N39,5.0,10/10,"I bought this, thinking “why not” and IT WORKS...",[],B00EJDG7XI,AFMCUM3RKLUFNPRAOPUUUVGXVZYA,2023-08-28 17:34:00.320,0,True,...,12.333333,1,,1.1,9,0,2023,8,28,17
30493,B0BLGN9N39,5.0,Smells great!,I'm not sure if it actually works but it's sme...,[],B00EJDG7XI,AEQZNTPDETRO54BTPKLDUCBXBBOA,2023-04-21 17:36:25.631,1,True,...,14.000000,1,,1.1,138,0,2023,4,21,17


In [8]:
# Write the hedonic frame to Features_hedonic_total.csv without the row index.
data_hedonic_total.to_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_hedonic_total.csv',
    index=False,
)

In [None]:
# Run the feature-building pipeline on the utilitarian "filter" data set.
# NOTE(review): data_utilitarian_filter is not assigned in any visible cell of
# this notebook and this cell was never executed; on a fresh kernel it raises
# NameError unless the frame is loaded first.
feature_building(data_utilitarian_filter)


In [None]:
# Write the utilitarian "filter" frame to Features_utilitarian_filter.csv
# without the row index.
# NOTE(review): data_utilitarian_filter is not assigned in any visible cell.
data_utilitarian_filter.to_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_filter.csv',
    index=False,
)

In [None]:
# Run the feature-building pipeline on the utilitarian "razor" data set.
# NOTE(review): data_utilitarian_razor is not assigned in any visible cell of
# this notebook and this cell was never executed; on a fresh kernel it raises
# NameError unless the frame is loaded first.
feature_building(data_utilitarian_razor)

In [None]:
# Write the utilitarian "razor" frame to Features_utilitarian_razor.csv
# without the row index.
# NOTE(review): data_utilitarian_razor is not assigned in any visible cell.
data_utilitarian_razor.to_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_razor.csv',
    index=False,
)

In [None]:
# Column names used as model inputs, in a fixed order, followed by the name of
# the target column.
input_features = [
    # Review / product columns (presumably already present in the loaded CSV
    # -- TODO confirm)
    'rating',
    'rating_number',
    'timestamp',
    'sentiment',
    'price',
    # Part-of-speech tag counts
    'noun_count',
    'adj_count',
    'adv_count',
    # Length and readability measures
    'word_count',
    'sentence_count',
    'avg_words_per_sentence',
    'title_length',
    'F-K_score',
    # Rating extremity, review age and image flag
    'review_extremity',
    'elapsed_time_days',
    'image',
    # Calendar components of the timestamp
    'year',
    'month',
    'day',
    'hour',
]

# Target column to be predicted
output_feature = 'helpful_ratio'