In [1]:
# Importing Feature Engineering Functions

from Feature_Functions import (
    calculate_helpful_ratio,
    count_pos_tags,
    word_count,
    sentence_count,
    average_words_per_sentence,
    title_length,
    calculate_flesch_kincaid,
    calculate_review_extremity,
    calculate_elapsed_time,
    image_check,
    extract_timestamp,
    feature_building
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulahofmann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Importing Libraries
import pandas as pd


In [3]:
# Importing Data
data_hedonic_parfum = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/DataBase/Without Meta/WithMeta/Meta_hedonic_parfum.csv')
data_utilitarian_razor = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/DataBase/Without Meta/WithMeta/Meta_utilitarian_filter.csv')
data_utilitarian_filter = pd.read_csv ('/Users/paulahofmann/Documents/Coding/Online-Review/DataBase/Without Meta/WithMeta/Meta_utilitarian_razor.csv')

## 1. Building Features 

Building features for each Product Category and Product, using automatically feature building function from the modul Feature Functions, which adds the necessary 12 Features for Model Training to the function. 

These are the added features:
* Helpful Ratio (HR):
Calculates the ratio of helpful votes for each review relative to the total helpful votes across all reviews.
* POS Tag Counts:
Counts the number of adverbs, adjectives, and nouns in each review text.
* Word Count:
Calculates the total number of words in each review text.
* Sentence Count:
Counts the total number of sentences in each review text.
* Average Words per Sentence:
Calculates the average number of words per sentence in each review text.
* Title Length (TL):
Counts the number of characters in the title of each review. If the title is empty or consists only of special characters, it sets the length to 1.
* Flesch-Kincaid Readability Score:
Calculates the Flesch-Kincaid readability score for each review text.
* Review Extremity:
Calculates the difference between the review rating and the average product rating.
* Elapsed Time:
Calculates the elapsed time (in days) since each review was posted.
* Image Check:
Checks whether each review contains images and assigns a binary value (0 for no images, 1 for images).

In [4]:
# Checking for NaN Values in the text column and deleting them
data_hedonic_parfum = data_hedonic_parfum.dropna(subset=['text'])
data_hedonic_parfum = data_hedonic_parfum.dropna(subset=['title_x'])

data_utilitarian_razor = data_utilitarian_razor.dropna(subset=['text'])
data_utilitarian_razor = data_utilitarian_razor.dropna(subset=['title_x'])

data_utilitarian_filter = data_utilitarian_filter.dropna(subset=['text'])
data_utilitarian_filter = data_utilitarian_filter.dropna(subset=['title_x'])

In [6]:
data_hedonic_parfum['timestamp'] = pd.to_datetime(data_hedonic_parfum['timestamp'])
data_utilitarian_filter['timestamp'] = pd.to_datetime(data_utilitarian_filter['timestamp'])
data_utilitarian_razor['timestamp'] = pd.to_datetime(data_utilitarian_razor['timestamp'])


In [7]:
# Adding Features to Data Hedonic Parfum
feature_building (data_hedonic_parfum) 

Unnamed: 0,rating,title_x,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,avg_words_per_sentence,title_length,F-K_score,review_extremity,elapsed_time_days,image,year,month,day,hour
0,2.0,Smells odd,It’s okay not the greatest and doesn’t have an...,[],B00EJDG7XI,B0BLGN9N39,AFSCQMP2EPYLGJN7OBTMEXCBE2OQ,2020-12-26 18:49:48.813,0,True,...,19.000000,1,,-1.9,975,0,2020,12,26,18
1,3.0,Didn’t see the value,I loved the scent going on but in about 3-5 mi...,[],B00EJDG7XI,B0BLGN9N39,AGZMNRT6ZFT7DFQDDTL2NWJGECZQ,2022-03-14 23:32:37.552,0,True,...,16.750000,1,,-0.9,531,0,2022,3,14,23
2,5.0,These men will be all over you. Be careful sis💕,Y’all I barely put a few dabs on my neck and m...,[],B00EJDG7XI,B0BLGN9N39,AEGQBHPAOKIRSBB47KZGTDPPNUTA,2022-03-23 01:34:11.093,17,True,...,16.555556,1,5.529499,1.1,523,0,2022,3,23,1
3,5.0,WHAT?!,Ummm... Sorcery. Sorcery! This is magic and I'...,[],B00EJDG7XI,B0BLGN9N39,AF6L6GCILOYQXOUT2K2MJD7EQFXA,2021-05-25 14:33:14.191,3,True,...,11.466667,1,2.212667,1.1,825,0,2021,5,25,14
4,3.0,Good,Nice throwback to the 2000s,[],B00EJDG7XI,B0BLGN9N39,AHVZVOTCYCNGJIBJC2F5HOYFBKVA,2022-10-20 22:14:00.842,0,True,...,5.000000,4,,-0.9,311,0,2022,10,20,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,5.0,Smell like a goddess,This is a game changer Sarah Moe sent you’ll b...,[],B00EJDG7XI,B0BLGN9N39,AFQEYDFPFWJJ45A3IYJL5GY64GGA,2023-05-24 14:31:14.382,0,True,...,21.000000,1,,1.1,96,0,2023,5,24,14
266,1.0,It stinks.,I got this product to see if my husband would ...,[],B00EJDG7XI,B0BLGN9N39,AEAVEGPFLZXERXUCOWT3Z24BRUVQ,2023-04-26 03:31:41.502,1,True,...,16.285714,1,6.122571,-2.9,124,0,2023,4,26,3
267,5.0,10/10,"I bought this, thinking “why not” and IT WORKS...",[],B00EJDG7XI,B0BLGN9N39,AFMCUM3RKLUFNPRAOPUUUVGXVZYA,2023-08-28 17:34:00.320,0,True,...,12.333333,1,,1.1,0,0,2023,8,28,17
268,5.0,Smells great!,I'm not sure if it actually works but it's sme...,[],B00EJDG7XI,B0BLGN9N39,AEQZNTPDETRO54BTPKLDUCBXBBOA,2023-04-21 17:36:25.631,1,True,...,14.000000,1,,1.1,129,0,2023,4,21,17


In [8]:
data_hedonic_parfum.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_hedonic_parfum.csv', index=False)

In [9]:
# Adding Features to Data Utilitarian Filter
feature_building (data_utilitarian_filter) 


Unnamed: 0,rating,title_x,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,avg_words_per_sentence,title_length,F-K_score,review_extremity,elapsed_time_days,image,year,month,day,hour
0,5.0,Easy to store and easy to use,Just threw away 2 ladies electric shavers. 1 b...,[],B0018A32XS,B0018A32XS,AHZ6XMOLEWA67S3TX7IWEXXGWSOA,2016-06-28 17:28:27.000,1,True,...,10.714286,1,,0.7,2606,0,2016,6,28,17
1,5.0,Very nice product.,Sharp product. Easy to use and well made.,[],B0018A32XS,B0018A32XS,AHACLF2COQQE2V33ZFXQ7THZOJ2Q,2021-08-13 11:43:39.775,0,True,...,5.000000,1,,0.7,735,0,2021,8,13,11
2,4.0,Excellent electric razor,I bought this one for my 13 year old granddaug...,[],B0018A32XS,B0018A32XS,AGHXVVTYB746KOUEYN7DT65QSDRA,2020-07-19 00:11:37.350,0,True,...,12.666667,1,,-0.3,1125,0,2020,7,19,0
3,5.0,very nice. works well,this is an excellent razor. it caught and rem...,[],B0018A32XS,B0018A32XS,AGQFGXUGFSZHNRQGP7J24RVSLZSA,2015-01-21 21:26:44.000,1,True,...,13.000000,1,,0.7,3130,0,2015,1,21,21
4,5.0,Panasonic Electric Razor for Women,Love this electric razor....it did my research...,[],B0018A32XS,B0018A32XS,AEXTSZOMHUDQZ46764RKBNZ2WANA,2016-06-15 15:10:42.000,1,True,...,11.000000,1,,0.7,2619,0,2016,6,15,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11359,5.0,Love It!,Love It!!! Love It!!! Especially when u have n...,[],B0018A32XS,B0018A32XS,AGV2AYUQOVGH2VUAQGVRPVABELJQ,2014-08-30 18:47:45.000,1,True,...,7.750000,1,,0.7,3274,0,2014,8,30,18
11360,4.0,"A good razer, for the price.",It shaves what I need it to and have my skin s...,[],B0018A32XS,B0018A32XS,AFIC2ITMDW3ANECRHYYECHSS4TWA,2019-12-16 01:05:58.913,0,True,...,12.000000,1,,-0.3,1341,0,2019,12,16,1
11361,4.0,Four Stars,Works great! Bette than a razor.,[],B0018A32XS,B0018A32XS,AGUGEKPKCXM35CIIXAN7KHRQTKTA,2015-06-03 17:09:47.000,0,True,...,4.000000,1,,-0.3,2997,0,2015,6,3,17
11362,4.0,Four Stars,Works great!!,[],B0018A32XS,B0018A32XS,AEH7V5X45MTG6AF5KXFFKUOVC74Q,2017-05-18 15:23:27.000,0,True,...,2.000000,1,,-0.3,2282,0,2017,5,18,15


In [16]:
data_utilitarian_filter.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_filter.csv',index=False)

In [10]:
# Adding Features to Data Utilitarian Razor
feature_building (data_utilitarian_razor) 

Unnamed: 0,rating,title_x,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,avg_words_per_sentence,title_length,F-K_score,review_extremity,elapsed_time_days,image,year,month,day,hour
0,5.0,Five Stars,Perfect! Delivered right to my door—no going f...,[],B00UXG4WR8,B00UXG4WR8,AFSJGCYOYHJGG2VBYGB7WLKHQ37A,2018-04-19 13:13:18.309,0,True,...,9.000000,1,,0.3,1953,0,2018,4,19,13
1,5.0,Easy setup and works!,I love how easy this filter replacement is! It...,[],B00UXG4WR8,B00UXG4WR8,AEM54G6GVSBCFVMEWUKELLDH6CEQ,2021-08-16 22:38:55.964,0,True,...,9.333333,1,,0.3,737,0,2021,8,16,22
2,5.0,Too expensive,Works great but costs too much,[],B00UXG4WR8,B00UXG4WR8,AGKSSNLDIG5HY5FXYVE6UVXKVDYQ,2020-12-23 19:18:46.680,0,True,...,6.000000,1,,0.3,973,0,2020,12,23,19
3,5.0,It’s easy to install,Last long and works great,[],B00UXG4WR8,B00UXG4WR8,AHU2GG5RF6YAEWUFNLH3QH5RHDNQ,2020-01-09 15:48:39.341,0,True,...,5.000000,1,,0.3,1323,0,2020,1,9,15
4,2.0,"It's great when it works, but the last two of ...",We've purchased four of these from Amazon. The...,[],B00UXG4WR8,B00UXG4WR8,AGW2YFQE4AOEHCE747ESDJYULJMA,2022-03-12 23:28:36.573,0,True,...,29.250000,1,8.882143,-2.7,529,0,2022,3,12,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860,1.0,Not working,The product stopped working after three months...,[],B00UXG4WR8,B00UXG4WR8,AGRUOYDGGRXUBMCG7DQKOTIV4PNA,2017-07-26 16:16:33.890,0,True,...,15.000000,1,,-3.7,2220,0,2017,7,26,16
5861,5.0,Know the model number of the refrigerator in w...,It matched the filter that came from Whirlpool...,[],B00UXG4WR8,B00UXG4WR8,AGT7DE7EATCGZL4AO2GIEO7JA62A,2018-10-18 19:20:17.982,0,True,...,14.000000,1,,0.3,1770,0,2018,10,18,19
5862,5.0,It works,Expensive but it is the original filter for Wh...,[],B00UXG4WR8,B00UXG4WR8,AEDL4P5GG6LWVFUCQZPS7YK56RCQ,2023-07-02 10:40:24.419,0,True,...,9.000000,1,,0.3,53,0,2023,7,2,10
5863,4.0,Filter...,It was delivered on time and was what I needed.,[],B00UXG4WR8,B00UXG4WR8,AHEGMOPGALEKZJSAXWDTW7Y27GNA,2018-01-29 18:28:43.299,0,True,...,11.000000,1,,-0.7,2032,0,2018,1,29,18


In [15]:
data_utilitarian_razor.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/ModelPreperation/Features_utilitarian_razor.csv',index=False)

In [14]:
## Summarizing all Features in a List

input_features = ['rating','rating_number','timestamp', 'sentiment', 'price', 'noun_count', 'adj_count', 'adv_count', 'word_count', 
                  'sentence_count', 'avg_words_per_sentence', 'title_length', 'F-K_score', 'review_extremity', 
                  'elapsed_time_days', 'image', 'year','month','day','hour']

output_feature = 'helpful_ratio'