# Preprocessing


In [107]:
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [108]:
fdata = pd.read_csv('../data/Dataset-SA.csv')
rdata = pd.read_csv(r'..\data\amazon_alexa.tsv', delimiter = '\t', quoting=3)

In [109]:
fdata.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [110]:
rdata.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"""Sometimes while playing a game, you can answe...",1
3,5,31-Jul-18,Charcoal Fabric,"""I have had a lot of fun with this thing. My 4...",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [111]:
feedback_counts = rdata['feedback'].value_counts()
feedback_counts

1    2893
0     257
Name: feedback, dtype: int64

In [112]:
# Separate majority and minority classes
feedback_0 = rdata[rdata['feedback'] == 0]
feedback_1 = rdata[rdata['feedback'] == 1]

# Oversample minority class
if len(feedback_0)+500 > len(feedback_1):
    feedback_1_upsampled = resample(feedback_1, replace=True, n_samples=len(feedback_0), random_state=42)
    balanced_data = pd.concat([feedback_0, feedback_1_upsampled])
else:
    feedback_0_upsampled = resample(feedback_0, replace=True, n_samples=len(feedback_1), random_state=42)
    balanced_data = pd.concat([feedback_1, feedback_0_upsampled])

balanced_data = balanced_data.sample(frac=1, random_state=42) 
rdata = balanced_data

In [113]:
rdata.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
2932,1,30-Jul-18,Black Dot,This device does not interact with my home fil...,0
2390,2,30-Jul-18,Configuration: Fire TV Stick,I would not recommend this to anyone. It won't...,0
1128,5,29-Jul-18,Black Spot,"""In a matter of minutes it became indispensabl...",1
1848,5,28-Jul-18,Black Plus,Works great in my large open room,1
159,5,30-Jul-18,Oak Finish,Versatile and fun !!!,1


In [114]:
rdata['length'] = rdata['verified_reviews'].apply(len)

In [115]:
rdata.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,length
2932,1,30-Jul-18,Black Dot,This device does not interact with my home fil...,0,89
2390,2,30-Jul-18,Configuration: Fire TV Stick,I would not recommend this to anyone. It won't...,0,333
1128,5,29-Jul-18,Black Spot,"""In a matter of minutes it became indispensabl...",1,539
1848,5,28-Jul-18,Black Plus,Works great in my large open room,1,33
159,5,30-Jul-18,Oak Finish,Versatile and fun !!!,1,21


In [116]:
rdata['feedback'].value_counts()

0    2893
1    2893
Name: feedback, dtype: int64

In [117]:
lb = LabelEncoder()
fdata['Sentiment']= lb.fit_transform(fdata['Sentiment'])

In [118]:
data

array([2, 2, 2, ..., 2, 2, 1])

In [119]:
fdata.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,2
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,2
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,2
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,0
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,1


In [120]:
fdata['Sentiment'].value_counts()

2    166581
0     28232
1     10239
Name: Sentiment, dtype: int64

In [121]:
fdata.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,2
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,2
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,2
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,0
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,1


In [134]:
from sklearn.utils import resample
sa_df = fdata
alexa_df = rdata
# Rename Sentiment to feedback
sa_df.rename(columns={'Sentiment': 'feedback', 'Review':'verified_reviews'}, inplace=True)

# Swap the values 1 and 2 in place
sa_df['feedback'].replace({1: 2, 2: 1}, inplace=True)


# # Select necessary columns
# sa_df_balanced = sa_df_balanced[['verified_reviews', 'feedback']]



# # Combine datasets
# combined_df = pd.concat([alexa_df, sa_df_balanced], ignore_index=True)

# combined_df.head(), combined_df['feedback'].value_counts()


In [135]:
sa_df.head()

Unnamed: 0,product_name,product_price,Rate,verified_reviews,Summary,feedback
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,1
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,1
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,1
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,0
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,2


In [136]:
sa_df['feedback'].value_counts()

1    166581
0     28232
2     10239
Name: feedback, dtype: int64

In [137]:
# Reduce skewness in feedback column
# Separate each class
positive_df = sa_df[sa_df['feedback'] == 1]
negative_df = sa_df[sa_df['feedback'] == 0]
neutral_df = sa_df[sa_df['feedback'] == 2]

In [138]:
# Determine the size of each class
n_samples = min(len(positive_df), len(negative_df), len(neutral_df))

In [139]:
# Downsample each class to match the smallest class
positive_df_downsampled = resample(positive_df, replace=False, n_samples=n_samples, random_state=42)
negative_df_downsampled = resample(negative_df, replace=False, n_samples=n_samples, random_state=42)
neutral_df_downsampled = resample(neutral_df, replace=False, n_samples=n_samples, random_state=42)

In [142]:
sa_df['verified_reviews'].value_counts()

wonderful                                     9016
not specified                                 8351
brilliant                                     5643
awesome                                       5636
great product                                 5622
                                              ... 
nice to see this product in very less rate       1
best deal in market !!                           1
amazing good for its price                       1
amazing product for its price!!!                 1
product looking very nice & strong quality       1
Name: verified_reviews, Length: 1324, dtype: int64

In [143]:
# Concatenate downsampled classes
sa_df_balanced = pd.concat([positive_df_downsampled, negative_df_downsampled, neutral_df_downsampled])

In [144]:
# Select necessary columns
sa_df_balanced = sa_df_balanced[['verified_reviews', 'feedback']]

In [145]:
# Ensure columns match and prepare for merging
alexa_df = alexa_df[['verified_reviews', 'feedback']]

In [146]:
# Combine datasets
combined_df = pd.concat([alexa_df, sa_df_balanced])

In [147]:
combined_df.shape

(36503, 2)

In [149]:
combined_df['feedback'].value_counts()

0    13132
1    13132
2    10239
Name: feedback, dtype: int64

In [150]:
combined_df.head()

Unnamed: 0,verified_reviews,feedback
2932,This device does not interact with my home fil...,0
2390,I would not recommend this to anyone. It won't...,0
1128,"""In a matter of minutes it became indispensabl...",1
1848,Works great in my large open room,1
159,Versatile and fun !!!,1


In [151]:
combined_df.to_csv('../data/combinedData')