In [1]:
import pandas as pd

In [3]:
# Toy review table written one review per row so it reads like the printed
# frame. The text columns are intentionally messy (mixed casing, stray
# leading/trailing spaces, abbreviations) so the cleaning cells below have
# something to fix. The rows are transposed back into a column -> values dict.
review_data = {
    column_name: list(column_values)
    for column_name, column_values in zip(
        ('ReviewID', 'Product', 'Category', 'Reviewer_Location', 'Rating'),
        zip(
            (1, 'Laptop', 'Electronics', 'New York', 5),
            (2, 'Mouse', 'electronics', 'NY', 4),
            (3, 'Keyboard', 'ELECTRONICS ', '  new york', 5),
            (4, 'Monitor', 'Home goods', 'California', 3),
            (5, 'Webcam', 'Home Goods', 'CA', 4),
            (6, 'Laptop', 'Electronics', 'New York', 5),
            (7, 'Mouse', 'ElecTRONICS', 'ca', 4),
        ),
    )
}
df_reviews = pd.DataFrame(data=review_data)

In [4]:
# Show the untouched frame, then its schema, so the inconsistencies are on
# record before any cleaning step runs.
print(
    "Original Review Data (with inconsistencies):",
    df_reviews,
    "\nOriginal Data Info:",
    sep="\n",
)
df_reviews.info()


Original Review Data (with inconsistencies):
   ReviewID   Product      Category Reviewer_Location  Rating
0         1    Laptop   Electronics          New York       5
1         2     Mouse   electronics                NY       4
2         3  Keyboard  ELECTRONICS           new york       5
3         4   Monitor    Home goods        California       3
4         5    Webcam    Home Goods                CA       4
5         6    Laptop   Electronics          New York       5
6         7     Mouse   ElecTRONICS                ca       4

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ReviewID           7 non-null      int64 
 1   Product            7 non-null      object
 2   Category           7 non-null      object
 3   Reviewer_Location  7 non-null      object
 4   Rating             7 non-null      int64 
dtypes: i

In [5]:
# Tally every distinct raw spelling of the category label.
print(
    "Unique values and counts in 'Category' (before cleaning):",
    df_reviews['Category'].value_counts(),
    sep="\n",
)

Unique values and counts in 'Category' (before cleaning):
Category
Electronics     2
electronics     1
ELECTRONICS     1
Home goods      1
Home Goods      1
ElecTRONICS     1
Name: count, dtype: int64


In [6]:
# Tally every distinct raw spelling of the reviewer location.
print(
    "\nUnique values and counts in 'Reviewer_Location' (before cleaning):",
    df_reviews['Reviewer_Location'].value_counts(),
    sep="\n",
)



Unique values and counts in 'Reviewer_Location' (before cleaning):
Reviewer_Location
New York      2
NY            1
  new york    1
California    1
CA            1
ca            1
Name: count, dtype: int64


In [9]:
# Exact-match replacement: only cells equal to 'Home goods' are rewritten, so
# the differently-cased 'electronics' variants are left as they are.
# `assign` returns a new frame, leaving df_reviews untouched.
df_corrected_replace = df_reviews.assign(
    Category=df_reviews['Category'].replace(
        to_replace='Home goods', value='Home Goods'
    )
)

print(
    "Category values after simple replace:",
    df_corrected_replace['Category'].value_counts(),
    sep="\n",
)


Category values after simple replace:
Category
Electronics     2
Home Goods      2
electronics     1
ELECTRONICS     1
ElecTRONICS     1
Name: count, dtype: int64


In [10]:
# Independent working copy for the text clean-up cells that follow;
# df_reviews itself stays unmodified.
df_cleaned_text = df_reviews.copy(deep=True)

In [22]:
# Lower-case the labels, then trim surrounding whitespace, so spellings that
# differ only in case or padding collapse to one value.
df_cleaned_text['Category'] = (
    df_cleaned_text['Category']
    .str.lower()
    .str.strip()
)


In [23]:
# Same case/whitespace normalisation as for 'Category'. Abbreviations such
# as 'ny' are still distinct from 'new york' after this step.
df_cleaned_text['Reviewer_Location'] = (
    df_cleaned_text['Reviewer_Location']
    .str.lower()
    .str.strip()
)

In [24]:
# Re-count both text columns to confirm the case/whitespace variants merged.
print(
    "\nCategory values after lowercasing and stripping whitespace:",
    df_cleaned_text['Category'].value_counts(),
    sep="\n",
)

print(
    "\nReviewer_Location values after lowercasing and stripping whitespace:",
    df_cleaned_text['Reviewer_Location'].value_counts(),
    sep="\n",
)


Category values after lowercasing and stripping whitespace:
Category
electronics    5
home goods     2
Name: count, dtype: int64

Reviewer_Location values after lowercasing and stripping whitespace:
Reviewer_Location
new york      3
ca            2
ny            1
california    1
Name: count, dtype: int64


In [28]:
# Lookup from each lower-cased, stripped spelling to the canonical display
# name it should collapse into. Written canonical-name-first so adding a new
# variant is a one-word change.
location_standardization_mapping = {
    variant: canonical_name
    for canonical_name, variants in (
        ('New York', ('ny', 'new york')),
        ('California', ('ca', 'california')),
    )
    for variant in variants
}

In [29]:
# Swap every known spelling for its canonical name. `replace` leaves values
# that are not keys of the mapping unchanged, and `assign` returns a new
# frame so df_cleaned_text is not modified.
df_standardized_location = df_cleaned_text.assign(
    Reviewer_Location=df_cleaned_text['Reviewer_Location'].replace(
        location_standardization_mapping
    )
)


In [30]:
# Confirm the locations collapsed to canonical names, then show the frame.
print(
    "\nReviewer_Location values after mapping abbreviations to full names:",
    df_standardized_location['Reviewer_Location'].value_counts(),
    sep="\n",
)
print(
    "\nDataFrame after location standardization:",
    df_standardized_location,
    sep="\n",
)



Reviewer_Location values after mapping abbreviations to full names:
Reviewer_Location
New York      4
California    3
Name: count, dtype: int64

DataFrame after location standardization:
   ReviewID   Product     Category Reviewer_Location  Rating
0         1    Laptop  electronics          New York       5
1         2     Mouse  electronics          New York       4
2         3  Keyboard  electronics          New York       5
3         4   Monitor   home goods        California       3
4         5    Webcam   home goods        California       4
5         6    Laptop  electronics          New York       5
6         7     Mouse  electronics        California       4


In [31]:
def standardize_product_name_custom(product_name):
    """Tidy a product name and tag known products with a family suffix.

    String inputs are stripped of surrounding whitespace and passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).
    If the tidied name is one of the known products, a family suffix is
    appended:

    * ``Laptop``, ``Monitor``            -> ``<name>_Electronic``
    * ``Mouse``, ``Keyboard``, ``Webcam`` -> ``<name>_Peripheral``

    Any other string is returned tidied but without a suffix.

    Args:
        product_name: The raw product label. Non-string values (e.g. ``None``
            or numbers) are returned unchanged.

    Returns:
        The standardized name for string input, otherwise the input itself.
    """
    # Tidied name -> suffix to append; names absent from the table get none.
    suffix_for_product = {
        'Laptop': '_Electronic',
        'Monitor': '_Electronic',
        'Mouse': '_Peripheral',
        'Keyboard': '_Peripheral',
        'Webcam': '_Peripheral',
    }

    if not isinstance(product_name, str):
        return product_name

    tidy_name = product_name.strip().capitalize()
    return tidy_name + suffix_for_product.get(tidy_name, '')

In [None]:
# Run the custom rule over the raw product names. This starts from
# df_reviews, so the other columns keep their original, uncleaned values.
df_custom_standardized = df_reviews.assign(
    Product=df_reviews['Product'].apply(standardize_product_name_custom)
)

print(
    "\nProduct names after applying custom standardization logic:",
    df_custom_standardized['Product'].value_counts(),
    sep="\n",
)
print(
    "\nDataFrame after custom product standardization:",
    df_custom_standardized,
    sep="\n",
)



Product names after applying custom standardization logic:
Product
Laptop_Electronic      2
Mouse_Peripheral       2
Keyboard_Peripheral    1
Monitor_Electronic     1
Webcam_Peripheral      1
Name: count, dtype: int64

DataFrame after custom product standardization:
   ReviewID              Product      Category Reviewer_Location  Rating
0         1    Laptop_Electronic   Electronics          New York       5
1         2     Mouse_Peripheral   electronics                NY       4
2         3  Keyboard_Peripheral  ELECTRONICS           new york       5
3         4   Monitor_Electronic    Home goods        California       3
4         5    Webcam_Peripheral    Home Goods                CA       4
5         6    Laptop_Electronic   Electronics          New York       5
6         7     Mouse_Peripheral   ElecTRONICS                ca       4
