In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the raw customer-feedback export from the Kaggle input directory.
# NOTE(review): the loaded frame shows an "Unnamed: 0" column, which looks like
# a previously saved index — confirm whether it should be read as the index.
feedback_csv = "/kaggle/input/original-customer/original_customer_feedback.csv"
cust_df = pd.read_csv(feedback_csv)
cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,<b>Bad service</b>,1.0,2024-02-30 13:00:00
2,3,103,support_call,,3.0,2024-03-15 14:00:00
3,3,103,support_call,,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,,invalid_date
6,6,106,support_call,Terrible! <a href='http://example.com'>Read mo...,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,,1.0,2024-09-10 19:00:00


# Task 1
## Remove duplicate rows based on the review_id column.

In [3]:
# keep=False flags every row that shares a review_id with another row (not
# only the later repeats), so the whole colliding group can be inspected.
is_repeated_id = cust_df.duplicated(subset='review_id', keep=False)
duplicates = cust_df[is_repeated_id]

print("Duplicate rows based on 'review_id':")
duplicates

Duplicate rows based on 'review_id':


Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
2,3,103,support_call,,3.0,2024-03-15 14:00:00
3,3,103,support_call,,3.0,2024-03-15 14:00:00


In [4]:
# De-duplicate on review_id, retaining the first row seen for each id
cust_df = cust_df.drop_duplicates(subset=['review_id'], keep='first')
cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,<b>Bad service</b>,1.0,2024-02-30 13:00:00
2,3,103,support_call,,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,,invalid_date
6,6,106,support_call,Terrible! <a href='http://example.com'>Read mo...,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,,1.0,2024-09-10 19:00:00


# Task 2
## Fill missing feedback_text values with "No feedback provided".

In [5]:
# Rows with no feedback_text receive a fixed placeholder string
feedback_is_missing = cust_df['feedback_text'].isna()
cust_df.loc[feedback_is_missing, 'feedback_text'] = 'No feedback provided'
cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,<b>Bad service</b>,1.0,2024-02-30 13:00:00
2,3,103,support_call,No feedback provided,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,,invalid_date
6,6,106,support_call,Terrible! <a href='http://example.com'>Read mo...,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,No feedback provided,1.0,2024-09-10 19:00:00


# Task 3
## Fill missing rating values with the median rating

In [6]:
# Locate the rows whose rating is missing
rating_is_missing = cust_df['rating'].isna()

# Median of the ratings that are present (median() skips NaN by default)
median_rating = cust_df['rating'].median()

# Write the median into the missing slots only; existing ratings are untouched
cust_df.loc[rating_is_missing, 'rating'] = median_rating

cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,<b>Bad service</b>,1.0,2024-02-30 13:00:00
2,3,103,support_call,No feedback provided,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,3.5,invalid_date
6,6,106,support_call,Terrible! <a href='http://example.com'>Read mo...,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,No feedback provided,1.0,2024-09-10 19:00:00


# Task 4
## Remove HTML tags from the feedback_text column.

In [7]:
from bs4 import BeautifulSoup

# Helper that strips HTML markup from a single feedback value
def remove_html_tags(text):
    """Return ``text`` with its HTML tags removed.

    Values that ``pd.isna`` reports as missing are handed back unchanged.
    Anything else is parsed with BeautifulSoup's "html.parser" and the
    result of ``get_text()`` is returned.
    """
    if not pd.isna(text):
        return BeautifulSoup(text, "html.parser").get_text()
    return text

# Run every feedback_text entry through remove_html_tags, then store the
# stripped values back into the same column
cleaned_feedback = cust_df['feedback_text'].apply(remove_html_tags)
cust_df.loc[:, 'feedback_text'] = cleaned_feedback

cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,Bad service,1.0,2024-02-30 13:00:00
2,3,103,support_call,No feedback provided,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,3.5,invalid_date
6,6,106,support_call,Terrible! Read more,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,No feedback provided,1.0,2024-09-10 19:00:00


# Task 5
## Convert timestamp values to a consistent datetime format.

In [9]:
# Convert timestamp values to a consistent datetime format.
# errors='coerce' turns entries that cannot be parsed (e.g. "invalid_date" or
# the impossible "2024-02-30 13:00:00") into NaT instead of raising.
# The column is replaced with assign() rather than written through
# .loc[:, 'timestamp']: a full-column .loc assignment can write into the
# existing object-dtype column in place and leave it as object, whereas
# replacing the column gives it a real datetime64 dtype.
cust_df = cust_df.assign(
    timestamp=pd.to_datetime(cust_df['timestamp'], errors='coerce')
)
cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
1,2,102,survey,Bad service,1.0,NaT
2,3,103,support_call,No feedback provided,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
5,5,105,survey,Good value,3.5,NaT
6,6,106,support_call,Terrible! Read more,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,No feedback provided,1.0,2024-09-10 19:00:00


# Task 6
## Remove rows with invalid dates.

In [10]:
# Keep only the rows whose timestamp is present; rows where it is missing
# (NaT) are discarded
cust_df = cust_df.dropna(axis=0, how='any', subset=['timestamp'])
cust_df

Unnamed: 0,review_id,customer_id,feedback_source,feedback_text,rating,timestamp
0,1,101,online_review,Great product!,5.0,2024-01-01 12:00:00
2,3,103,support_call,No feedback provided,3.0,2024-03-15 14:00:00
4,4,104,online_review,Okay experience,4.0,2024-04-10 15:00:00
6,6,106,support_call,Terrible! Read more,2.0,2024-06-20 16:00:00
7,7,107,online_review,Not bad,4.0,2024-07-25 17:00:00
8,8,108,survey,Excellent! Will buy again.,5.0,2024-08-30 18:00:00
9,9,109,support_call,No feedback provided,1.0,2024-09-10 19:00:00
