In [19]:
import os

import pandas as pd

# Location of the product-review CSV. Setting the PRODUCT_REVIEW_CSV
# environment variable overrides the default, so the notebook can run on a
# machine other than the author's; when it is unset the original local path
# is used unchanged.
file_path = os.environ.get(
    'PRODUCT_REVIEW_CSV',
    r'C:\Users\Dell\Desktop\HCAI5DS02_SauravKafle\venv\product_review_dataset.csv',
)

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Normalise the headers (trim surrounding whitespace, lower-case) so the
# later cells can refer to columns by fixed lower-case names
df.columns = df.columns.str.strip().str.lower()


In [20]:
# Show the cleaned column names as a plain Python list
print(list(df.columns))



['defective', 'highreturn', 'reviewrating', 'hascomplaint', 'verifiedpurchase']


In [21]:
# Prior probability of a defect: the mean of the 'defective' flag column
p_defective = df.loc[:, 'defective'].mean()
print("P(defective):", round(p_defective, 4))


P(defective): 0.0961


In [22]:
# Mean review rating within each defect class, selected with boolean masks
avg_def = df.loc[df['defective'] == True, 'reviewrating'].mean()
avg_nondef = df.loc[df['defective'] == False, 'reviewrating'].mean()

print(f"Average Rating (Defective): {avg_def:.2f}")
print(f"Average Rating (Non-Defective): {avg_nondef:.2f}")


Average Rating (Defective): 2.48
Average Rating (Non-Defective): 4.18


In [23]:


# Likelihoods: mean of the high-return flag within each defect class
p_highreturn_given_def = df.loc[df['defective'] == True, 'highreturn'].mean()
p_highreturn_given_nondef = df.loc[df['defective'] == False, 'highreturn'].mean()

# Marginals over the whole dataset: P(highreturn) and P(defective)
p_highreturn = df['highreturn'].mean()
p_defective = df['defective'].mean()

# Bayes' theorem:
#   P(defective | highreturn) = P(highreturn | defective) * P(defective) / P(highreturn)
p_def_given_highreturn = p_highreturn_given_def * p_defective / p_highreturn

print(f"P(HighReturn | Defective): {p_highreturn_given_def:.2%}")
print(f"P(HighReturn | Not Defective): {p_highreturn_given_nondef:.2%}")
print(f"P(Defective | HighReturn): {p_def_given_highreturn:.2%}")


P(HighReturn | Defective): 69.61%
P(HighReturn | Not Defective): 10.26%
P(Defective | HighReturn): 41.92%


In [24]:
# Conditional and marginal probabilities, each taken as a column mean
p_high_given_def = df.loc[df['defective'] == 1, 'highreturn'].mean()
p_high_given_notdef = df.loc[df['defective'] == 0, 'highreturn'].mean()
p_high_return = df['highreturn'].mean()
p_def = df['defective'].mean()

# Bayes' theorem: P(Defective | HighReturn)
p_def_given_high = p_high_given_def * p_def / p_high_return

print("2.1 P(HighReturn | Defective):", round(p_high_given_def, 4))
print("    P(HighReturn | Not Defective):", round(p_high_given_notdef, 4))
print("    P(Defective | HighReturn):", round(p_def_given_high, 4))


2.1 P(HighReturn | Defective): 0.6961
    P(HighReturn | Not Defective): 0.1026
    P(Defective | HighReturn): 0.4192


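If P(Defective | HighReturn) ≈ 0.4192:
 Only ~42% of high-return items are defective, so the majority (~58%) of high-return items are not defective.
Those returns must have other causes, such as preference or wrong sizing (the dataset does not record return reasons, so these are examples, not findings).
Even so, 42% is more than four times the 9.6% base rate of defects, so a high return rate is a useful warning sign but not proof of a defect.
Action: Do NOT recall based on high return alone. Use multiple factors.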


In [25]:
def calculate_risk(row, rating_threshold=2):
    """Count how many defect warning signs a product record shows.

    One point is added for each of the following that holds:

    * ``row['highreturn']`` equals 1,
    * ``row['reviewrating']`` is at or below ``rating_threshold``,
    * ``row['hascomplaint']`` is present (not missing) and truthy.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``'highreturn'``,
        ``'reviewrating'`` and ``'hascomplaint'`` (for example a DataFrame
        row passed by ``df.apply(..., axis=1)``, or a plain dict).
    rating_threshold : float, default 2
        Highest review rating that still counts as a warning sign.

    Returns
    -------
    int
        The number of warning signs, from 0 to 3.
    """
    complaint = row['hascomplaint']
    # NaN is truthy in Python, so a missing complaint flag has to be excluded
    # explicitly; otherwise it would be scored as if a complaint existed.
    has_complaint = (not pd.isna(complaint)) and bool(complaint)

    warning_signs = (
        row['highreturn'] == 1,
        # A NaN rating compares False here, so it adds no point.
        row['reviewrating'] <= rating_threshold,
        has_complaint,
    )
    return sum(1 for sign in warning_signs if sign)

# Score every row with calculate_risk and keep the result as a new column
df['riskscore'] = df.apply(calculate_risk, axis='columns')


In [26]:
# Order rows by risk score, highest first, and keep the first ten
top_10_risk = df.sort_values('riskscore', ascending=False).iloc[:10]
print(top_10_risk.loc[:, ['defective', 'highreturn', 'reviewrating', 'hascomplaint', 'verifiedpurchase', 'riskscore']])


      defective  highreturn  reviewrating  hascomplaint  verifiedpurchase  \
9060          1           1           1.9          True              True   
1024          1           1           1.5          True              True   
7115          1           1           2.0          True              True   
1220          1           1           1.6          True              True   
2701          1           1           1.9          True              True   
9584          1           1           1.0          True              True   
540           1           1           1.1          True              True   
240           1           1           1.6          True              True   
7784          1           1           2.0          True              True   
1121          1           1           1.8          True              True   

      riskscore  
9060          3  
1024          3  
7115          3  
1220          3  
2701          3  
9584          3  
540           3  
240     

## Decision Making


In [27]:
# Example product: a hypothetical record carrying the fields that
# calculate_risk reads ('verifiedpurchase' is included but not scored)
example_product = {
    'highreturn': 1,
    'reviewrating': 1.5,
    'hascomplaint': True,
    'verifiedpurchase': False
}

# Calculate risk score with the shared scoring function instead of repeating
# its rules inline, so this example cannot drift from df['riskscore']
risk_score = calculate_risk(example_product)

print("Risk Score for Example Product:", risk_score)
# Two or more warning signs trigger the escalation recommendation
if risk_score >= 2:
    print("Recommend: QA check or consider recall")
else:
    print("Recommend: Monitor, no recall needed yet")


Risk Score for Example Product: 3
Recommend: QA check or consider recall


### Additional data that would help:
- Reason for return (e.g., size issue, broken, changed mind)
- Product category/type
- Supplier/manufacturer info
- Delivery damage reports
- Time since purchase
- Customer sentiment in reviews (via NLP)
