In [3]:
import pandas as pd

# Read the restaurant review dataset (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='restaurant_reviews.csv')

# Preview the leading five rows as a quick sanity check on columns and values
print(df.head(n=5))



   review_id                                   review_text  rating  \
0          1    The pizza was amazing and arrived quickly!       5   
1          2     Delivery was slow, and the food was cold.       2   
2          3  Good food, okay service, average experience.       3   
3          4   Loved the pasta! Definitely ordering again.       5   
4          5     Horrible service, will never order again.       1   

   delivery_time  customer_loyalty_score  total_bill_amount sentiment  
0             25                      85                500  Positive  
1             45                      40                450  Negative  
2             35                      65                600   Neutral  
3             20                      90                550  Positive  
4             50                      30                400  Negative  


In [4]:
import statsmodels.api as sm
import scipy.stats as stats

# --- Regression Analysis: Predict Loyalty based on Delivery Time and Bill Amount ---
# Predictors (delivery time, bill amount) and response (loyalty score) for the OLS fit.
X = df[['delivery_time', 'total_bill_amount']]
y = df['customer_loyalty_score']

# sm.OLS does not include an intercept by itself; prepend a constant column.
X = sm.add_constant(X)

# Fit ordinary least squares and print the full regression table.
model = sm.OLS(y, X).fit()
print(model.summary())

# --- Hypothesis Testing: Does fast delivery (<30 mins) affect rating? ---
FAST_DELIVERY_MINUTES = 30  # deliveries strictly below this many minutes count as "fast"
ALPHA = 0.05                # significance level for the two-sided test

# Use explicit comparisons for both groups so rows with a missing delivery_time
# fall into neither group (NaN compares False on both sides).
fast_delivery = df.loc[df['delivery_time'] < FAST_DELIVERY_MINUTES, 'rating']
slow_delivery = df.loc[df['delivery_time'] >= FAST_DELIVERY_MINUTES, 'rating']

# Independent two-sample t-test on the ratings of the two groups.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance Student's t-test);
# consider equal_var=False (Welch) if the groups' variances are not comparable.
t_stat, p_value = stats.ttest_ind(fast_delivery, slow_delivery)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# A NaN p-value means the test could not be computed; `NaN < ALPHA` is False, so
# without this guard an undefined test would be reported as "no significant impact".
if pd.isna(p_value):
    print(
        "⚠️ T-test result is undefined (NaN); no conclusion drawn. "
        f"Group sizes: fast={len(fast_delivery)}, slow={len(slow_delivery)}."
    )
elif p_value < ALPHA:
    print("✅ Fast delivery significantly impacts ratings!")
else:
    print("❌ No significant impact of delivery time on ratings.")


                              OLS Regression Results                              
Dep. Variable:     customer_loyalty_score   R-squared:                       0.966
Model:                                OLS   Adj. R-squared:                  0.956
Method:                     Least Squares   F-statistic:                     97.98
Date:                    Mon, 07 Apr 2025   Prob (F-statistic):           7.62e-06
Time:                            05:43:42   Log-Likelihood:                -28.285
No. Observations:                      10   AIC:                             62.57
Df Residuals:                           7   BIC:                             63.48
Df Model:                               2                                         
Covariance Type:                nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

  return hypotest_fun_in(*args, **kwds)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Feature and label
X = df['review_text']
y = df['sentiment']

# Split the raw text BEFORE vectorizing so held-out reviews never influence the
# TF-IDF vocabulary or IDF weights (fitting on the full corpus leaks test data).
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary from the training text only, then apply it to the test text.
vectorizer = TfidfVectorizer(max_features=100)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Whole-corpus matrix kept under its original name in case a later cell uses it;
# it is built with the train-only vocabulary and is not used for evaluation here.
X_vectorized = vectorizer.transform(X)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
# zero_division=0 reports 0.00 for classes that were never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 33.33%
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         1
     Neutral       0.00      0.00      0.00         1
    Positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.11      0.33      0.17         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Create a new feature: True when the delivery took strictly less than 30 minutes
df['is_fast_delivery'] = df['delivery_time'] < 30

# Group by fast delivery.
# Named aggregation yields flat, single-level column names. The previous
# dict-with-list form produced MultiIndex columns, which to_csv writes as two
# header rows that spreadsheet/BI tools misread.
summary = (
    df.groupby('is_fast_delivery')
    .agg(
        rating_mean=('rating', 'mean'),
        rating_count=('rating', 'count'),
        total_bill_amount_mean=('total_bill_amount', 'mean'),
        customer_loyalty_score_mean=('customer_loyalty_score', 'mean'),
    )
    .reset_index()
)

# Save it for Tableau (one header row, one row per delivery-speed group)
summary.to_csv('tableau_summary.csv', index=False)

print("✅ Tableau data prepared and saved as 'tableau_summary.csv'.")


✅ Tableau data prepared and saved as 'tableau_summary.csv'.
