# Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

# 1. Import dataframe with top 10 restaurants

In [6]:
# Read the cleaned review table, then display its shape and column index.
reviews_path = './csv/filtered_reviews_clean.csv'
df = pd.read_csv(reviews_path)
(df.shape, df.columns)

((570443, 3), Index(['business_id', 'stars', 'clean_text'], dtype='object'))

### 1.1. Merge dataset with res_id_lookup.csv

In [7]:
# Attach the restaurant name to every review via the lookup table.
# merge() defaults to an inner join, so reviews whose business_id is not in
# the lookup file are dropped here (this is what shrinks the row count).
temp = pd.read_csv('./csv/res_id_lookup.csv')

# This cell overwrites `df`, so make it safe to re-run: without dropping a
# previously merged 'name' column, a second run would yield name_x / name_y
# and the later cells that read business_df['name'] would fail.
df = df.drop(columns=['name'], errors='ignore').merge(temp, on='business_id')
df.shape, df.columns

((52968, 4),
 Index(['business_id', 'stars', 'clean_text', 'name'], dtype='object'))

### 1.2. Load key compliments and complaints

In [8]:
# Read the two keyword tables in one pass: compliments first, then complaints.
cumplidos, quejas = (
    pd.read_csv(csv_path)
    for csv_path in ('./csv/res_compliments.csv', './csv/res_complaints.csv')
)

# 2. One regression per restaurant

In [10]:
# Load the per-restaurant keyword tables. Columns read below: 'business_id'
# plus 'compliments' (cumplidos) / 'complaints' (quejas).
cumplidos = pd.read_csv('./csv/res_compliments.csv')
quejas = pd.read_csv('./csv/res_complaints.csv')

# For each restaurant, regress the star rating on 0/1 indicators of whether a
# review's text contains each of that restaurant's keywords, then write the
# coefficients to ./csv/regressions/<name>_coeffs.csv.
# R2 is computed on the same rows the model was fitted on (in-sample).
for counter, business_id in enumerate(df['business_id'].unique(), start=1):
    # Reviews of the current restaurant (read-only view; never mutated below)
    business_df = df[df['business_id'] == business_id]
    name = business_df['name'].iloc[0]

    # Keywords for this restaurant: compliments first, then complaints
    keywords = (
        cumplidos.loc[cumplidos['business_id'] == business_id, 'compliments'].tolist()
        + quejas.loc[quejas['business_id'] == business_id, 'complaints'].tolist()
    )

    # Build the indicator features in their own frame rather than as new
    # columns on a slice of `df`: that avoids chained-assignment on a slice and
    # prevents a keyword such as "stars" or "name" from overwriting a data
    # column. Keywords are matched as literal substrings (regex=False) and a
    # missing review text counts as "not mentioned" (na=False).
    text = business_df['clean_text']
    X = pd.DataFrame(
        {
            keyword: text.str.contains(keyword, regex=False, na=False).astype(int)
            for keyword in dict.fromkeys(keywords)  # de-duplicate, keep order
        },
        index=business_df.index,
    )

    # Remove keywords that never occur in this restaurant's reviews
    X = X.loc[:, (X != 0).any(axis=0)]
    y = business_df['stars']  # Target variable

    # LinearRegression cannot be fitted on zero features
    if X.shape[1] == 0:
        print(f'{counter}. Skipping restaurant {name}: no keyword appears in its reviews')
        continue

    # Run regression for this restaurant
    model = LinearRegression()
    model.fit(X, y)
    r_squared = round(model.score(X, y), 2)

    # Print state of the loop
    print(f'{counter}. Now running regression for restaurant {name}, R2:                                       {r_squared}')

    # Assemble the coefficient table, intercept row first
    df_coefficients = pd.concat(
        [
            pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [model.intercept_]}),
            pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_}),
        ],
        ignore_index=True,
    )
    df_coefficients['name'] = name
    df_coefficients['business_id'] = business_id

    # Keep only rows with -5 < coefficient < 5; this filter also applies to the
    # intercept row. NOTE(review): presumably meant to discard unstable
    # estimates -- confirm the threshold.
    df_coefficients = df_coefficients[df_coefficients['Coefficient'].abs() < 5]

    # Save the coefficients as a CSV file in csv/regressions, most negative first
    df_coefficients = df_coefficients.sort_values("Coefficient", ascending=True)
    df_coefficients.to_csv(f'./csv/regressions/{name}_coeffs.csv', index=False)


1. Now running regression for restaurant Luke, R2:                                       0.35
2. Now running regression for restaurant Gumbo Shop, R2:                                       0.12
3. Now running regression for restaurant Commander's Palace, R2:                                       0.34
4. Now running regression for restaurant Royal House, R2:                                       0.5
5. Now running regression for restaurant Felix's Restaurant & Oyster Bar, R2:                                       0.44
6. Now running regression for restaurant Cochon, R2:                                       0.36
7. Now running regression for restaurant Mother's Restaurant, R2:                                       0.43
8. Now running regression for restaurant Oceana Grill, R2:                                       0.4
9. Now running regression for restaurant Acme Oyster House, R2:                                       0.37
10. Now running regression for restaurant Ruby Slipper - New Orl

In [12]:
import pandas as pd

# Load the per-restaurant keyword tables. Columns read below: 'business_id'
# plus 'compliments' (cumplidos) / 'complaints' (quejas).
cumplidos = pd.read_csv('./csv/res_compliments.csv')
quejas = pd.read_csv('./csv/res_complaints.csv')

# Per-restaurant coefficient tables are collected here and concatenated once
# after the loop, instead of growing a DataFrame with pd.concat on every
# iteration (which re-copies all accumulated rows each time).
coefficient_frames = []

# For each restaurant, regress the star rating on 0/1 indicators of whether a
# review's text contains each of that restaurant's keywords.
# R2 is computed on the same rows the model was fitted on (in-sample).
for business_id in df['business_id'].unique():
    # Reviews of the current restaurant (read-only view; never mutated below)
    business_df = df[df['business_id'] == business_id]
    name = business_df['name'].iloc[0]

    # Keywords for this restaurant: compliments first, then complaints
    keywords = (
        cumplidos.loc[cumplidos['business_id'] == business_id, 'compliments'].tolist()
        + quejas.loc[quejas['business_id'] == business_id, 'complaints'].tolist()
    )

    # Build the indicator features in their own frame rather than as new
    # columns on a slice of `df`: that avoids chained-assignment on a slice and
    # prevents a keyword such as "stars" or "name" from overwriting a data
    # column. Keywords are matched as literal substrings (regex=False) and a
    # missing review text counts as "not mentioned" (na=False).
    text = business_df['clean_text']
    X = pd.DataFrame(
        {
            keyword: text.str.contains(keyword, regex=False, na=False).astype(int)
            for keyword in dict.fromkeys(keywords)  # de-duplicate, keep order
        },
        index=business_df.index,
    )

    # Remove keywords that never occur in this restaurant's reviews
    X = X.loc[:, (X != 0).any(axis=0)]
    y = business_df['stars']  # Target variable

    # LinearRegression cannot be fitted on zero features
    if X.shape[1] == 0:
        print(f'Skipping restaurant {name}: no keyword appears in its reviews')
        continue

    # Run regression for this restaurant
    model = LinearRegression()
    model.fit(X, y)
    r_squared = round(model.score(X, y), 2)

    # Print state of the loop
    print(f'Now running regression for restaurant {name}, R2: {r_squared}')

    # Assemble the coefficient table, intercept row first
    df_coefficients = pd.concat(
        [
            pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [model.intercept_]}),
            pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_}),
        ],
        ignore_index=True,
    )
    df_coefficients['name'] = name
    df_coefficients['business_id'] = business_id

    # Keep only rows with -5 < coefficient < 5; this filter also applies to the
    # intercept row. NOTE(review): presumably meant to discard unstable
    # estimates -- confirm the threshold.
    df_coefficients = df_coefficients[df_coefficients['Coefficient'].abs() < 5]

    # Within each restaurant, order rows from most negative to most positive
    coefficient_frames.append(df_coefficients.sort_values("Coefficient", ascending=True))

# pd.concat raises on an empty list, so fall back to an empty frame
if coefficient_frames:
    concatenated_df = pd.concat(coefficient_frames, ignore_index=True)
else:
    concatenated_df = pd.DataFrame()

# Save the concatenated dataframe to a CSV file
concatenated_df.to_csv('./csv/regressions/concatenated_coeffs.csv', index=False)

Now running regression for restaurant Luke, R2: 0.35
Now running regression for restaurant Gumbo Shop, R2: 0.12
Now running regression for restaurant Commander's Palace, R2: 0.34
Now running regression for restaurant Royal House, R2: 0.5
Now running regression for restaurant Felix's Restaurant & Oyster Bar, R2: 0.44
Now running regression for restaurant Cochon, R2: 0.36
Now running regression for restaurant Mother's Restaurant, R2: 0.43
Now running regression for restaurant Oceana Grill, R2: 0.4
Now running regression for restaurant Acme Oyster House, R2: 0.37
Now running regression for restaurant Ruby Slipper - New Orleans, R2: 0.4
