In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr
from scipy import stats
from IPython.display import display, HTML
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from datetime import datetime
from statsmodels.formula.api import ols
from statsmodels.stats.api import anova_lm
from IPython.display import display, HTML
# Pandas display options: never truncate rows, columns, line width or cell
# contents; keep wide frames on a single line; show floats with 3 decimals.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None
pd.options.display.expand_frame_repr = False
pd.options.display.precision = 3

# Notebook CSS overrides, emitted as three separate <style> outputs:
# full-width container, no fixed-height scrolling output, no wrapping in <pre>.
display(
    HTML("<style>.container { width:100% !important; }</style>"),
    HTML("<style>div.output_scroll { height: unset; }</style>"),
    HTML("<style>pre { white-space: pre !important; }</style>"),
)


In [None]:
# Load the Trustpilot review export.
# The pandas display options (max_rows, max_columns, width, max_colwidth) are
# already set once in the setup cell at the top of the notebook, so they are
# not repeated here.
# engine='python' selects pandas' Python parser instead of the default C parser.
df_trustpilot = pd.read_csv('data/data_trustpilot_2.csv', engine='python')

# Preview the first rows to confirm the file was parsed as expected.
display(df_trustpilot.head())



In [None]:
# First check: parse the posting date, then inspect the columns and dtypes.
df_trustpilot['local_date_posted'] = pd.to_datetime(df_trustpilot['local_date_posted'])

print(df_trustpilot.columns)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
df_trustpilot.info()

STAGE 1 - HEATMAP

In [None]:
# Partition the columns by dtype family so each group can be analysed on its
# own: numeric columns, object (text-like) columns and datetime columns.
numeric_variables = df_trustpilot.select_dtypes(include=['int', 'float'])
categorical_variables = df_trustpilot.select_dtypes(include='object')
datetime_variables = df_trustpilot.select_dtypes(include='datetime64')

In [None]:
# Pairwise correlation matrix of the numeric columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_variables.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Numeric Variables')
plt.show()

# Author's reading of the heatmap: the only strong correlation among the
# numeric variables is the one between rating and verification.

PEARSON AND SPEARMAN

In [None]:
#Pearson and Spearman

def _correlation_strength(coefficient):
    """Return a verbal strength label for a correlation coefficient, based on its magnitude."""
    magnitude = abs(coefficient)
    if magnitude < 0.1:
        return "extremely weak/negligible"
    if magnitude < 0.3:
        return "weak"
    if magnitude < 0.5:
        return "moderate"
    return "strong"


def _describe_coefficient(label, coefficient, p_value):
    """Build the report lines (coefficient, reading, p-value, significance) for one test."""
    strength = _correlation_strength(coefficient)
    lines = [f"{label} coefficient: {coefficient}"]

    if coefficient == 0:
        lines.append("A coefficient of exactly zero indicates no correlation.")
    else:
        # Word the sentence from the actual sign instead of assuming a negative one.
        if coefficient < 0:
            sign_note = "Negative sign indicates an inverse relationship"
            direction = "negative"
        else:
            sign_note = "Positive sign indicates a direct relationship"
            direction = "positive"
        article = "an" if strength[0] in "aeiou" else "a"
        lines.append(f"{sign_note} and, as the value is {strength}, "
                     f"this indicates {article} {strength} {direction} correlation.")

    lines.append(f"\n{label} p_value: {p_value}")
    if p_value < 0.05:
        lines.append("Value is smaller than the common significance level of 0.05. "
                     f"This indicates that the {strength} correlation is statistically significant.")
    else:
        lines.append("Value is larger than the common significance level of 0.05. "
                     "This indicates that the correlation is not statistically significant.")
    return lines


def analyze_correlation_with_rating(column_name):
    """Report the Pearson and Spearman correlation between ``rating`` and one column.

    Reads the module-level ``df_trustpilot`` dataframe. Rows where either
    ``rating`` or the column is missing are excluded before the tests are run.

    Parameters
    ----------
    column_name : str
        Name of the column of ``df_trustpilot`` to correlate with ``rating``.

    Returns
    -------
    str
        A multi-line text report, or a one-line message starting with
        ``"Error:"`` when the column does not exist, fewer than two complete
        observations remain, or one of the two variables is constant (the
        correlation is undefined in that case).
    """
    if column_name not in df_trustpilot.columns:
        return f"Error: Column '{column_name}' not found in dataframe"

    # Pairwise-complete observations: a single NaN would otherwise turn both
    # coefficients into NaN.
    rating = df_trustpilot['rating']
    values = df_trustpilot[column_name]
    complete = rating.notna() & values.notna()
    rating, values = rating[complete], values[complete]

    if len(rating) < 2:
        return f"Error: Column '{column_name}' has fewer than two complete observations"
    if rating.nunique() < 2 or values.nunique() < 2:
        # A constant input yields a NaN coefficient, which the threshold
        # comparisons below would wrongly label as "strong".
        return (f"Error: Correlation with '{column_name}' is undefined because "
                "one of the variables is constant")

    # Calculate correlations
    pearson_coeff, pearson_p = pearsonr(rating, values)
    spearman_coeff, spearman_p = spearmanr(rating, values)

    # Header and hypotheses
    output = [
        f"Correlation Analysis: rating vs {column_name}",
        "=" * 50,
        "Hypothesis:",
        "H0: The two variables are uncorrelated",
        "H1: The two variables are correlated",
        "\nResults:",
        "-" * 50,
    ]

    # Each test is described from its own coefficient (strength and direction).
    output.extend(_describe_coefficient("Pearson", pearson_coeff, pearson_p))
    output.append("")  # blank line between the two tests
    output.extend(_describe_coefficient("Spearman", spearman_coeff, spearman_p))

    # Conclusion: H0 is rejected when either test is significant.
    significant = pearson_p < 0.05 or spearman_p < 0.05
    output.append("\nConclusion:")
    output.append("-" * 50)
    if significant:
        output.append("H0 is rejected")
        output.append("H1 is confirmed")
    else:
        output.append("Failed to reject H0")

    # Additional insights
    if abs(abs(spearman_coeff) - abs(pearson_coeff)) > 0.1:
        output.append(f"\nThe difference between Pearson ({pearson_coeff:.3f}) and "
                      f"Spearman ({spearman_coeff:.3f}) suggests a non-linear relationship")

    # Practical interpretation: only claim a trend when a test supports it.
    if significant:
        output.append(f"\nAs {column_name} increases, ratings tend to "
                      f"{'decrease' if pearson_coeff < 0 else 'increase'}")
        output.append("While statistically significant, the relationship is "
                      f"{_correlation_strength(pearson_coeff)} (Pearson) / "
                      f"{_correlation_strength(spearman_coeff)} (Spearman)")
        output.append("Keep for model training of a rating model")
    else:
        output.append("\nNo statistically significant relationship with rating was found")
        output.append("Consider dropping for model training of a rating model")

    return "\n".join(output)

# Single-column example.
print(analyze_correlation_with_rating('days_between_experience_and_post'))

# Every int64/float64 column except the target itself.
numeric_columns = [
    col
    for col in df_trustpilot.select_dtypes(include=['int64', 'float64']).columns
    if col != 'rating'
]

print("Starting correlation analysis for all numeric variables...")
print("=" * 80)

# One preformatted report per column, each followed by a horizontal rule.
for column in numeric_columns:
    display(
        HTML(f"<pre>{analyze_correlation_with_rating(column)}</pre>"),
        HTML("<hr>"),
    )


ANOVA

In [None]:
#ANOVA

def analyze_anova_with_rating(df):
    """Screen numeric features against ``rating`` and return the significant ones.

    For every int64/float64 column other than ``rating`` the formula
    ``rating ~ <column>`` is fitted with ``ols`` and the p-value of the first
    row of the ``anova_lm`` table is read. Features with p <= 0.05 are kept.
    A header, any per-feature errors and (when at least one feature is kept)
    a table sorted by p-value are printed.

    Note: a later cell of this notebook defines another function with the same
    name that returns ``(results, feat_select)``; once that cell has run, it
    replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``rating`` column and the candidate features.

    Returns
    -------
    list
        Names of the selected features, in column order.
    """
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    candidates = [name for name in numeric_cols if name != 'rating']

    p_by_feature = {}
    significant = []

    print("ANOVA Analysis: Features vs Rating")
    print("=" * 50)

    for name in candidates:
        try:
            fitted = ols(f'rating ~ {name}', data=df).fit()
            p = anova_lm(fitted)['PR(>F)'].iloc[0]
        except Exception as err:
            # A failing feature is reported and skipped; the screen continues.
            print(f"Error analyzing {name}: {str(err)}")
        else:
            if p <= 0.05:
                significant.append(name)
                p_by_feature[name] = p

    if significant:
        summary = pd.DataFrame({
            'Feature': significant,
            'P-value': [p_by_feature[name] for name in significant],
        }).sort_values('P-value')

        print("\nSelected Features (p ≤ 0.05):")
        print(summary)

    return significant

# Usage: run the ANOVA screen on the full dataset; the returned list holds the
# names of the numeric features whose p-value is <= 0.05.
selected_features = analyze_anova_with_rating(df_trustpilot)

In [None]:
#ANOVA 

def analyze_anova_with_rating(df):
    """Report F-statistic, p-value and effect size of each numeric feature vs ``rating``.

    For every int64/float64 column other than ``rating`` the formula
    ``rating ~ <column>`` is fitted with ``ols`` and its ``anova_lm`` table is
    read. Eta-squared is computed as the first row's sum of squares divided by
    the sum of the first two rows. Features with p <= 0.05 are selected.
    The full text report is printed once at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``rating`` column and the candidate features.

    Returns
    -------
    tuple
        ``(results, feat_select)`` where ``results`` maps each successfully
        analysed feature to a dict with keys ``p_value``, ``f_stat`` and
        ``eta_squared``, and ``feat_select`` lists the selected feature names.
    """
    attributes = df.select_dtypes(include=['int64', 'float64']).columns
    attributes = [col for col in attributes if col != 'rating']

    results = {}
    feat_select = []
    output_text = []

    output_text.append("ANOVA Analysis: Features vs Rating")
    output_text.append("=" * 50)

    for feature in attributes:
        try:
            lm = ols('rating ~ {}'.format(feature), data=df).fit()
            table = anova_lm(lm)
            p_value = table['PR(>F)'].iloc[0]
            f_stat = table['F'].iloc[0]

            # Calculate effect size (eta-squared).
            # The table is indexed by term name, so rows are taken by position
            # with .iloc; plain [0] / [1] on a label-indexed Series is
            # deprecated positional access in pandas.
            sum_sq = table['sum_sq']
            ss_between = sum_sq.iloc[0]
            ss_total = ss_between + sum_sq.iloc[1]
            eta_squared = ss_between / ss_total

            # Store results
            results[feature] = {
                'p_value': p_value,
                'f_stat': f_stat,
                'eta_squared': eta_squared
            }

            # Interpret effect size
            if eta_squared < 0.01:
                effect_strength = "negligible"
            elif eta_squared < 0.06:
                effect_strength = "small"
            elif eta_squared < 0.14:
                effect_strength = "medium"
            else:
                effect_strength = "large"

            # Feature selection
            if p_value <= 0.05:
                feat_select.append(feature)

            # Output for each feature
            output_text.append(f"\nAnalysis for: {feature}")
            output_text.append("-" * 30)
            output_text.append(f"F-statistic: {f_stat:.4f}")
            output_text.append(f"P-value: {p_value:.4e}")
            output_text.append(f"Effect size (η²): {eta_squared:.4f}")
            output_text.append(f"Effect strength: {effect_strength}")
            output_text.append(f"Selected for model: {'Yes' if p_value <= 0.05 else 'No'}")

        except Exception as e:
            # A failing feature is recorded in the report and skipped so the
            # remaining features are still analysed.
            output_text.append(f"\nError analyzing {feature}: {str(e)}")

    # Summary
    output_text.append("\n" + "=" * 50)
    output_text.append("\nSummary:")
    output_text.append(f"Total features analyzed: {len(attributes)}")
    output_text.append(f"Features selected: {len(feat_select)}")
    output_text.append("\nSelected features:")
    for feat in feat_select:
        p_value = results[feat]['p_value']
        eta = results[feat]['eta_squared']
        output_text.append(f"- {feat}: p={p_value:.4e}, η²={eta:.4f}")

    print("\n".join(output_text))
    return results, feat_select

# Usage: `results` maps each analysed feature to its p_value, f_stat and
# eta_squared; `selected_features` lists the features with p <= 0.05.
# Note: this rebinds `selected_features`, which the earlier ANOVA cell also assigns.
results, selected_features = analyze_anova_with_rating(df_trustpilot)

LOGISTIC REGRESSION WITH RIB RATIO STUDY

In [None]:
#Logistic regression with rib ratio study


def analyze_variable_importance(column_name):
    """Assess how well one numeric column separates high from low ratings.

    Reads the module-level ``df_trustpilot`` dataframe. The target is binary:
    1 when ``rating >= 4``, otherwise 0. Rows where ``rating`` or the column is
    missing are excluded, so a missing rating is not counted as a low rating
    and the model is never fitted on NaN. A single-feature logistic regression
    is trained on an 80/20 split (``random_state=42``), and the ROC AUC,
    classification report and point-biserial correlation are reported.

    Parameters
    ----------
    column_name : str
        Name of the column of ``df_trustpilot`` to evaluate.

    Returns
    -------
    str
        A multi-line text report, or a one-line message starting with
        ``"Error:"`` when the column is missing, is not numeric, is constant,
        or the complete rows do not contain both rating classes.
    """
    if column_name not in df_trustpilot.columns:
        return f"Error: Column '{column_name}' not found in dataframe"

    rating = df_trustpilot['rating']
    feature = df_trustpilot[column_name]

    # The pandas dtype helpers also understand extension dtypes (nullable
    # integers, string dtype), which np.issubdtype cannot interpret.
    # Boolean columns are still treated as non-numeric.
    is_numeric = (pd.api.types.is_numeric_dtype(feature)
                  and not pd.api.types.is_bool_dtype(feature))
    if not is_numeric:
        return f"Error: Column '{column_name}' is not numeric. Please encode categorical variables first."

    # Keep only rows where both the rating and the feature are present.
    complete = rating.notna() & feature.notna()
    y = (rating[complete] >= 4).astype(int)
    feature = feature[complete]

    if y.nunique() < 2:
        return (f"Error: Column '{column_name}' cannot be analyzed because the complete rows "
                "do not contain both high and low ratings")
    if feature.nunique() < 2:
        # A constant feature has an undefined correlation (NaN), which the
        # threshold comparisons below would wrongly label as "strong".
        return f"Error: Column '{column_name}' is constant on the complete rows"

    # Single-feature design matrix of shape (n_samples, 1).
    X = feature.to_numpy(dtype=float).reshape(-1, 1)
    is_high = (y == 1).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit logistic regression
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Calculate correlation (point biserial for binary vs continuous)
    correlation, p_value = stats.pointbiserialr(y, X.ravel())

    output = []
    output.append(f"Variable Importance Analysis: {column_name} vs High/Low Rating")
    output.append("=" * 60)

    # Basic statistics
    output.append("\nBasic Statistics:")
    output.append(f"Mean value for low ratings: {X[~is_high].mean():.3f}")
    output.append(f"Mean value for high ratings: {X[is_high].mean():.3f}")
    output.append(f"Correlation coefficient: {correlation:.3f}")
    output.append(f"P-value: {p_value:.3e}")

    # Logistic Regression Results
    output.append("\nLogistic Regression Results:")
    output.append(f"Coefficient: {model.coef_[0][0]:.3f}")
    output.append(f"Intercept: {model.intercept_[0]:.3f}")
    output.append(f"ROC AUC Score: {roc_auc:.3f}")

    # Classification Report
    # zero_division=0 reports 0 (without a warning) for a class the model
    # never predicts, which is common with a single weak feature.
    output.append("\nClassification Report:")
    output.append(classification_report(y_test, y_pred, zero_division=0))

    # Effect interpretation
    output.append("\nInterpretation:")
    if abs(correlation) < 0.1:
        strength = "negligible"
    elif abs(correlation) < 0.3:
        strength = "weak"
    elif abs(correlation) < 0.5:
        strength = "moderate"
    else:
        strength = "strong"

    direction = "positive" if correlation > 0 else "negative"

    output.append(f"- The relationship is {strength} and {direction}")
    if p_value < 0.05:
        output.append("- The relationship is statistically significant")
        output.append(f"- As {column_name} increases, the likelihood of a high rating")
        output.append(f"  {'increases' if correlation > 0 else 'decreases'}")
    else:
        output.append("- The relationship is not statistically significant")

    output.append(f"- The model has an ROC AUC of {roc_auc:.3f}, indicating")
    if roc_auc < 0.6:
        output.append("  poor predictive power")
    elif roc_auc < 0.7:
        output.append("  fair predictive power")
    elif roc_auc < 0.8:
        output.append("  good predictive power")
    else:
        output.append("  excellent predictive power")

    output.append("\nKeep for model training: ")
    output.append("Yes" if (p_value < 0.05 and roc_auc > 0.6) else "Consider dropping")

    return "\n".join(output)

# Every int64/float64 column except the target itself.
numeric_columns = [
    col
    for col in df_trustpilot.select_dtypes(include=['int64', 'float64']).columns
    if col != 'rating'
]

print("Starting variable importance analysis...")
print("=" * 80)

# One preformatted report per column, each followed by a horizontal rule.
for column in numeric_columns:
    display(
        HTML(f"<pre>{analyze_variable_importance(column)}</pre>"),
        HTML("<hr>"),
    )