In [23]:
import os
import pandas as pd
import numpy as np
import json
from collections import Counter
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from textblob import TextBlob
import joblib

In [24]:
class ReviewDataAnalysis:
    """
    Descriptive statistics and a rating-prediction model for a drug review CSV.

    The loaded table is read by the methods below through these columns:
    'date', 'rating', 'usefulCount', 'drugName', 'condition',
    'Product Class' and 'review'.

    Every public method returns a single-key dict so that `analyze_all`
    can merge the sections into one report.
    """

    def __init__(self, data_path):
        """
        Load the review table and parse its 'date' column.

        :param data_path: path of the CSV file to read
        """
        self.data_path = data_path
        self.data = pd.read_csv(data_path)
        self.data['date'] = pd.to_datetime(self.data['date'], format='%d-%b-%y')

    @staticmethod
    def _to_native(value):
        """
        Convert a NumPy scalar into the matching built-in Python value.

        `Series.min()` / `Series.max()` on integer columns return NumPy integer
        scalars, which `json.dump` rejects; anything else is returned unchanged.
        """
        return value.item() if isinstance(value, np.generic) else value

    def calculate_numeric_stats(self):
        """
        Numeric data distribution analysis

        :return: {'numeric_distribution': {column: stats}} for 'rating' and
                 'usefulCount'; 'outliers' counts the rows strictly above the
                 99th percentile of the column
        """
        df = self.data
        result = {}

        for col in ('rating', 'usefulCount'):
            series = df[col]
            upper_bound = series.quantile(0.99)
            stats = {
                'mean': round(series.mean(), 2),
                'median': series.median(),
                'std': round(series.std(), 2),
                'min': series.min(),
                'max': series.max(),
                'skewness': round(series.skew(), 2),
                'outliers': int((series > upper_bound).sum())
            }
            # Rounding happens before the conversion so the numbers are the
            # same as before; only their type becomes JSON friendly.
            result[col] = {key: self._to_native(val) for key, val in stats.items()}

        return {'numeric_distribution': result}

    def analyze_categorical_data(self):
        """
        Categorical data distribution analysis

        For 'Product Class' the 'Else' category is ignored, both in the unique
        count and in the value counts. Columns with fewer than 10 distinct
        values list every value, the others only their 5 most frequent ones.

        :return: {'categorical_distribution': {column: stats}}
        """
        df = self.data
        result = {}

        for col in ('drugName', 'condition', 'Product Class'):
            series = df[col]
            suffix = ''
            if col == 'Product Class':
                # ignore 'Else' category
                series = series[series != 'Else']
                suffix = '_except_else'

            unique_values = series.nunique()
            counts = series.value_counts()
            stats = {'unique_count': unique_values}

            if unique_values < 10:
                stats[f'all_values{suffix}'] = counts.to_dict()
            else:
                stats[f'top_5{suffix}'] = counts.head(5).to_dict()

            result[col] = stats

        return {'categorical_distribution': result}

    def process_time_series(self):
        """
        Time distribution

        :return: {'temporal_distribution': {...}} with the covered date span
                 and the number of records per year and per month name
        """
        dates = self.data['date']
        result = {
            'time_span': f"{dates.min().date()} ~ {dates.max().date()}",
            'records_by_year': dates.dt.year.value_counts().to_dict(),
            'records_by_month': dates.dt.month_name().value_counts().to_dict()
        }
        return {'temporal_distribution': result}

    def _calculate_drug_rating_stats(self):
        """
        Base drug rating statistics

        :return: {'drug_rating_stats': {...}} holding a global summary and the
                 rating statistics of the 10 drugs with the most reviews
        """
        stats_df = (
            self.data.groupby('drugName')['rating']
            .agg(['mean', 'count', 'median'])
            .reset_index()
            .rename(columns={'mean': 'avg_rating', 'count': 'review_count', 'median': 'median_rating'})
        )

        top_10_drugs = stats_df.sort_values(by='review_count', ascending=False).head(10)

        result = {
            'global_summary': {
                'highest_avg': round(stats_df['avg_rating'].max(), 2),
                'lowest_avg': round(stats_df['avg_rating'].min(), 2),
                'total_drugs': len(stats_df),
                'total_reviews': int(stats_df['review_count'].sum())
            },
            'drug_rating_count_top10': top_10_drugs.set_index('drugName').to_dict('index')
        }
        return {'drug_rating_stats': result}

    def _analyze_usefulcount_correlation(self):
        """
        UsefulCount-rating correlation analysis

        :return: {'usefulcount_analysis': {...}} with the correlation between
                 'rating' and 'usefulCount', the mean usefulCount per rating
                 segment and the number of extreme rating/usefulCount cases
        """
        df = self.data
        corr_value = round(df['rating'].corr(df['usefulCount']), 3)

        bins = [0, 5, 7, 10]
        labels = ['0-5', '5-7', '7-10']

        # include_lowest=True closes the first interval on the left so that a
        # rating of exactly 0 lands in '0-5' instead of being left unassigned.
        rating_range = pd.cut(df['rating'], bins, labels=labels, include_lowest=True)

        # Grouping by the separate Series avoids adding a helper column to
        # (a copy of) the full table.
        segment_stats = (
            df['usefulCount']
            .groupby(rating_range, observed=True)
            .agg(['mean', 'count'])
            .rename(columns={'mean': 'avg_usefulCount', 'count': 'record_count'})
        )

        segment_dict = {
            str(label): {
                'avg_usefulCount': float(row['avg_usefulCount']),
                'record_count': int(row['record_count'])
            }
            for label, row in segment_stats.iterrows()
        }

        high_rating_low_useful = (df['rating'] >= 8) & (df['usefulCount'] <= 1)
        low_rating_high_useful = (df['rating'] <= 3) & (df['usefulCount'] >= 50)

        result = {
            'correlation_coefficient': corr_value,
            'rating_segment_analysis': segment_dict,
            'extreme_cases': {
                'high_rating_low_useful': int(high_rating_low_useful.sum()),
                'low_rating_high_useful': int(low_rating_high_useful.sum())
            }
        }
        return {'usefulcount_analysis': result}

    def _identify_extreme_rating_drugs(self, n=5):
        """
        Extreme rating drugs analysis

        :param n: number of drugs reported in each list
        :return: {'extreme_rating_drugs': {...}} with the n highest and lowest
                 mean ratings and the n largest rating standard deviations
        """
        ratings_by_drug = self.data.groupby('drugName')['rating']
        avg_ratings = ratings_by_drug.mean()

        result = {
            'top_high_rating': avg_ratings.nlargest(n).to_dict(),
            'top_low_rating': avg_ratings.nsmallest(n).to_dict(),
            # Drugs with the largest standard deviation in ratings
            'controversial_drugs': ratings_by_drug.std().nlargest(n).to_dict()
        }
        return {'extreme_rating_drugs': result}

    def perform_drug_rating_analysis(self):
        """
        Drug-rating analysis

        :return: {'drug_rating_analysis': {...}} merging the three drug-level
                 sections
        """
        result = {}
        result.update(self._calculate_drug_rating_stats())
        result.update(self._analyze_usefulcount_correlation())
        result.update(self._identify_extreme_rating_drugs())
        return {'drug_rating_analysis': result}

    def _calculate_condition_frequency(self):
        """
        High frequency condition analysis

        :return: {'condition_frequency': {condition: stats}} for the 10 most
                 frequent conditions; rows without a condition are ignored
        """
        df_clean = self.data.dropna(subset=['condition'])
        condition_counts = df_clean['condition'].value_counts()

        result = {}
        for condition, count in condition_counts.head(10).items():
            sub_df = df_clean[df_clean['condition'] == condition]
            drug_names = sub_df['drugName']
            result[condition] = {
                'total_reviews': int(count),
                'avg_rating': round(sub_df['rating'].mean(), 2),
                'related_drugs_count': drug_names.nunique(),
                'top_drugs': drug_names.value_counts().head(3).to_dict()
            }
        return {'condition_frequency': result}

    def _analyze_condition_associations(self):
        """
        Condition associations analysis

        Two conditions are counted as associated once for every drug that has
        reviews for both of them.

        :return: {'condition_associations': {...}} with the 15 most common
                 condition pairs and the number of drugs reviewed for more
                 than one condition
        """
        df_clean = self.data.dropna(subset=['condition'])
        drug_conditions = df_clean.groupby('drugName')['condition'].unique()

        # co-occurrence analysis; sorting makes (a, b) and (b, a) the same key
        co_occurrence = Counter(
            pair
            for conditions in drug_conditions
            for pair in combinations(sorted(conditions), 2)
        )

        # top 15 pairs
        most_common_pairs = {
            f"{first} & {second}": count
            for (first, second), count in co_occurrence.most_common(15)
        }
        return {'condition_associations': {
            'most_common_pairs': most_common_pairs,
            'multi_condition_drugs': sum(len(c) > 1 for c in drug_conditions)
        }}

    def _identify_condition_rating_extremes(self, min_reviews=10, n=5):
        """
        Condition rating extremes analysis

        :param min_reviews: conditions with fewer rated reviews are skipped
        :param n: number of conditions reported in each list
        :return: {'condition_rating_extremes': {...}} with the highest rated,
                 lowest rated and highest-deviation conditions
        """
        df_clean = self.data.dropna(subset=['condition'])
        grouped = df_clean.groupby('condition')['rating'].agg(['mean', 'count', 'std'])
        eligible = grouped[grouped['count'] >= min_reviews]

        return {
            'condition_rating_extremes': {
                'highest_rated': eligible.nlargest(n, 'mean').to_dict('index'),
                'lowest_rated': eligible.nsmallest(n, 'mean').to_dict('index'),
                'most_controversial': eligible.nlargest(n, 'std').to_dict('index')
            }
        }

    def perform_condition_analysis(self):
        """
        Condition analysis

        :return: {'condition_analysis': {...}} merging the three
                 condition-level sections
        """
        result = {}
        result.update(self._calculate_condition_frequency())
        result.update(self._analyze_condition_associations())
        result.update(self._identify_condition_rating_extremes())
        return {'condition_analysis': result}

    def _preprocess_rating_data(self):
        """
        Preprocess the rating data

        :return: a new DataFrame without rows that miss 'condition',
                 'Product Class' or 'rating' and without the 'Else' product
                 class; `self.data` is left untouched
        """
        # drop rows with missing values in the columns the model relies on
        df_clean = self.data.dropna(subset=['condition', 'Product Class', 'rating'])
        # filter Else
        return df_clean[df_clean['Product Class'] != 'Else']

    def _extract_temporal_features(self, df):
        """
        Time series feature extraction

        :param df: DataFrame with a 'date' column
        :return: DataFrame with the columns 'year' and 'month', indexed like df
        """
        dates = pd.to_datetime(df['date'])
        return pd.DataFrame({'year': dates.dt.year, 'month': dates.dt.month}, index=df.index)

    def _calculate_sentiment_scores(self, df):
        """
        Sentiment score calculation using TextBlob

        :param df: DataFrame with a 'review' column
        :return: single-column DataFrame 'sentiment' holding the TextBlob
                 polarity of every review, indexed like df
        """
        polarity = df['review'].apply(
            lambda text: TextBlob(str(text)).sentiment.polarity
        )
        return polarity.to_frame('sentiment')

    @staticmethod
    def _encode_top_categories(series, prefix, top_n):
        """
        One-hot encode only the `top_n` most frequent values of a column.

        :param series: categorical column to encode
        :param prefix: column name prefix, e.g. 'drug' gives 'drug_<value>'
        :param top_n: number of most frequent values that get an indicator
        :return: DataFrame of 0.0/1.0 indicators indexed like `series`
        """
        top_values = series.value_counts().head(top_n).index
        return pd.DataFrame(
            {f"{prefix}_{value}": (series == value).astype(float) for value in top_values},
            index=series.index
        )

    def _build_feature_matrix(self, df, top_drugs=10, top_conditions=5):
        """
        construct feature matrix for rating prediction

        The drug and condition indicators are now selected by frequency.
        Encoding every category and slicing the first columns kept the
        alphabetically first categories (encoder categories are sorted by
        value) and materialised one dense column per distinct value.

        :param df: preprocessed DataFrame
        :param top_drugs: number of most frequent drugs to encode
        :param top_conditions: number of most frequent conditions to encode
        :return: feature matrix
        """
        # base features
        base_features = pd.DataFrame({
            'usefulCount': df['usefulCount'],
            'review_length': df['review'].str.len()
        }, index=df.index)

        # merge all features
        return pd.concat([
            base_features,
            self._extract_temporal_features(df),
            self._calculate_sentiment_scores(df),
            self._encode_top_categories(df['drugName'], 'drug', top_drugs),  # top 10 frequent drugs
            self._encode_top_categories(df['condition'], 'cond', top_conditions)  # top 5 frequent conditions
        ], axis=1)

    def _train_rating_prediction_model(self, X, y, model_dir='analysis_result/Review-data', random_state=42):
        """
        Train a Random Forest model for rating prediction

        The fitted model and the list of feature columns are written to
        `model_dir` ('rating_prediction_model.pkl', 'feature_columns.json').

        :param X: feature matrix
        :param y: target ratings
        :param model_dir: output directory, created when it does not exist
        :param random_state: seed shared by the train/test split and the
                             forest so the reported metrics are reproducible
        :return: dict with 'model_object', 'performance' (r2_score, mse on the
                 20% hold-out split) and 'feature_importance'
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=random_state)
        model.fit(X_train, y_train)

        # save the model; the directory may not exist yet on a fresh run
        os.makedirs(model_dir, exist_ok=True)
        joblib.dump(model, f'{model_dir}/rating_prediction_model.pkl')
        with open(f'{model_dir}/feature_columns.json', 'w') as f:
            json.dump({'feature_columns': list(X.columns)}, f)

        preds = model.predict(X_test)
        return {
            'model_object': model,
            'performance': {
                'r2_score': round(r2_score(y_test, preds), 3),
                'mse': round(mean_squared_error(y_test, preds), 2)
            },
            'feature_importance': dict(zip(
                X.columns,
                model.feature_importances_.round(3)
            ))
        }

    def perform_rating_prediction_analysis(self):
        """
        Rating prediction analysis

        :return: {'rating_prediction': {...}} with sample counts, hold-out
                 performance and the feature importances in descending order
        """
        df_clean = self._preprocess_rating_data()
        X = self._build_feature_matrix(df_clean)
        y = df_clean['rating']

        # filter features
        X_filtered = X.loc[:, X.columns.isin([
            'usefulCount', 'review_length', 'year', 'month', 'sentiment'
        ])]
        model_results = self._train_rating_prediction_model(X_filtered, y)
        ranked_features = sorted(
            model_results['feature_importance'].items(),
            key=lambda item: item[1],
            reverse=True
        )
        return {'rating_prediction': {
            'data_stats': {
                'original_samples': len(self.data),
                'cleaned_samples': len(df_clean),
                'feature_dimension': X_filtered.shape[1]
            },
            'model_performance': model_results['performance'],
            'top_features': dict(ranked_features)
        }}

    def analyze_all(self):
        """
        Run every analysis section and merge the results into one dict.

        Note that the rating prediction section also writes the trained model
        and its feature list to disk.
        """
        result = {}
        result.update(self.calculate_numeric_stats())
        result.update(self.analyze_categorical_data())
        result.update(self.process_time_series())
        result.update(self.perform_drug_rating_analysis())
        result.update(self.perform_condition_analysis())
        result.update(self.perform_rating_prediction_analysis())

        return result



In [25]:
OUTPUT_DIR = 'analysis_result/Review-data'

# analyze_all() already writes the trained model and its feature list into
# OUTPUT_DIR, so the directory has to exist before the analysis starts and
# not only before the JSON report is dumped.
os.makedirs(OUTPUT_DIR, exist_ok=True)

review_analysis = ReviewDataAnalysis('../data/drugsComTrain_raw_addclass.csv')
review_analysis_result = review_analysis.analyze_all()


def _json_default(value):
    """Fallback for json.dump: turn NumPy scalars into built-in Python values."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


with open(f'{OUTPUT_DIR}/review_analysis_result.json', 'w') as f:
    # `default` is only consulted for values the encoder cannot handle itself,
    # so an already serialisable report is written exactly as before.
    json.dump(review_analysis_result, f, indent=4, default=_json_default)