In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

class DataProcessor:
    """Load a CSV file and strip it down to the rows/columns used for analysis."""

    def __init__(self, file):
        """Store the location of the CSV file.

        Args:
            file: Path (or any other source accepted by ``pandas.read_csv``)
                of the CSV file to load.
        """
        self.file = file

    def load_data(self):
        """Read ``self.file`` with ``pandas.read_csv`` and return the DataFrame."""
        return pd.read_csv(self.file, low_memory=False)

    def clean_data(self, data):
        """Return a cleaned copy of ``data``; the input frame is not modified.

        Rows containing a missing value in *any* column are removed first
        (this includes the columns that are discarded right afterwards), then
        the columns that are not needed for the analysis are dropped.

        Args:
            data: DataFrame to clean.

        Returns:
            A new DataFrame without incomplete rows and without the
            'Unnamed: 0', 'review_title' and 'product_id' columns.
        """
        cleaned_data = data.dropna()
        unnecessary_columns = ['Unnamed: 0', 'review_title', 'product_id']
        # errors='ignore': a file that lacks one of these columns (for
        # example a CSV written without the index column) is still usable
        # instead of failing with a KeyError.
        return cleaned_data.drop(columns=unnecessary_columns, errors='ignore')
    

class DataAnalyzer:
    """Compute summary statistics for the columns of a DataFrame."""

    def __init__(self, data):
        """Store the DataFrame that every statistic is computed on.

        Args:
            data: DataFrame to analyze.
        """
        self.data = data

    def _require_column(self, column):
        """Return ``self.data[column]``, raising ValueError if it is absent."""
        if column not in self.data.columns:
            raise ValueError(f"No such column named '{column}' in the dataset.")
        return self.data[column]

    def get_average_of_column(self, column):
        """Return the mean of ``column``.

        Raises:
            ValueError: If ``column`` is not in the dataset.
        """
        return self._require_column(column).mean()

    def get_distribution_of_column(self, column):
        """Return the value distribution and spread of ``column``.

        Returns:
            A ``(distribution, std, var)`` tuple: the per-value counts sorted
            by value, followed by the results of ``np.std`` and ``np.var``
            for the column.

        Raises:
            ValueError: If ``column`` is not in the dataset.
        """
        column_values = self._require_column(column)
        distribution = column_values.value_counts().sort_index()
        std = np.std(column_values)
        var = np.var(column_values)
        return distribution, std, var

    def get_median_of_column(self, column):
        """Return the median of ``column``.

        Raises:
            ValueError: If ``column`` is not in the dataset.
        """
        return self._require_column(column).median()

    def get_mode_of_column(self, column):
        """Return the mode(s) of ``column`` as an array (ties yield several values).

        Raises:
            ValueError: If ``column`` is not in the dataset.
        """
        return self._require_column(column).mode().values

    def get_correlation(self, data=None):
        """Return the pairwise correlation matrix of the numeric columns.

        Args:
            data: Optional DataFrame to correlate. When omitted, the frame
                given to the constructor is used.

        Returns:
            A DataFrame with the correlation between every pair of
            numeric/boolean columns.
        """
        frame = self.data if data is None else data
        # Restrict to numeric/bool columns explicitly: recent pandas releases
        # no longer drop text columns silently in corr() and raise instead.
        numeric_columns = frame.select_dtypes(include=["number", "bool"])
        return numeric_columns.corr()
    
    
    
class DataVisualizer:
    """Draw charts from a DataFrame and save them as JPEG files under ``output/``."""

    def __init__(self, data):
        """Store the DataFrame that the charts are drawn from.

        Args:
            data: DataFrame providing the plotted columns.
        """
        self.data = data

    @staticmethod
    def _label_axes(column1, column2):
        """Label both axes and set an "x vs y" title, with underscores shown as spaces."""
        column1t = column1.replace("_", " ")
        column2t = column2.replace("_", " ")
        plt.xlabel(column1t)
        plt.ylabel(column2t)
        plt.title(f"{column1t} vs {column2t}")

    @staticmethod
    def _save_and_close(path):
        """Save the current figure to ``path`` and close it."""
        try:
            # savefig does not create missing directories by itself.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            plt.savefig(path)
        finally:
            # Always close, so a failed save cannot leak into the next chart.
            plt.close()

    def plot_line_chart(self, column1='total_feedback_count', column2='total_pos_feedback_count'):
        """Save a line chart of ``column2`` over ``column1`` to output/linechart.jpg."""
        sns.lineplot(x=column1, y=column2, data=self.data)
        self._label_axes(column1, column2)
        self._save_and_close("output/linechart.jpg")

    def plot_distribution(self, column1='rating', column2='total_feedback_count'):
        """Save a bar chart of ``column2`` per ``column1`` value to output/bar.jpg."""
        # NOTE(review): newer seaborn releases deprecate `ci` in favour of
        # `errorbar`; `ci=None` is kept so older installations keep working.
        sns.barplot(x=column1, y=column2, data=self.data, ci=None, palette="Paired")
        self._label_axes(column1, column2)
        self._save_and_close("output/bar.jpg")

    def plot_pie(self, column1='rating', column2='total_feedback_count'):
        """Save a pie chart of the ``column2`` totals per ``column1`` value to output/pie.jpg."""
        sums = self.data.groupby(self.data[column1])[column2].sum()
        # One colour per slice instead of a fixed five.
        colors = sns.color_palette('bright', n_colors=len(sums))
        plt.pie(sums, labels=sums.index, colors=colors, autopct='%.0f%%')
        column1t = column1.replace("_", " ")
        plt.title("Pie Chart " + column1t)
        self._save_and_close("output/pie.jpg")

    def plot_scatter(self, column1='total_feedback_count', column2='total_pos_feedback_count', hue=None):
        """Save a scatter plot of ``column2`` against ``column1`` to output/Scatterplot.jpg.

        Args:
            column1: Column plotted on the x axis.
            column2: Column plotted on the y axis.
            hue: Column used to colour the points; 'rating' when omitted.
        """
        hue_column = 'rating' if hue is None else hue
        sns.scatterplot(x=column1, y=column2, hue=hue_column, data=self.data, palette="bright")
        self._label_axes(column1, column2)
        self._save_and_close("output/Scatterplot.jpg")

        
        

processor = DataProcessor('data/reviews_1500_end.csv')
# Read the CSV only once: clean_data() builds a new frame, so the raw frame
# (data2) stays untouched and can be reused as-is.
data2 = processor.load_data()
data = processor.clean_data(data2)

analyzer = DataAnalyzer(data)

# Set `column` to the name of the column to analyze.
# Columns in the dataset: author_id - rating - is_recommended - helpfulness - total_feedback_count - total_neg_feedback_count - total_pos_feedback_count - submission_time - skin_tone - eye_color - skin_type - hair_color - product_id - product_name - brand_name - price_usd
# (product_id is removed by clean_data, so it is not available in `data`.)

column = "rating"

try:
    average = analyzer.get_average_of_column(column)
    distribution, std, var = analyzer.get_distribution_of_column(column)
    median = analyzer.get_median_of_column(column)
    mode = analyzer.get_mode_of_column(column)
    correlation = analyzer.get_correlation(data)

    print(f"""
Average: {average}\n
{column} distribution:\n{distribution}
\nStandard Deviation: {std}\n
Variance: {var}\n
Median: {median}\n
Mode: {mode}\n
Correlation: {correlation}""")

except ValueError as e:
    # Raised by the analyzer when `column` does not exist in the dataset.
    print(e)

# Render every chart with its default columns; files are saved under output/.
visualizer = DataVisualizer(data)
visualizer.plot_line_chart()
visualizer.plot_distribution()
visualizer.plot_pie()
visualizer.plot_scatter()



Average: 4.0830651268458915

rating distribution:
1     2182
2     1485
3     1716
4     2758
5    12987
Name: rating, dtype: int64

Standard Deviation: 1.3782678750528803

Variance: 1.8996223354027821

Median: 5.0

Mode: [5]

Correlation:                             rating  is_recommended  helpfulness  \
rating                    1.000000        0.891003     0.182018   
is_recommended            0.891003        1.000000     0.170456   
helpfulness               0.182018        0.170456     1.000000   
total_feedback_count     -0.136903       -0.123738    -0.000465   
total_neg_feedback_count -0.212435       -0.193956    -0.301772   
total_pos_feedback_count -0.076629       -0.068403     0.132293   
price_usd                 0.012657        0.009186    -0.063769   

                          total_feedback_count  total_neg_feedback_count  \
rating                               -0.136903                 -0.212435   
is_recommended                       -0.123738                 -0.1939