In [9]:
from pathlib import Path
import json
from datetime import datetime
from collections import namedtuple

import pandas as pd  # pip install pandas
import numpy as np  # pip install numpy
import matplotlib.pyplot as plt  # pip install matplotlib
from scipy import stats
from scipy.stats import pearsonr  # pip install scipy
from sympy.physics.units import years
from tqdm.notebook import tqdm  # pip install tqdm
import plotly.express as px
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw  # pip install fastdtw

In [10]:
# ToDo list
# Change average per month to a rolling average or average per week
# Create regression model for average scores for a single app
# Create a function to select specific time frames
# Create a function to compare regression results between multiple apps
# Create a function to select apps for analysis

df_lichess = pd.read_json('../DATAR/release_related/all_reviews/lichess-org-_-lichobile.json')
df_proton= pd.read_json('../DATAR/release_related/all_reviews/ProtonVPN-_-android-app.json')

In [11]:
def preprocess_plotting(df: pd.DataFrame):
    """
    Takes a review dataframe and return a preprocessed copy for plotting
    """
    df_scores_time = df[['score', 'at']].copy()  # Create a copy of the DataFrame to avoid view issues
    # Adds a column to keep counts during grouping
    df_scores_time['count'] = 1
    # Add year and month to group on
    df_scores_time['year'] = pd.to_datetime(df_scores_time['at']).dt.year
    df_scores_time['month'] = pd.to_datetime(df_scores_time['at']).dt.month
    # Group by year -> month -> score
    df_scores_time = df_scores_time.groupby(by=['year', 'month', 'score']).sum(numeric_only=True)
    df_grouped_scores_time = pd.DataFrame(df_scores_time['count'].index.to_list(), columns=['year', 'month', 'score'])
    # Keep only the values
    df_grouped_scores_time['count'] = df_scores_time['count'].values
    # Add a datetime column to the values
    df_grouped_scores_time['day'] = 1
    df_grouped_scores_time['date'] = pd.to_datetime(df_grouped_scores_time[['day', 'month', 'year']])
    return df_grouped_scores_time

In [12]:
def calculate_average(df: pd.DataFrame):
    """
    Takes a plotting dataframe and adds the average per month
    """
    # Initialize variables
    dict_average = {}
    Old_row_tuple = namedtuple('old_row', ['month', 'year'])
    old_row = Old_row_tuple(0, 0)
    total_score = 0
    total_reviews = 0

    for index, row in df.iterrows():
        # Is true if the new row is of the same month
        if (row.month, row.year) == (old_row.month, old_row.year):
            total_score += row.score * row['count']
            total_reviews += row['count']
            dict_average[(row.month, row.year)] = (total_score, total_reviews, total_score/total_reviews, row['date'])
            old_row = row
        # If the new row is the next month, create a new total score and total review
        else:
            total_score = row.score * row['count']
            total_reviews = row['count']
            dict_average[(row.month, row.year)] = (total_score, total_reviews, total_score/total_reviews, row['date'])
            old_row = row
            
    # Turns the dictionary into a dataframe        
    df_average = pd.DataFrame(dict_average).T
    df_average.columns = ['total score', 'total amount of reviews', 'average score', 'date']

    return df_average


In [13]:
df_grouped = preprocess_plotting(df_lichess)
df_grouped

Unnamed: 0,year,month,score,count,day,date
0,2015,2,2,2,1,2015-02-01
1,2015,2,3,1,1,2015-02-01
2,2015,2,4,5,1,2015-02-01
3,2015,2,5,30,1,2015-02-01
4,2015,3,3,3,1,2015-03-01
...,...,...,...,...,...,...
511,2023,12,1,7,1,2023-12-01
512,2023,12,2,2,1,2023-12-01
513,2023,12,3,1,1,2023-12-01
514,2023,12,4,3,1,2023-12-01


In [14]:
df_average = calculate_average(df_grouped)
df_average

Unnamed: 0,Unnamed: 1,total score,total amount of reviews,average score,date
2,2015,177,38,4.657895,2015-02-01
3,2015,72,16,4.5,2015-03-01
4,2015,47,11,4.272727,2015-04-01
5,2015,62,14,4.428571,2015-05-01
6,2015,49,10,4.9,2015-06-01
...,...,...,...,...,...
8,2023,592,179,3.307263,2023-08-01
9,2023,532,172,3.093023,2023-09-01
10,2023,525,157,3.343949,2023-10-01
11,2023,480,155,3.096774,2023-11-01


In [25]:
avg1 = calculate_average(preprocess_plotting(df_lichess))['average score'].to_numpy()
avg2 = calculate_average(preprocess_plotting(df_proton))['average score'].to_numpy()
print(fastdtw(avg1, avg2, dist=2))  # dist=2 = euclidean

(17.8170382196829, [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 0), (58, 0), (59, 0), (60, 1), (61, 2), (62, 2), (63, 2), (64, 3), (65, 4), (66, 4), (67, 4), (68, 4), (69, 4), (70, 4), (71, 4), (72, 4), (73, 4), (74, 4), (75, 4), (76, 4), (77, 4), (78, 4), (79, 5), (80, 6), (81, 7), (81, 8), (81, 9), (82, 10), (82, 11), (83, 12), (83, 13), (83, 14), (83, 15), (83, 16), (83, 17), (83, 18), (84, 19), (85, 19), (86, 20), (86, 21), (86, 22), (86, 23), (86, 24), (86, 25), (86, 26), (86, 27), (86, 28), (87, 29), (87, 30), (88, 31), (88,

In [8]:
raise Exception('test')

Exception: test

In [54]:
df_plot_1 = preprocess_plotting(df_lichess)
df_plot_2 = preprocess_plotting(df_proton)

In [61]:
fig = px.histogram(df_plot_1, x = 'date', y = 'count', barmode = 'group', color = 'score')
fig.show()

In [56]:
fig = px.histogram(df_plot_2, x = 'date', y = 'count', barmode = 'group', color = 'score')
fig.show()

In [63]:
fig = px.scatter(df_average, x='date', y='average score', trendline='ols')
fig.show()