# In [None]:
import sys
import pandas as pd
import numpy as np
import datetime as dt
from scipy import stats
import matplotlib.pyplot as plt


# Report layout: one line per statistic, joined with newlines (no trailing
# newline).  Every placeholder is a keyword field rendered with three
# significant digits (``:.3g``).
OUTPUT_TEMPLATE = "\n".join([
    "Initial (invalid) T-test p-value: {initial_ttest_p:.3g}",
    "Original data normality p-values: {initial_weekday_normality_p:.3g} {initial_weekend_normality_p:.3g}",
    "Original data equal-variance p-value: {initial_levene_p:.3g}",
    "Transformed data normality p-values: {transformed_weekday_normality_p:.3g} {transformed_weekend_normality_p:.3g}",
    "Transformed data equal-variance p-value: {transformed_levene_p:.3g}",
    "Weekly data normality p-values: {weekly_weekday_normality_p:.3g} {weekly_weekend_normality_p:.3g}",
    "Weekly data equal-variance p-value: {weekly_levene_p:.3g}",
    "Weekly T-test p-value: {weekly_ttest_p:.3g}",
    "Mann–Whitney U-test p-value: {utest_p:.3g}",
])


def _weekly_mean_counts(daily):
    """Return the mean ``comment_count`` per ISO (year, week) of ``daily``.

    ``daily`` is a DataFrame with a datetime ``date`` column and a numeric
    ``comment_count`` column.  Only ``comment_count`` is aggregated, so
    non-numeric columns (e.g. ``subreddit``) cannot break the mean.
    """
    iso = daily['date'].dt.isocalendar()
    return daily['comment_count'].groupby([iso['year'], iso['week']]).mean()


def main():
    """Compare weekday vs. weekend comment counts and print the p-values.

    Reads the line-delimited JSON file named by ``sys.argv[1]`` (columns
    used: ``date``, ``subreddit``, ``comment_count``), keeps the rows of the
    ``canada`` subreddit from 2012 and 2013, and runs:

    * a T-test on the raw daily counts, with normality and Levene
      equal-variance checks of its assumptions;
    * the same checks after a square-root transform (fix 1);
    * the checks and a T-test on per-ISO-week means (fix 2);
    * a two-sided Mann-Whitney U-test on the raw daily counts (fix 3).

    All tests are read against a 0.05 significance level.  The p-values
    quoted in the comments below are the ones recorded by the author from an
    earlier run; they are not recomputed here.
    """
    reddit_counts = sys.argv[1]
    counts = pd.read_json(reddit_counts, lines=True)

    # Restrict to /r/canada in 2012-2013.
    in_years = counts['date'].dt.year.isin([2012, 2013])
    is_canada = counts['subreddit'].str.lower() == 'canada'
    counts = counts[in_years & is_canada]

    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    is_weekend = counts['date'].dt.dayofweek.isin([5, 6])
    weekday_counts = counts[~is_weekend]
    weekend_counts = counts[is_weekend]

    weekday = weekday_counts['comment_count']
    weekend = weekend_counts['comment_count']

    # -------------- Initial T-test --------------
    # Recorded p ~ 1.3e-58 < 0.05 would reject equal means, but the
    # assumption checks below fail, so this result cannot be trusted.
    initial_ttest_p = stats.ttest_ind(weekend, weekday).pvalue

    # Recorded p ~ 1.0e-07 (weekday) and ~ 1.5e-03 (weekend): both < 0.05,
    # so normality is rejected for both samples.
    initial_weekday_normality_p = stats.normaltest(weekday).pvalue
    initial_weekend_normality_p = stats.normaltest(weekend).pvalue

    # Recorded p ~ 0.044 < 0.05: equal variances are rejected as well.
    initial_levene_p = stats.levene(weekend, weekday).pvalue

    # -------------- Fix 1: square-root transform --------------
    # Only sqrt is reported; the log/exp/square columns of the earlier
    # version were never used (and exp of the counts overflows).
    weekday_sqrt = np.sqrt(weekday)
    weekend_sqrt = np.sqrt(weekend)

    # Recorded outcome: weekends pass normality (p ~ 0.108 > 0.05) and the
    # variances look equal (p ~ 0.556 > 0.05), but weekdays still fail
    # normality, so the transform alone does not rescue the T-test.
    transformed_weekday_normality_p = stats.normaltest(weekday_sqrt).pvalue
    transformed_weekend_normality_p = stats.normaltest(weekend_sqrt).pvalue
    transformed_levene_p = stats.levene(weekend_sqrt, weekday_sqrt).pvalue

    # -------------- Fix 2: weekly means --------------
    weekly_weekday = _weekly_mean_counts(weekday_counts)
    weekly_weekend = _weekly_mean_counts(weekend_counts)

    # Recorded p ~ 0.308 and ~ 0.153 (> 0.05): normality is not rejected.
    weekly_weekday_normality_p = stats.normaltest(weekly_weekday).pvalue
    weekly_weekend_normality_p = stats.normaltest(weekly_weekend).pvalue

    # Recorded p ~ 0.204 > 0.05: equal variances are not rejected.
    weekly_levene_p = stats.levene(weekly_weekday, weekly_weekend).pvalue

    # Recorded p ~ 1.3e-34 < 0.05: the weekly mean comment counts differ
    # between weekdays and weekends.
    weekly_ttest_p = stats.ttest_ind(weekly_weekday, weekly_weekend).pvalue

    # -------------- Fix 3: Mann-Whitney U-test --------------
    # Non-parametric test on the raw daily counts.  The alternative is made
    # explicit because scipy's default has differed between versions.
    # A p-value < 0.05 rejects "it is equally likely that the larger count
    # falls on a weekend as on a weekday".
    utest_p = stats.mannwhitneyu(
        weekend, weekday, alternative='two-sided'
    ).pvalue

    # Report the computed values (previously every field was printed as 0).
    print(OUTPUT_TEMPLATE.format(
        initial_ttest_p=initial_ttest_p,
        initial_weekday_normality_p=initial_weekday_normality_p,
        initial_weekend_normality_p=initial_weekend_normality_p,
        initial_levene_p=initial_levene_p,
        transformed_weekday_normality_p=transformed_weekday_normality_p,
        transformed_weekend_normality_p=transformed_weekend_normality_p,
        transformed_levene_p=transformed_levene_p,
        weekly_weekday_normality_p=weekly_weekday_normality_p,
        weekly_weekend_normality_p=weekly_weekend_normality_p,
        weekly_levene_p=weekly_levene_p,
        weekly_ttest_p=weekly_ttest_p,
        utest_p=utest_p,
    ))


# Script entry point: run the analysis only when executed directly, not when
# this module is imported.
if "__main__" == __name__:
    main()