In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Descriptive Statistics

In [None]:
# Read both per-site question tables, using the `qid` column as the row index.
wordpress, drupal = (
    pd.read_csv(csv_path, index_col='qid')
    for csv_path in ('data/wordpress.csv', 'data/drupal.csv')
)

In [13]:
# Print summary statistics as markdown tables, WordPress first, then Drupal.
# `dayofweek` is left out of the summary.
for frame in (wordpress, drupal):
    summary = frame.drop(columns=['dayofweek']).describe()
    print(summary.to_markdown())

|       |   activity |   exposure |        response |        endurance |
|:------|-----------:|-----------:|----------------:|-----------------:|
| count | 6075       |   6075     |  4594           |   6075           |
| mean  |    2.16247 |    380.268 |     1.11131e+06 |      3.27548e+06 |
| std   |    3.70932 |   1142.33  |     5.04794e+06 |      1.01582e+07 |
| min   |    0       |      8     |     0           |      0           |
| 25%   |    1       |     43     |  1541.5         |   1504           |
| 50%   |    2       |    114     |  6261           |  12294           |
| 75%   |    3       |    371     | 53691.5         | 154420           |
| max   |  214       |  49000     |     6.29331e+07 |      6.61956e+07 |
|       |   activity |   exposure |         response |        endurance |
|:------|-----------:|-----------:|-----------------:|-----------------:|
| count |  228       |    228     |    166           |    228           |
| mean  |    1.75    |    256.127 |      1.44988

# Pie Charts

In [None]:
def valuecounts(df1,df2,col,bins,labels):
    """
    Count how many rows of each frame fall into each bin of `col`.

    Bins are half-open on the right (``[lo, hi)``), so a value equal to the
    last edge does not belong to the last labelled bin; it is counted in the
    extra ``'more'`` row together with everything above it.  Missing values
    and values below ``bins[0]`` are not counted anywhere.

    :param df1: wordpress
    :param df2: drupal
    :param col: column to count
    :param bins: ascending list of bin edges
    :param labels: one label per bin, i.e. ``len(bins) - 1`` entries
    :return: df of value counts with columns ``wordpress`` and ``drupal``;
        the first row is ``'more'`` (values >= ``bins[-1]``), followed by one
        row per label in bin order
    """
    def _binned_counts(frame):
        # sort=False keeps the rows in bin order instead of by frequency.
        return pd.cut(frame[col],bins=bins,right=False,labels=labels).value_counts(sort=False)

    counts = pd.DataFrame({'wordpress': _binned_counts(df1), 'drupal': _binned_counts(df2)})
    counts = counts.sort_index()

    # `>=` (not `>`): with right=False the last edge itself is outside every bin.
    upper = bins[-1]
    overflow = pd.DataFrame(
        [[(df1[col] >= upper).sum(), (df2[col] >= upper).sum()]],
        columns=counts.columns,
        index=['more'],
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([overflow, counts])

In [15]:
# Ascending bin edges per column, passed to `valuecounts` (bins are [lo, hi)).
# The edges for `response`/`endurance` match their labels when read as
# seconds: 3600 = 1 hour, 21600 = 6 hours, 86400 = 1 day, 259200 = 3 days,
# 604800 = 1 week.
col2bins = {
    'activity':[0,1,2,3,4,5,6],
    'exposure':[0,20,100,500],
    'dayofweek':[0,1,2,3,4,5,6,7],
    'response':[0,3600,21600,86400,259200,604800],
    'endurance':[0,3600,21600,86400,259200,604800]
}

# One label per bin. For the time columns each label names the bin's upper
# edge; the second edge is 21600 s, i.e. 6 hours (it was mislabelled '3 hours').
col2labels = {
    'activity':['0','1','2','3','4','5'],
    'exposure':['limited(0-20)','low(20-100)','average(100-500)'],
    'dayofweek':['Mon','Tue','Wed','Thur','Fri','Sat','Sun'],
    'response':['1 hour','6 hours','1 day','3 days','1 week'],
    'endurance':['1 hour','6 hours','1 day','3 days','1 week']
}

for col in wordpress.columns:
    # Skip columns without a bin specification (e.g. derived columns added by
    # later cells) so the cell can be re-run without a KeyError.
    if col not in col2bins:
        continue

    df = valuecounts(wordpress,drupal,col,col2bins[col],col2labels[col])

    # Two donut charts side by side: WordPress on the left, Drupal on the right.
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
    # sort=False keeps the slices in bin order rather than ordering by size.
    fig.add_trace(go.Pie(labels=df.index, values=df['wordpress'], name=f"WordPress {col}",sort=False),1, 1)
    fig.add_trace(go.Pie(labels=df.index, values=df['drupal'], name=f"Drupal {col}",sort=False),1, 2)
    fig.update_traces(hole=.4, hoverinfo="label+percent+name")

    # Site names are drawn inside the donut holes.
    fig.update_layout(
        annotations=[dict(text='WordPress', x=0.18, y=0.5, font_size=10, showarrow=False),
                    dict(text='Drupal', x=0.82, y=0.5, font_size=10, showarrow=False)]
    )
    fig.show()
    fig.write_image(f'./img/pie_{col}.png')

# Correlation Analysis

In [25]:
# one-hot encoding
# wordpress_onehot = pd.concat([wordpress,pd.get_dummies(wordpress.dayofweek,prefix='day_')],axis=1)
# wordpress_onehot.drop(['dayofweek'],inplace=True,axis=1)
# drupal_onehot = pd.concat([drupal,pd.get_dummies(drupal.dayofweek,prefix='day_')],axis=1)
# drupal_onehot.drop(['dayofweek'],inplace=True,axis=1)

In [32]:
# cm1 = wordpress_onehot.corr(method='spearman')
# cm2 = drupal_onehot.corr(method='spearman')

In [33]:
# Add a boolean `weekend` column: True where dayofweek is 5 or higher.
wordpress = wordpress.assign(weekend=wordpress['dayofweek'] >= 5)
drupal = drupal.assign(weekend=drupal['dayofweek'] >= 5)

# Pairwise correlation matrices (pandas default method) for each site.
cm1, cm2 = wordpress.corr(), drupal.corr()

In [44]:
# Side-by-side correlation heatmaps: cm1 on the left, cm2 on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'xy'}]])
# Take the axis labels from the correlation matrices themselves rather than
# from the source frames, so the labels always line up with the cells even
# if `corr()` left a column out.
fig.add_trace(go.Heatmap(z=cm1,x=cm1.columns,y=cm1.index),1, 1)
fig.add_trace(go.Heatmap(z=cm2,x=cm2.columns,y=cm2.index),1, 2)

fig.show()
fig.write_image('./img/heatmap_correlation.png')

### (exposure, activity)

In [45]:
# Drop outliers: keep only rows with exposure of at most 500 and endurance
# of at most 86400*7.
wordpress_keep = (wordpress['exposure'] <= 500) & (wordpress['endurance'] <= 86400*7)
drupal_keep = (drupal['exposure'] <= 500) & (drupal['endurance'] <= 86400*7)

wordpress = wordpress[wordpress_keep]
drupal = drupal[drupal_keep]

In [51]:
# Tag every row with its site name; the scatter plot colours points by it.
# `assign` returns a new frame instead of writing into the filtered frames
# produced by the outlier cell, which avoids pandas' SettingWithCopyWarning.
wordpress = wordpress.assign(color='WordPress')
drupal = drupal.assign(color='Drupal')
df = pd.concat([wordpress,drupal])

# Exposure vs. activity for both sites, with marginal histograms on each axis.
fig = px.scatter(df, x="exposure", y="activity", color="color",marginal_x="histogram", marginal_y="histogram")
fig.show()
fig.write_image('img/scatter(ex-act).png')

### (endurance, response)

In [52]:
# Endurance vs. response on the combined frame `df`, coloured by site,
# with a marginal histogram on each axis.
fig = px.scatter(
    df,
    x="endurance",
    y="response",
    color="color",
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.show()
fig.write_image('img/scatter(end-res).png')

# Significance Test

In [55]:
from scipy import stats

# Reload both tables from disk so the tests run on the data as stored in the
# CSV files, not on the frames modified by earlier cells.
wordpress, drupal = (
    pd.read_csv(csv_path, index_col='qid')
    for csv_path in ('data/wordpress.csv', 'data/drupal.csv')
)

# Kruskal-Wallis test per column (WordPress vs. Drupal), ignoring missing
# values; `dayofweek` is not tested.
for col in wordpress.columns:
    if col != 'dayofweek':
        print(col)
        wordpress_values = wordpress[col].dropna()
        drupal_values = drupal[col].dropna()
        print(stats.kruskal(wordpress_values, drupal_values))

activity
KruskalResult(statistic=6.372753054875913, pvalue=0.011588567490143718)
exposure
KruskalResult(statistic=0.003667487312050593, pvalue=0.9517098032545275)
response
KruskalResult(statistic=58.116648795214616, pvalue=2.4702682328584256e-14)
endurance
KruskalResult(statistic=21.0361280046081, pvalue=4.507041160108326e-06)
