In [2]:
import pandas as pd
import dask.dataframe as dd

import imp

# Load the project's helper modules directly from their source files.
# NOTE(review): `imp` is deprecated and was removed in Python 3.12; migrate
# to `importlib.util.spec_from_file_location` when the environment is upgraded.
prepare_data = imp.load_source('prepare_data', '../src/prepare_data.py')
# Register the module under its real name (it was misspelled 'processe_data').
process_data = imp.load_source('process_data', '../src/process_data.py')

In [3]:
import statsmodels.api as sm

In [4]:
# Regressors shared by the models of every tournament type.
_transition_rates = [f"{state}_to_C_rate" for state in ("CC", "CD", "DC", "DD")]
_rating_stats = ("max", "min", "median", "mean")

features = (
    _transition_rates
    + ["SSE", "Cooperation_rating"]
    # Summary statistics of the cooperation rating ...
    + [f"Cooperation_rating_{stat}" for stat in _rating_stats]
    # ... and the rating compared to each of those statistics.
    + [f"Cooperation_rating_comp_to_{stat}" for stat in _rating_stats]
    + ["repetitions", "size"]
)

In [5]:
# Display labels (plain text or LaTeX math) for the data-set columns; used to
# relabel the index of the regression tables.
features_labels = {
    # Rates of cooperating after each previous-round outcome
    "CC_to_C_rate": "$CC$ to $C$ rate",
    "CD_to_C_rate": "$CD$ to $C$ rate",
    "DC_to_C_rate": "$DC$ to $C$ rate",
    "DD_to_C_rate": "$DD$ to $C$ rate",
    "SSE": "SSE",
    "Makes_use_of_game": "Make use of game",
    "Makes_use_of_length": "Make use of length",
    "Stochastic": "stochastic",
    # Cooperation rating and its summary statistics
    "Cooperation_rating": r"$C_r$",
    "Cooperation_rating_max": r"$C_{max}$",
    "Cooperation_rating_min": r"$C_{min}$",
    "Cooperation_rating_median": r"$C_{median}$",
    "Cooperation_rating_mean": r"$C_{mean}$",
    # Cooperation rating compared to each summary statistic
    # (no trailing whitespace, so every label is formatted consistently)
    "Cooperation_rating_comp_to_max": r"$C_r$ / $C_{max}$",
    "Cooperation_rating_comp_to_min": r"$C_r$ / $C_{min}$",
    "Cooperation_rating_comp_to_median": r"$C_r$ / $C_{median}$",
    "Cooperation_rating_comp_to_mean": r"$C_r$ / $C_{mean}$",
    # Tournament parameters and outcomes
    "turns": r"$n$",
    "noise": r"$p_n$",
    "probend": r"$p_e$",
    "Normalized_Rank": r"$r$",
    "Median_score": "Median score",
    "size": r"$N$",
    "memory_usage": "memory usage",
    "repetitions": r"$k$",
}

In [6]:
# Read the processed results of each tournament type into its own frame and
# tag every row with the type it came from.
dfs = []
for name in ("standard", "noise", "probend", "probend_noise"):
    csv_path = "../data/%s_3_processed.csv" % name
    df = pd.read_csv(csv_path, index_col=0)
    df["type"] = name
    dfs.append(df)

  mask |= (ar1 == a)


In [7]:
# Tournament types, listed in the same order as the frames stored in `dfs`.
types = [
    "standard",
    "noise",
    "probend",
    "probend_noise",
]

In [49]:
# Standard tournaments: regress the normalised rank on the shared features
# plus the regressors specific to this tournament type.
a_type, df = types[0], dfs[0]

xs = features + ["turns", "memory_usage"]

X, y = df[xs].values, df["Normalized_Rank"].values

# No constant column is appended to X, so the model is fitted without an
# intercept.
model = sm.OLS(y, X).fit()

banner = "============"
print(banner, a_type, banner, sep="\n")
print(model.summary(xname=xs))

standard
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.886
Model:                            OLS   Adj. R-squared (uncentered):              0.886
Method:                 Least Squares   F-statistic:                          4.347e+05
Date:                Mon, 03 Feb 2020   Prob (F-statistic):                        0.00
Time:                        17:46:38   Log-Likelihood:                      2.1510e+05
No. Observations:             1005043   AIC:                                 -4.302e+05
Df Residuals:                 1005025   BIC:                                 -4.299e+05
Df Model:                          18                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
---------

For **standard types** all the p-values are smaller than 0.05; thus, all variables have a significant effect. The variables with the largest effect, judged by the magnitude of their coefficients, are $C_{max}$, $C_{mean}$, $C_r$ / $C_{mean}$ and $C_r$.

In [16]:
# Noisy tournaments: regress the normalised rank on the shared features plus
# the regressors specific to this tournament type.
a_type, df = types[1], dfs[1]

xs = features + ["noise", "turns", "memory_usage"]

X, y = df[xs].values, df["Normalized_Rank"].values

# No constant column is appended to X, so the model is fitted without an
# intercept.
model = sm.OLS(y, X).fit()

banner = "============"
print(banner, a_type, banner, sep="\n")
print(model.summary(xname=xs))

noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.912
Model:                            OLS   Adj. R-squared (uncentered):              0.912
Method:                 Least Squares   F-statistic:                          5.486e+05
Date:                Thu, 30 Jan 2020   Prob (F-statistic):                        0.00
Time:                        14:12:55   Log-Likelihood:                      3.4470e+05
No. Observations:             1005190   AIC:                                 -6.894e+05
Df Residuals:                 1005171   BIC:                                 -6.891e+05
Df Model:                          19                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
------------

For **noise types** all except two features are important. The two which are not are _repetitions_ and _memory usage_. The rest of the features have a significant effect. The features with the highest coefficients, both negative and positive, are $C_r$, $C_{max}$, $C_{mean}$ and $C_r$ / $C_{max}$ and, with a coefficient of about $-0.5$, $C_r$ / $C_{mean}$.

In [17]:
# Probabilistic-ending tournaments: regress the normalised rank on the shared
# features plus the regressor specific to this tournament type.
a_type, df = types[2], dfs[2]

xs = features + ["probend"]

X, y = df[xs].values, df["Normalized_Rank"].values

# No constant column is appended to X, so the model is fitted without an
# intercept.
model = sm.OLS(y, X).fit()

banner = "============"
print(banner, a_type, banner, sep="\n")
print(model.summary(xname=xs))

probend
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.900
Model:                            OLS   Adj. R-squared (uncentered):              0.900
Method:                 Least Squares   F-statistic:                          5.300e+05
Date:                Thu, 30 Jan 2020   Prob (F-statistic):                        0.00
Time:                        14:14:54   Log-Likelihood:                      2.7833e+05
No. Observations:             1005190   AIC:                                 -5.566e+05
Df Residuals:                 1005173   BIC:                                 -5.564e+05
Df Model:                          17                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
----------

For **probabilistic ending** tournaments all but two features are significant. The non-significant features are size and the probability of the match ending. Regarding the significant features, the ones with the highest coefficients, positive and negative, are: $C_{max}$, $C_{mean}$, $C_r$ / $C_{mean}$, $C_r$ / $C_{max}$.

In [20]:
# Noisy probabilistic-ending tournaments: regress the normalised rank on the
# shared features plus the regressors specific to this tournament type.
a_type, df = types[3], dfs[3]

xs = features + ["probend", "noise"]

X, y = df[xs].values, df["Normalized_Rank"].values

# No constant column is appended to X, so the model is fitted without an
# intercept.
model = sm.OLS(y, X).fit()

banner = "============"
print(banner, a_type, banner, sep="\n")
print(model.summary(xname=xs))

probend_noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.895
Model:                            OLS   Adj. R-squared (uncentered):              0.895
Method:                 Least Squares   F-statistic:                          4.782e+05
Date:                Thu, 30 Jan 2020   Prob (F-statistic):                        0.00
Time:                        14:21:20   Log-Likelihood:                      2.5777e+05
No. Observations:             1005190   AIC:                                 -5.155e+05
Df Residuals:                 1005172   BIC:                                 -5.153e+05
Df Model:                          18                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
----

For **noisy probabilistic ending** tournaments all features are significant ($p$ value $< 0.05$).  The features with the highest effect on the normalised rank are $C_{max}$, $C_r$ / $C_{max}$, $C_{mean}$.  The effects are now smaller than in the rest of the tournament types.

The $R^2$ values of the models are larger than 0.8, which means that these models explain more than 80% of the variance in the data (note that these are uncentered $R^2$ values, as reported in the summaries above). The models are good.

In [72]:
# Refit the regression of every tournament type in a single pass, collecting
# each coefficient table together with its adjusted R-squared entry.
extra_regressors = {
    "standard": ["turns", "memory_usage"],
    "noise": ["noise", "turns", "memory_usage"],
    "probend": ["probend"],
    "probend_noise": ["probend", "noise"],
}

tables, rs = [], []
for i, a_type in enumerate(types):
    df = dfs[i]

    # Types missing from the mapping fall back to the shared features only.
    xs = features + extra_regressors.get(a_type, [])

    X = df[xs].values
    y = df["Normalized_Rank"].values
    model = sm.OLS(y, X).fit()

    summary = model.summary2(xname=xs)

    # Join the label cell and the value cell of the summary header into one
    # string, e.g. 'Adj. R-squared (uncentered):0.886'.
    header = summary.tables[0]
    rs.append(header[2][0] + header[3][0])

    # Keep only the coefficients and p-values, relabelled for display.
    table = summary.tables[1][["Coef.", "P>|t|"]].round(5)
    table.index = [features_labels[key] for key in table.index]

    tables.append(table)

In [77]:
# Show the coefficient / p-value tables side by side, one column pair per
# tournament type. The tables do not share an identical index, so the row
# order is requested explicitly: sort=True keeps the sorted order and avoids
# pandas' FutureWarning about the changing default.
pd.concat(tables, axis=1, sort=True).round(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,Coef.,P>|t|,Coef..1,P>|t|.1,Coef..2,P>|t|.2,Coef..3,P>|t|.3
$CC$ to $C$ rate,-0.05,0.0,-0.073,0.0,0.009,0.0,0.009,0.0
$CD$ to $C$ rate,0.298,0.0,-0.018,0.0,0.192,0.0,0.072,0.0
$C_r$,-0.701,0.0,-0.959,0.0,0.348,0.0,-0.383,0.0
$C_r$ / $C_{max}$,2.587,0.0,2.389,0.0,1.679,0.0,1.348,0.0
$C_r$ / $C_{mean}$,-1.488,0.0,-0.492,0.0,-1.38,0.0,-0.07,0.0
$C_r$ / $C_{median}$,-0.189,0.0,0.026,0.0,0.455,0.0,-0.01,0.0
$C_r$ / $C_{min}$,0.086,0.0,-0.268,0.0,0.108,0.0,-0.202,0.0
$C_{max}$,1.961,0.0,1.544,0.0,1.411,0.0,0.78,0.0
$C_{mean}$,-2.538,0.0,-2.369,0.0,-2.578,0.0,-1.192,0.0
$C_{median}$,0.511,0.0,-0.079,0.0,0.47,0.0,-0.212,0.0


In [81]:
# Export the combined regression table to the paper's LaTeX sources.
# sort=True pins the sorted row order explicitly, because the tables do not
# share an identical index and pandas' default ordering is changing.
latex_table = pd.concat(tables, axis=1, sort=True).round(3).to_latex()

# Undo the escaping of `$` and `_` so the math-mode labels survive. Raw
# strings are used because '\$' and '\_' are invalid escape sequences.
# NOTE(review): depending on the pandas version, `to_latex` may also escape the
# braces in labels such as $C_{max}$ -- confirm the rendered table.
latex_table = latex_table.replace(r"\$", "$").replace(r"\_", "_")

# The context manager closes the file even if the write fails.
with open("../paper/regression_results.tex", "w") as file:
    file.write(latex_table)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [82]:
# Display the adjusted R-squared entry collected for each tournament type,
# in the order of `types`.
rs

['Adj. R-squared (uncentered):0.886',
 'Adj. R-squared (uncentered):0.912',
 'Adj. R-squared (uncentered):0.900',
 'Adj. R-squared (uncentered):0.895']