Linear Regression
=================

A multivariate linear regression is fitted to understand the effects of the features on the median score.

**Imports**

In [1]:
import pandas as pd
import dask.dataframe as dd

import imp

plot = imp.load_source('plot', '../src/plot.py')

  import imp


In [2]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
features = [
    "CC_to_C_rate",
    "CD_to_C_rate",
    "DC_to_C_rate",
    "DD_to_C_rate",
    "SSE",
    "Cooperation_rating",
    "Cooperation_rating_max",
    "Cooperation_rating_min",
    "Cooperation_rating_median",
    "Cooperation_rating_mean",
    "Cooperation_rating_comp_to_max",
    "Cooperation_rating_comp_to_min",
    "Cooperation_rating_comp_to_median",
    "Cooperation_rating_comp_to_mean",
    "repetitions",
]

# Main Paper Results

In [6]:
dfs = []
for name in ['standard', 'subset_noise', 'subset_probend', 'subset_probend_noise']:
    df = pd.read_csv(f'../data/{name}_processed.csv', index_col=0
                           )
    df['type'] = name
    dfs.append(df)

In [7]:
types = ['standard', 'noise', 'probend', 'probend_noise']

In [8]:
dfs[0].head()

Unnamed: 0,Rank,Name,Median_score,Cooperation_rating,Wins,Initial_C_rate,CC_rate,CD_rate,DC_rate,DD_rate,...,Cooperation_rating_max,Cooperation_rating_min,Cooperation_rating_median,Cooperation_rating_mean,Cooperation_rating_comp_to_max,Cooperation_rating_comp_to_min,Cooperation_rating_comp_to_median,Cooperation_rating_comp_to_mean,memory_usage,type
0,0,EvolvedLookerUp2_2_2,2.969925,0.704614,28.0,1.0,0.639084,0.06553,0.189481,0.105905,...,0.966144,0.0,0.664537,0.60628,0.729305,0.0,1.060307,1.162192,1.0,standard
1,1,Evolved FSM 16 Noise 05,2.874888,0.6968,21.0,1.0,0.676353,0.020448,0.135146,0.168054,...,0.966144,0.0,0.664537,0.60628,0.721218,0.0,1.04855,1.149304,1.0,standard
2,2,PSO Gambler 1_1_1,2.874216,0.684317,23.0,1.0,0.650701,0.033616,0.151543,0.16414,...,0.966144,0.0,0.664537,0.60628,0.708298,0.0,1.029765,1.128714,1.0,standard
3,3,PSO Gambler Mem1,2.861493,0.705582,23.0,1.0,0.663295,0.042287,0.144599,0.149819,...,0.966144,0.0,0.664537,0.60628,0.730308,0.0,1.061764,1.163789,0.007463,standard
4,4,Winner12,2.834701,0.681854,20.0,1.0,0.650588,0.031267,0.14092,0.177226,...,0.966144,0.0,0.664537,0.60628,0.705749,0.0,1.026059,1.124652,0.014925,standard


**Standard Tournaments**

In [18]:
a_type = types[0]
df = dfs[0]
    
xs = features.copy()
xs += ["turns", "memory_usage"]

In [19]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('Cooperation_rating_median')
xs.remove('DD_to_C_rate')

In [21]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [22]:
features_standard = xs.copy()

In [23]:
X = df[features_standard].values
y = df['Median_score'].values

X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=['constant'] + xs))

standard
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.575
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                 1.132e+05
Date:                Fri, 01 Dec 2023   Prob (F-statistic):               0.00
Time:                        14:50:51   Log-Likelihood:                 98524.
No. Observations:             1002803   AIC:                        -1.970e+05
Df Residuals:                 1002790   BIC:                        -1.969e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
c

**Noisy Tournaments**

In [24]:
index = 1
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ['noise', 'memory_usage']

In [25]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('DD_to_C_rate')

In [26]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [27]:
features_noisy = xs.copy()

In [28]:
X = df[features_noisy].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.996
Model:                            OLS   Adj. R-squared (uncentered):              0.996
Method:                 Least Squares   F-statistic:                          1.788e+06
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        14:51:12   Log-Likelihood:                          52343.
No. Observations:              102337   AIC:                                 -1.047e+05
Df Residuals:                  102324   BIC:                                 -1.045e+05
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
------------

**Probabilistic Ending Tournaments**

In [29]:
index = 2
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ["probend"]

In [30]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('DD_to_C_rate')

In [32]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [33]:
features_probend = xs.copy()

In [34]:
X = df[features_probend].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

probend
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                          2.464e+06
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        14:51:32   Log-Likelihood:                          50765.
No. Observations:               98702   AIC:                                 -1.015e+05
Df Residuals:                   98690   BIC:                                 -1.014e+05
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                                        coef    std err          t      P>|t|      [0.025      0.975]
----------

**Noisy Probabilistic Ending Tournaments**

In [35]:
index = 3
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ["probend", "noise"]

In [36]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('DD_to_C_rate')

In [37]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [38]:
features_probend_noisy = xs.copy()

In [39]:
X = df[features_probend_noisy].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

probend_noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.999
Model:                            OLS   Adj. R-squared (uncentered):              0.999
Method:                 Least Squares   F-statistic:                          6.224e+05
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        14:51:57   Log-Likelihood:                          10037.
No. Observations:                9681   AIC:                                 -2.005e+04
Df Residuals:                    9669   BIC:                                 -1.996e+04
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
------

In [40]:
tables = []
rs = []
for i, xs in enumerate([features_standard, features_noisy, features_probend, features_probend_noisy]):
    df = dfs[i]

    X = df[xs].values
    y = df['Median_score'].values

    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    
    table = model.summary2(xname=['constant'] + xs)
    rs.append(table.tables[0][2][0] + table.tables[0][3][0])
    table = table.tables[1][['Coef.', 'P>|t|']].round(5)
    table.index = ['constant'] + [plot.features_labels[index] for index in table.index[1:]]
    
    tables.append(table)

In [49]:
regression_table = pd.concat(tables, axis=1).round(3)

In [50]:
regression_table

Unnamed: 0,Coef.,P>|t|,Coef..1,P>|t|.1,Coef..2,P>|t|.2,Coef..3,P>|t|.3
constant,0.928,0.0,1.082,0.0,1.259,0.0,1.642,0.0
$CC$ to $C$ rate,0.043,0.0,0.104,0.0,0.024,0.0,-0.032,0.0
$CD$ to $C$ rate,-0.325,0.0,-0.052,0.0,-0.229,0.0,-0.11,0.0
$DC$ to $C$ rate,-0.204,0.0,-0.076,0.0,-0.102,0.0,-0.07,0.0
SSE,-0.294,0.0,-0.186,0.0,-0.131,0.0,-0.109,0.0
$C_{max}$,0.056,0.0,-0.06,0.011,-0.005,0.849,-0.24,0.0
$C_{min}$,0.156,0.0,-0.159,0.0,0.012,0.385,0.083,0.005
$C_{mean}$,1.838,0.0,2.247,0.0,1.822,0.0,2.03,0.0
$C_{min}$ / $C_r$,-0.049,0.0,0.04,0.0,-0.018,0.0,-0.065,0.0
$C_r$ / $C_{mean}$,0.552,0.0,-0.227,0.0,0.027,0.125,-0.043,0.0


In [51]:
file = open('../paper/regression_results_on_median_score.tex', "w")
table_to_write = regression_table.to_latex().replace('\$', '$').replace('\_', '_').replace('NaN', '  -')
file.write(table_to_write.replace('\{', '{').replace('\}', '}'))
file.close()

In [52]:
rs

['Adj. R-squared:0.575',
 'Adj. R-squared:0.561',
 'Adj. R-squared:0.488',
 'Adj. R-squared:0.762']

# Supplementary Information 

In [53]:
dfs = []
for name in ['standard', 'noise', 'probend', 'probend_noise']:
    df = pd.read_csv(f'../data/{name}_processed.csv', index_col=0
                           )
    df['type'] = name
    dfs.append(df)

In [54]:
types = ['standard', 'noise', 'probend', 'probend_noise']

In [55]:
dfs[0].head()

Unnamed: 0,Rank,Name,Median_score,Cooperation_rating,Wins,Initial_C_rate,CC_rate,CD_rate,DC_rate,DD_rate,...,Cooperation_rating_max,Cooperation_rating_min,Cooperation_rating_median,Cooperation_rating_mean,Cooperation_rating_comp_to_max,Cooperation_rating_comp_to_min,Cooperation_rating_comp_to_median,Cooperation_rating_comp_to_mean,memory_usage,type
0,0,EvolvedLookerUp2_2_2,2.969925,0.704614,28.0,1.0,0.639084,0.06553,0.189481,0.105905,...,0.966144,0.0,0.664537,0.60628,0.729305,0.0,1.060307,1.162192,1.0,standard
1,1,Evolved FSM 16 Noise 05,2.874888,0.6968,21.0,1.0,0.676353,0.020448,0.135146,0.168054,...,0.966144,0.0,0.664537,0.60628,0.721218,0.0,1.04855,1.149304,1.0,standard
2,2,PSO Gambler 1_1_1,2.874216,0.684317,23.0,1.0,0.650701,0.033616,0.151543,0.16414,...,0.966144,0.0,0.664537,0.60628,0.708298,0.0,1.029765,1.128714,1.0,standard
3,3,PSO Gambler Mem1,2.861493,0.705582,23.0,1.0,0.663295,0.042287,0.144599,0.149819,...,0.966144,0.0,0.664537,0.60628,0.730308,0.0,1.061764,1.163789,0.007463,standard
4,4,Winner12,2.834701,0.681854,20.0,1.0,0.650588,0.031267,0.14092,0.177226,...,0.966144,0.0,0.664537,0.60628,0.705749,0.0,1.026059,1.124652,0.014925,standard


**Standard Tournaments**

In [56]:
a_type = types[0]
df = dfs[0]
    
xs = features.copy()
xs += ["turns", "memory_usage"]

In [57]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('Cooperation_rating_median')
xs.remove('DD_to_C_rate')

In [59]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [60]:
features_standard = xs.copy()

In [61]:
X = df[features_standard].values
y = df['Median_score'].values

X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=['constant'] + xs))

standard
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.575
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                 1.132e+05
Date:                Fri, 01 Dec 2023   Prob (F-statistic):               0.00
Time:                        15:00:41   Log-Likelihood:                 98524.
No. Observations:             1002803   AIC:                        -1.970e+05
Df Residuals:                 1002790   BIC:                        -1.969e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
c

**Noisy Tournaments**

In [62]:
index = 1
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ['noise', 'memory_usage']

In [63]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('Cooperation_rating_max')
xs.remove('DD_to_C_rate')

In [26]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [64]:
features_noisy = xs.copy()

In [65]:
X = df[features_noisy].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.991
Model:                            OLS   Adj. R-squared (uncentered):              0.991
Method:                 Least Squares   F-statistic:                          9.537e+06
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        15:01:01   Log-Likelihood:                          90403.
No. Observations:             1002950   AIC:                                 -1.808e+05
Df Residuals:                 1002939   BIC:                                 -1.807e+05
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------

**Probabilistic Ending Tournaments**

In [66]:
index = 2
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ["probend"]

In [67]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('DD_to_C_rate')

In [32]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [68]:
features_probend = xs.copy()

In [69]:
X = df[features_probend].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

probend
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                          1.797e+07
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        15:01:13   Log-Likelihood:                      2.2454e+05
No. Observations:             1002950   AIC:                                 -4.490e+05
Df Residuals:                 1002939   BIC:                                 -4.489e+05
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------

**Noisy Probabilistic Ending Tournaments**

In [70]:
index = 3
a_type = types[index]
df = dfs[index]
    
xs = features.copy()
xs += ["probend", "noise"]

In [71]:
xs.remove('Cooperation_rating')
xs.remove('Cooperation_rating_median')
xs.remove('Cooperation_rating_comp_to_median')
xs.remove('Cooperation_rating_comp_to_max')
xs.remove('Cooperation_rating_comp_to_min')
xs.remove('Cooperation_rating_min')
xs.remove('noise')
xs.remove('DD_to_C_rate')

In [37]:
plt.figure(figsize=(12,10))

cor = df[xs].corr()

sns.heatmap(cor, annot=True, cmap="viridis");

In [72]:
features_probend_noisy = xs.copy()

In [73]:
X = df[features_probend_noisy].values
y = df['Median_score'].values

model = sm.OLS(y, X).fit()

print("============")
print(f"{a_type}")
print("============")
print(model.summary(xname=xs))

probend_noise
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.987
Method:                 Least Squares   F-statistic:                          8.514e+06
Date:                Fri, 01 Dec 2023   Prob (F-statistic):                        0.00
Time:                        15:01:25   Log-Likelihood:                         -65295.
No. Observations:             1002950   AIC:                                  1.306e+05
Df Residuals:                 1002941   BIC:                                  1.307e+05
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
------

In [74]:
tables = []
rs = []
for i, xs in enumerate([features_standard, features_noisy, features_probend, features_probend_noisy]):
    df = dfs[i]

    X = df[xs].values
    y = df['Median_score'].values

    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    
    table = model.summary2(xname=['constant'] + xs)
    rs.append(table.tables[0][2][0] + table.tables[0][3][0])
    table = table.tables[1][['Coef.', 'P>|t|']].round(5)
    table.index = ['constant'] + [plot.features_labels[index] for index in table.index[1:]]
    
    tables.append(table)

In [75]:
regression_table = pd.concat(tables, axis=1).round(3)

In [76]:
regression_table

Unnamed: 0,Coef.,P>|t|,Coef..1,P>|t|.1,Coef..2,P>|t|.2,Coef..3,P>|t|.3
constant,0.928,0.0,2.143,0.0,2.466,0.0,1.824,0.0
$CC$ to $C$ rate,0.043,0.0,-0.468,0.0,0.223,0.0,-0.008,0.0
$CD$ to $C$ rate,-0.325,0.0,0.105,0.0,0.06,0.0,0.074,0.0
$DC$ to $C$ rate,-0.204,0.0,0.06,0.0,0.066,0.0,-0.002,0.011
SSE,-0.294,0.0,-0.365,0.0,0.055,0.0,-0.035,0.0
$C_{max}$,0.056,0.0,,,-0.045,0.081,-0.181,0.0
$C_{min}$,0.156,0.0,0.264,0.0,0.311,0.0,,
$C_{mean}$,1.838,0.0,2.046,0.0,1.506,0.0,2.273,0.0
$C_{min}$ / $C_r$,-0.049,0.0,-0.252,0.0,-0.204,0.0,,
$C_r$ / $C_{mean}$,0.552,0.0,-0.579,0.0,-1.137,0.0,-0.61,0.0


In [77]:
file = open('../paper/regression_results_on_median_score_full_dataset.tex', "w")
table_to_write = regression_table.to_latex().replace('\$', '$').replace('\_', '_').replace('NaN', '  -')
file.write(table_to_write.replace('\{', '{').replace('\}', '}'))
file.close()

In [78]:
rs

['Adj. R-squared:0.575',
 'Adj. R-squared:0.666',
 'Adj. R-squared:0.816',
 'Adj. R-squared:0.869']