In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
cicero_df = pd.read_csv('../data/cicero_df.csv')
cicero_df.head()

Unnamed: 0,title,total_clausulae,total_excluded,abbrev_excluded,bracket_excluded,short_excluded,cretic-trochee (-u--x),cretic-trochee 1 res (uuu--x),cretic-trochee 1 res (-uuu-x),cretic-trochee 1 res (-u-uux),...,hypodochmiac 1 res (-uuuux),spondaic (---x),heroic (-uu-x),first paeon (-uux),choriamb trochee (-uu--x),short sequence (uuuuux),total_artistic,misc_clausulae,total_unartistic,percent_clausulae
0,academica,228,8,0,5,2,32.0,5.0,11.0,7.0,...,1.0,20.0,7.0,7.0,4.0,1.0,146.0,74.0,113.0,1.0
1,arati phaenomena,250,19,0,19,0,6.0,0.0,3.0,0.0,...,0.0,9.0,185.0,8.0,2.0,0.0,21.0,210.0,414.0,1.0
2,arati prognotica,9,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,9.0,18.0,1.0
3,brutus,1655,146,74,12,53,210.0,28.0,67.0,18.0,...,1.0,95.0,13.0,31.0,4.0,3.0,1126.0,383.0,529.0,1.0
4,carmina fragmenta,125,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,8.0,54.0,3.0,3.0,0.0,56.0,69.0,137.0,1.0


In [3]:
speeches = [
       'de domo sua', 'de haruspicum responso', 'de lege agraria 1',
       'de lege agraria 2', 'de lege agraria 3',
       'de provinciis consularibus', 'in caecilium', 'in catilinam 1',
       'in catilinam 2', 'in catilinam 3', 'in catilinam 4', 'in pisonem',
       'in vatinium', 'in verrem 1', 'in verrem 2 1', 'in verrem 2 2',
       'in verrem 2 3', 'in verrem 2 4', 'in verrem 2 5', 'philippicae 1',
       'philippicae 10', 'philippicae 11', 'philippicae 12',
       'philippicae 13', 'philippicae 14', 'philippicae 2',
       'philippicae 3', 'philippicae 4', 'philippicae 5', 'philippicae 6',
       'philippicae 7', 'philippicae 8', 'philippicae 9',
       'post reditum ad populum', 'post reditum in senatu', 'pro archia',
       'pro balbo', 'pro caecina', 'pro caelio', 'pro cluentio',
       'pro flacco', 'pro fonteio', 'pro lege manilia', 'pro ligario',
       'pro marcello', 'pro milone', 'pro murena', 'pro plancio',
       'pro quinctio', 'pro rabirio perduellionis reo',
       'pro rabirio postumo', 'pro rege deiotaro', 'pro roscio amerino',
       'pro roscio comoedo', 'pro scauro', 'pro sestio', 'pro sulla',
       'pro tullio'
]
cicero_speeches = cicero_df.loc[cicero_df['title'].isin(speeches)]
print('Number of speeches: ', len(cicero_speeches))

Number of speeches:  58


In [4]:
cicero_speeches['title']

8                        de domo sua
11            de haruspicum responso
15                 de lege agraria 1
16                 de lege agraria 2
17                 de lege agraria 3
24        de provinciis consularibus
54                      in caecilium
56                    in catilinam 1
57                    in catilinam 2
58                    in catilinam 3
59                    in catilinam 4
60                        in pisonem
62                       in vatinium
64                       in verrem 1
65                     in verrem 2 1
66                     in verrem 2 2
67                     in verrem 2 3
68                     in verrem 2 4
69                     in verrem 2 5
78                     philippicae 1
79                    philippicae 10
80                    philippicae 11
81                    philippicae 12
82                    philippicae 13
83                    philippicae 14
84                     philippicae 2
85                     philippicae 3
8

In [5]:
contingency_table = cicero_speeches[['title', 'total_artistic', 'total_unartistic']]

In [6]:
total_artistic = contingency_table['total_artistic'].sum()
total_unartistic = contingency_table['total_unartistic'].sum()

In [7]:
def chi_square_statistic(artistic, unartistic, p_value=False):
    obs = np.array([
        [artistic, unartistic],
        [total_artistic-artistic, total_unartistic-unartistic]
    ])
    chi2, p, _, _ = stats.chi2_contingency(obs, correction=False)
    
    if p_value:
        return round(p, 5)
    
    return chi2

In [8]:
contingency_table['chi2'] = contingency_table.apply(
    lambda row: chi_square_statistic(row['total_artistic'], row['total_unartistic']),
    axis=1
)
contingency_table['p'] = contingency_table.apply(
    lambda row: chi_square_statistic(row['total_artistic'], row['total_unartistic'], p_value=True),
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [9]:
print('Total Artistic: ', total_artistic)
print('Total Unartistic: ', total_unartistic)

Total Artistic:  16261.0
Total Unartistic:  12398.0


In [10]:
contingency_table

Unnamed: 0,title,total_artistic,total_unartistic,chi2,p
8,de domo sua,502.0,312.0,8.299675,0.00397
11,de haruspicum responso,262.0,190.0,0.280766,0.5962
15,de lege agraria 1,98.0,59.0,2.075515,0.14968
16,de lege agraria 2,337.0,303.0,4.446756,0.03497
17,de lege agraria 3,55.0,40.0,0.051816,0.81993
24,de provinciis consularibus,180.0,96.0,8.160206,0.00428
54,in caecilium,204.0,149.0,0.160768,0.68845
56,in catilinam 1,119.0,91.0,0.000458,0.98292
57,in catilinam 2,131.0,59.0,11.612846,0.00065
58,in catilinam 3,84.0,79.0,1.80997,0.17851
