In [26]:
from csvcubed.inspect.inspectortable import Inspector

# Metadata JSON file that describes the Eurovision data set to inspect.
metadata_json_path = "~/code/inspector/eurovision/sweden-at-eurovision-complete-dataset.csv-metadata.json"

# Build the inspector from that file and display the tables it reports.
eurovision = Inspector(metadata_json_path)
eurovision.tables

[DataCubeTable(csv_url='sweden-at-eurovision-complete-dataset.csv', title='Sweden at Eurovision Complete Dataset', shape=<CubeShape.Standard: 1>, data_set_uri='sweden-at-eurovision-complete-dataset.csv#dataset'),
 CodeListTable(csv_url='entrant.csv', title='Entrant', concept_scheme_uri='entrant.csv#code-list'),
 CodeListTable(csv_url='language.csv', title='Language', concept_scheme_uri='language.csv#code-list'),
 CodeListTable(csv_url='song.csv', title='Song', concept_scheme_uri='song.csv#code-list'),
 CodeListTable(csv_url='year.csv', title='Year', concept_scheme_uri='year.csv#code-list')]

In [27]:
# TODO: the Inspector should expose the primary table directly, so that
# callers do not have to index into the list of tables.
inspector_tables = eurovision.tables
primary_table = inspector_tables[0]
primary_table

DataCubeTable(csv_url='sweden-at-eurovision-complete-dataset.csv', title='Sweden at Eurovision Complete Dataset', shape=<CubeShape.Standard: 1>, data_set_uri='sweden-at-eurovision-complete-dataset.csv#dataset')

In [28]:
# Columns of the primary table; the output below shows them keyed by column title.
primary_table_columns = primary_table.columns
# TODO: it may be useful to access the component/column types directly here,
# e.g. primary_table_columns.dimensions, primary_table_columns.measures, etc.
primary_table_columns

OrderedDict([('Year',
              DimensionColumn(dimension=LocalDimension(dimension_uri='sweden-at-eurovision-complete-dataset.csv#dimension/year', label='Year'))),
             ('Entrant',
              DimensionColumn(dimension=LocalDimension(dimension_uri='sweden-at-eurovision-complete-dataset.csv#dimension/entrant', label='Entrant'))),
             ('Song',
              DimensionColumn(dimension=LocalDimension(dimension_uri='sweden-at-eurovision-complete-dataset.csv#dimension/song', label='Song'))),
             ('Language',
              DimensionColumn(dimension=LocalDimension(dimension_uri='sweden-at-eurovision-complete-dataset.csv#dimension/language', label='Language'))),
             ('Value',
              StandardShapeObservationsColumn(unit=UnitsColumn(), measures_column=MeasuresColumn())),
             ('Measure', MeasuresColumn()),
             ('Unit', UnitsColumn()),
             ('Marker',
              AttributeColumn(attribute=LocalAttribute(attribute_uri='sweden

In [29]:
# Look up a single column definition by its column title.
primary_table_columns["Language"]

DimensionColumn(dimension=LocalDimension(dimension_uri='sweden-at-eurovision-complete-dataset.csv#dimension/language', label='Language'))

In [30]:
# TODO: UnitsColumn and MeasuresColumn need to be fully implemented; in the
# output below both are shown without any fields.
primary_table_columns["Value"]

StandardShapeObservationsColumn(unit=UnitsColumn(), measures_column=MeasuresColumn())

In [31]:
from csvcubed.inspect.inspectorcolumns import DimensionColumn
# NOTE: if the columns mapping offered direct access by column type (for
# example a `.dimensions` accessor), the DimensionColumn import used by this
# cell would not be needed.
# Collect the titles of every column that is a DimensionColumn, in mapping order.
dimension_col_titles = [
    col_title
    for col_title in primary_table_columns
    if isinstance(primary_table_columns[col_title], DimensionColumn)
]

dimension_col_titles

['Year', 'Entrant', 'Song', 'Language']

In [32]:
from csvcubed.inspect.inspectorcolumns import MeasuresColumn
# NOTE: the MeasuresColumn import used by this cell would be unnecessary if
# the columns mapping exposed its column types directly.
# Collect the titles of every column that is a MeasuresColumn, in mapping order.
measure_col_titles = list(
    col_title
    for (col_title, col_definition) in primary_table_columns.items()
    if isinstance(col_definition, MeasuresColumn)
)

measure_col_titles

['Measure']

In [33]:
from csvcubed.inspect.inspectorcolumns import ObservationsColumn
# NOTE: the ObservationsColumn import used by this cell would be unnecessary
# if the columns mapping exposed its column types directly.
# Keep only the titles whose column definition is an ObservationsColumn.
value_col_titles = list(
    filter(
        lambda col_title: isinstance(
            primary_table_columns[col_title], ObservationsColumn
        ),
        primary_table_columns,
    )
)
value_col_titles

['Value']

In [34]:
# CSV URL recorded for the primary table; a later cell passes it to
# get_dataframe to load the data.
csv_url = primary_table.csv_url
csv_url

'sweden-at-eurovision-complete-dataset.csv'

In [35]:
import warnings

data_cube_repository = primary_table.data_cube_repository

# TODO: add a dataframe accessor to the Inspector itself (ideally one that
# does not require the caller to handle the returned errors).
df, errors = data_cube_repository.get_dataframe(
    csv_url, include_suppressed_cols=False, dereference_uris=True
)

# Surface anything reported while loading instead of silently discarding it.
# Assumes `errors` is empty/falsy when loading succeeded - TODO confirm.
if errors:
    warnings.warn(f"get_dataframe reported problems for {csv_url}: {errors}")

df.head()

Unnamed: 0,Year,Entrant,Song,Language,Value,Measure,Unit,Marker
0,1958,Alice Babs,Lilla stjärna,Swedish,4.0,Final Rank,Unitless,
1,1958,Alice Babs,Lilla stjärna,Swedish,10.0,Final Points,Unitless,
2,1958,Alice Babs,Lilla stjärna,Swedish,1.0,People on Stage,Number,
3,1959,Brita Borg,Augustin,Swedish,9.0,Final Rank,Unitless,
4,1959,Brita Borg,Augustin,Swedish,4.0,Final Points,Unitless,


In [36]:
# TODO: add a pivoting function to the Inspector.
# Spread the measure column into one column per measure value, keyed by the
# dimension columns, using the first measure and first observation column.
measure_wide = df.pivot(
    index=dimension_col_titles,
    columns=measure_col_titles[0],
    values=value_col_titles[0],
)
# Drop rows with any missing value, then turn the index back into columns.
pivoted_df = measure_wide.dropna().reset_index()
pivoted_df.head()

Measure,Year,Entrant,Song,Language,Final Points,Final Rank,People on Stage
0,1958,Alice Babs,Lilla stjärna,Swedish,10.0,4.0,1.0
1,1959,Brita Borg,Augustin,Swedish,4.0,9.0,1.0
2,1960,Ted Gärdestad,Alla andra får varann,Swedish,4.0,10.0,1.0
3,1961,Lill Lindfors and Svante Thuresson,"April,april",Swedish,2.0,14.0,1.0
4,1962,Inger Berggren,Sol och vår,Swedish,4.0,7.0,1.0


In [37]:
# Remove the dimension columns so that only the measure columns remain.
# Rebinding instead of `inplace=True`, together with errors="ignore", keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the dimension columns have already been dropped.
pivoted_df = pivoted_df.drop(columns=dimension_col_titles, errors="ignore")
pivoted_df.head()

Measure,Final Points,Final Rank,People on Stage
0,10.0,4.0,1.0
1,4.0,9.0,1.0
2,4.0,10.0,1.0
3,2.0,14.0,1.0
4,4.0,7.0,1.0


In [38]:
from factor_analyzer import FactorAnalyzer, calculate_bartlett_sphericity, calculate_kmo

# Run Bartlett's test of sphericity on the measure columns and keep both
# returned values for the summary printed in a later cell.
bartlett_result = calculate_bartlett_sphericity(pivoted_df)
chi_sq, p = bartlett_result
(chi_sq, p)


(33.351361349789244, 2.715324603643396e-07)

In [39]:
# Compute the Kaiser-Meyer-Olkin scores (per item and overall), then report
# them alongside the Bartlett results obtained earlier.
kmo_all, kmo_model = calculate_kmo(pivoted_df)

bartlett_summary = f"Bartlett Test of Sphericity results: Chi-squared: {chi_sq:.3f}, p-value: {p:.7f}"
print(bartlett_summary)

kmo_summary = f"Kaiser-Meyer-Olkin (KMO) results: KMO score per item: {kmo_all}, Overall KMO score: {kmo_model:.3f}"
print(kmo_summary)

Bartlett Test of Sphericity results: Chi-squared: 33.351, p-value: 0.0000003
Kaiser-Meyer-Olkin (KMO) results: KMO score per item: [0.49832955 0.4968913  0.49584678], Overall KMO score: 0.497


In [44]:
import plotly.express as px

# Pairwise correlation between the remaining (measure) columns.
c = pivoted_df.corr()
c
# Optional heatmap of the correlation matrix, currently disabled:
# fig = px.imshow(c, labels=dict(color="Correlation"), width=500, height=400)
# fig.show()

Measure,Final Points,Final Rank,People on Stage
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Final Points,1.0,-0.530855,0.463259
Final Rank,-0.530855,1.0,-0.137141
People on Stage,0.463259,-0.137141,1.0


In [41]:
# Fit an unrotated three-factor model to the measure columns.
fa = FactorAnalyzer(rotation=None, n_factors=3)
fa.fit(pivoted_df)

# get_eigenvalues() returns two values; only the first is displayed here.
eigenvalue_pair = fa.get_eigenvalues()
ev, v = eigenvalue_pair
ev

array([1.77598209, 0.86422349, 0.35979442])

In [42]:
# Plot the eigenvalues against their 1-based position (one point per column
# of pivoted_df). A title and axis labels are set so the figure can be read
# on its own.
fig = px.line(
    x=range(1, pivoted_df.shape[1] + 1),
    y=ev,
    title="Scree plot",
    labels={"x": "Factor number", "y": "Eigenvalue"},
    width=500,
    height=400,
)
fig.show()

In [43]:
# Loadings from the fitted factor analysis.
loads = fa.loadings_

# Pair each column name of pivoted_df with its row of loadings and order the
# pairs by the absolute value of the first entry of that row, largest first.
# `sorted` already returns a list, so the former bare `list(zipped)` statement
# (whose result was discarded and never displayed) has been removed.
zipped = sorted(
    zip(pivoted_df.columns, loads),
    key=lambda name_and_row: abs(name_and_row[1][0]),
    reverse=True,
)
loads

array([[ 0.89912209,  0.02503715,  0.        ],
       [-0.60131768,  0.39154271,  0.        ],
       [ 0.5034576 ,  0.42293552,  0.        ]])