In [1]:
# Reload all modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

In [2]:
import os
import datetime
import io
import pickle
import re
import json
import gzip
import cProfile
import lxml.etree
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
# import ray
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, box
import geopandas as geopd
import pandas as pd
import querier as qr
from dotenv import load_dotenv
load_dotenv()

import ses_ling.data.access as data_access
import ses_ling.utils.paths as path_utils
import ses_ling.utils.geometry as geo_utils
import ses_ling.utils.spatial_agg as spatial_agg
import ses_ling.utils.text_process as text_process
from ses_ling.language import Region, Language

In [3]:
paths = path_utils.ProjectPaths()
all_cntr_shapes = geopd.read_file(paths.countries_shapefile)

In [4]:
with open(paths.ext_data / 'countries.json') as f:
    countries_dict = json.load(f)
cc = 'GB'
cc_dict = countries_dict[cc]
year_from = 2015
year_to = 2021
assign_kwargs = dict(
    nighttime_acty_th = 0.5,
    all_acty_th = 0.1,
    count_th = 3,
)

In [6]:
_cc_init_params = {cc: {'cell_size': 'MSOA_BGC', 'ses_idx': "IMD"}}
lang = Language(
    'en', 'English', _cc_init_params, all_cntr_shapes, countries_dict,
    year_from=year_from, year_to=year_to,
    latlon_proj=cc_dict['xy_proj'], # TOCHANGE when several countries
    **assign_kwargs
)

In [None]:
metric_col = 'eduscore'
lang.read_metric(metric_col)
lang.cells_geodf.head()

Unnamed: 0_level_0,objectid,msoa11nm,msoa11nmw,bng_e,bng_n,long,lat,shape__are,shape__len,geometry,wavg,wvar,avg,var,min,max,weight,nr_units,wstd
msoa11cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E02000001,1,City of London 001,City of London 001,532378,181354,-0.09357,51.5156,2906361.0,8936.818478,"POLYGON ((-0.09676 51.52325, -0.09644 51.52282...",5.456302,10.202581,5.3635,72.967058,0.024,22.26,6687.0,6.0,3.194148
E02000002,2,Barking and Dagenham 001,Barking and Dagenham 001,548267,189693,0.138759,51.58659,2166163.0,8150.405928,"POLYGON ((0.14811 51.59678, 0.14809 51.59640, ...",31.115365,19.246909,30.48075,102.333271,17.349,38.753,7379.0,4.0,4.38713
E02000003,3,Barking and Dagenham 002,Barking and Dagenham 002,548259,188522,0.13815,51.57607,2143568.0,9118.196243,"POLYGON ((0.15065 51.58306, 0.14841 51.58075, ...",17.860425,2.308306,17.611,17.342949,12.848,23.035,10720.0,6.0,1.519311
E02000004,4,Barking and Dagenham 003,Barking and Dagenham 003,551004,186418,0.17683,51.55644,2491467.0,8206.551627,"POLYGON ((0.18511 51.56480, 0.18403 51.56391, ...",22.919929,9.01808,23.15025,51.392207,18.702,33.858,6536.0,4.0,3.003012
E02000005,5,Barking and Dagenham 004,Barking and Dagenham 004,548733,186827,0.144269,51.56071,1186053.0,6949.688798,"POLYGON ((0.14990 51.56807, 0.15078 51.56778, ...",27.022661,8.729057,27.5686,51.124119,19.124,37.624,9243.0,5.0,2.954498


# User mistakes

## Quick stats and checks

In [None]:
lang.user_corpora.head()

Unnamed: 0_level_0,nr_tweets,nr_words,nr_unique_words
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00030426231cf2773a81f9897ecbeb951cd2ee7bf2d0118694cf5292edee1720902aed6e5a30783b25d75b7171b376cb81750aef025074226c6d0211a6fb6a9f,64.0,2507,731
0007831e4682dde226017fbf5d97c58c537a05fd1b55bf0ca4382c3866bdecc18591c37aba8f0b40ccf4899473deb37da34e85ac3cc41e2875fa6660d0fbb955,124.0,1683,715
000faaaa6e11d566321800c3187781f6651521366dc43312f992b2a145a9e86447232fbb1e3c4f0e76e5ddc3f3f9000715ce6ae3f3c5e2a221c676faa28a85ec,37.0,445,258
001b4e7bcb4f4980e67d738ef38fe5b63f759e8cdb9938c4ac06354bb37d5f1dfe0e31e522a026e6ba70c2e4a5806b6d4cba9292d472cc6d792c68d9bf72a43c,100.0,1001,453
001e74edd2ca43f22a6a936eb5315f06bc62a3fcaa1ebd40a06fe05edc6f6687081134f00ba779a51c05e8379cd51c8e802d7bbd53cbaf3b864f9a16a50dc5ae,43.0,400,233


In [None]:
lang.user_corpora.shape

(358334, 3)

In [346]:
user_mistakes = lang.user_mistakes
user_mistakes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,freq_per_tweet,freq_per_word,nr_tweets,nr_words,nr_unique_words
user_id,cat_id,rule_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,CASING,CAPITALIZATION,2,0.023256,0.003106,86.0,644,123
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,CASING,COMMA_PARENTHESIS_WHITESPACE,1,0.011628,0.001553,86.0,644,123
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,CASING,EN_DIACRITICS_REPLACE,3,0.034884,0.004658,86.0,644,123
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,CASING,WHITESPACE_RULE,37,0.430233,0.057453,86.0,644,123
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,GRAMMAR,DEPEND_ON,1,0.011628,0.001553,86.0,644,123


In [356]:
cat_idx[pd.isnull(cat_idx)]

array([], dtype=object)

In [361]:
a = pd.concat([r.user_mistakes for r in lang.regions]).join(lang.lt_rules[[]])

In [363]:
a.loc[('WHITESPACE_RULE', slice(None), slice(None), ), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
rule_id,user_id,cat_id,Unnamed: 3_level_1
WHITESPACE_RULE,00030426231cf2773a81f9897ecbeb951cd2ee7bf2d0118694cf5292edee1720902aed6e5a30783b25d75b7171b376cb81750aef025074226c6d0211a6fb6a9f,,7
WHITESPACE_RULE,0007831e4682dde226017fbf5d97c58c537a05fd1b55bf0ca4382c3866bdecc18591c37aba8f0b40ccf4899473deb37da34e85ac3cc41e2875fa6660d0fbb955,,1
WHITESPACE_RULE,000faaaa6e11d566321800c3187781f6651521366dc43312f992b2a145a9e86447232fbb1e3c4f0e76e5ddc3f3f9000715ce6ae3f3c5e2a221c676faa28a85ec,,1
WHITESPACE_RULE,001b4e7bcb4f4980e67d738ef38fe5b63f759e8cdb9938c4ac06354bb37d5f1dfe0e31e522a026e6ba70c2e4a5806b6d4cba9292d472cc6d792c68d9bf72a43c,,21
WHITESPACE_RULE,001e74edd2ca43f22a6a936eb5315f06bc62a3fcaa1ebd40a06fe05edc6f6687081134f00ba779a51c05e8379cd51c8e802d7bbd53cbaf3b864f9a16a50dc5ae,,5
WHITESPACE_RULE,...,...,...
WHITESPACE_RULE,ffe6199fe44c2965329624f9f5cbdfe2cc610ac40ae726e94f38a294076117f96e482a5a706e594e4e1fa0a7fb89754e49475d8594407f7b5f5a2941f052b237,,261
WHITESPACE_RULE,ffe722f61fb277a9d093e4d3dbb4a693a8ae00fecd53527731d7d2394014876ba1e502071758a84444d85ced53aefd469188d6059d06eaac886975cacba11eb9,,4
WHITESPACE_RULE,ffe7d66852963f810b6bc8c4734daaaecf104f44b7356fdeac46725cc69f384a4cb1ab06c3e42e32969f2cf934ba7889a8a0afde81c5fce198ce85aeeb87b3c0,,1
WHITESPACE_RULE,fffba2d3bc65269d5ab57798e3f0b40aa069dd031ad50204af41d5d80ae36110906e7269ba60d19b2a23227bd381e394688a071c8ed7c4388f33e7af1100a2cf,,117


In [None]:
user_mistakes.index?

In [None]:
user_mistakes.reset_index('cat_id').fillna({'cat_id': })

In [367]:
x = a.index.get_level_values('cat_id').values
x[pd.isnull(x)] = a.index.get_level_values('rule_id')[pd.isnull(x)]
a.index.set_levels(x, level='cat_id')

In [None]:
a.index.set_levels(x, level='cat_id')

In [353]:
cat_idx = user_mistakes.index.get_level_values('cat_id').values
cat_idx[pd.isnull(cat_idx)] = user_mistakes.index.get_level_values('rule_id')[cat_idx.isnull()]
user_mistakes.index.set_levels(cat_idx, level='cat_id')

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [162]:
# OK fine, only java rules, which for the most part don't have a category
user_mistakes.loc[user_mistakes['name'].isnull()].index.get_level_values(0).unique()

Index(['COMMA_PARENTHESIS_WHITESPACE', 'DOUBLE_PUNCTUATION',
       'ENGLISH_WORD_REPEAT_BEGINNING_RULE', 'ENGLISH_WORD_REPEAT_RULE',
       'ENGLISH_WRONG_WORD_IN_CONTEXT', 'EN_A_VS_AN', 'EN_COMPOUNDS',
       'EN_CONTRACTION_SPELLING', 'EN_DIACRITICS_REPLACE', 'EN_SIMPLE_REPLACE',
       'EN_SPECIFIC_CASE', 'EN_UNPAIRED_BRACKETS', 'EN_WORD_COHERENCY',
       'SENTENCE_WHITESPACE', 'UPPERCASE_SENTENCE_START', 'WHITESPACE_RULE'],
      dtype='object', name='rule_id')

In [179]:
user_mistakes_cat = user_mistakes['mistake_freq_per_word'].groupby(['user_id', 'cat_id']).sum().rename('cat_freq_per_word').to_frame()
user_mistakes_cat['cat_freq_per_tweet'] = user_mistakes['mistake_freq_per_tweet'].groupby(['user_id', 'cat_id']).sum()

In [181]:
user_mistakes_cat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cat_freq_per_word,cat_freq_per_tweet
user_id,cat_id,Unnamed: 2_level_1,Unnamed: 3_level_1
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,CASING,0.06677,0.5
000003962867f2d312d942838cdb2535b950589f2e85f318e7565dcb45b7418fd3c9530e875ab83a28425f0748e1e0fc92959642cc470b8e0b1cbd4ddef1a8cb,GRAMMAR,0.003106,0.023256
00001a1b91726d1cd5d01471c6166c9fb93fdba2872ca96bbd75b4be81b9622a20913e24631ef03f358d0b015916fdc2643b1b3914b1e505a471b8e59d6ad74c,CASING,0.053628,0.573034
00001a1b91726d1cd5d01471c6166c9fb93fdba2872ca96bbd75b4be81b9622a20913e24631ef03f358d0b015916fdc2643b1b3914b1e505a471b8e59d6ad74c,CONFUSED_WORDS,0.002103,0.022472
00001a1b91726d1cd5d01471c6166c9fb93fdba2872ca96bbd75b4be81b9622a20913e24631ef03f358d0b015916fdc2643b1b3914b1e505a471b8e59d6ad74c,GRAMMAR,0.003155,0.033708


In [344]:
# Most common mistakes
# full_user_mistakes_cat = user_mistakes_cat.unstack(level=1).stack(dropna=False).fillna(0)
# full_user_mistakes_cat.groupby('cat_id').mean().sort_values(by='cat_freq_per_word', ascending=False)
(user_mistakes_cat.groupby('cat_id').sum() / user_mistakes_cat.index.levels[0].size).sort_values(by='cat_freq_per_word', ascending=False)

Unnamed: 0_level_0,cat_freq_per_word,cat_freq_per_tweet
cat_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CASING,0.034783,0.451809
PUNCTUATION,0.004569,0.062031
GRAMMAR,0.003474,0.044675
TYPOS,0.003009,0.037747
STYLE,0.001098,0.013196
CONFUSED_WORDS,0.000613,0.007687
TYPOGRAPHY,0.00044,0.005874
REDUNDANCY,0.000207,0.002938
COMPOUNDING,0.000194,0.002622
COLLOCATIONS,9.5e-05,0.001262


## Residence cell aggregation

In [202]:
cat_per_cell = user_mistakes_cat.loc[(slice(None), 'COLLOCATIONS'), ['cat_freq_per_word']].join(lang.user_residence_cell['cell_id']).groupby('cell_id')['cat_freq_per_word'].mean()
cat_per_cell.head()

cell_id
E02000001     0.00088
E02000002     0.00022
E02000003    0.001507
E02000004    0.001538
E02000005    0.000886
Name: cat_freq_per_word, dtype: Float64

In [207]:
lang.user_residence_cell.groupby('cell_id').count()

Unnamed: 0_level_0,count,prop_user,prop_user_by_time
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
E02000001,1558,1558,1558
E02000002,5,5,5
E02000003,18,18,18
E02000004,16,16,16
E02000005,12,12,12
...,...,...,...
W02000419,24,24,24
W02000420,359,359,359
W02000421,141,141,141
W02000422,110,110,110


In [208]:
lang.lt_categories

Unnamed: 0_level_0,name,type,default
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CASING,Upper/Lowercase,,
TYPOS,Possible Typo,misspelling,
COMPOUNDING,Compounding,,
GRAMMAR,Grammar,grammar,
COLLOCATIONS,Collocations,grammar,
PUNCTUATION,Punctuation,typographical,
CONFUSED_WORDS,Commonly Confused Words,misspelling,
NONSTANDARD_PHRASES,Nonstandard Phrases,misspelling,
REDUNDANCY,Redundant Phrases,style,
STYLE,Style,style,


In [None]:
lang.cells_geodf = lang.cells_geodf.join(l)

investigate influence of neihbouring cells, communities on speech. is a poor area surrounded by rich ones going t be speaking "more standard"? correlation Gi (not star!) with cell's own z-score

count all mistakes, diversity of mistakes, aggregate from clustering 
vector of mistake freq, PCA? into hierarchical, compare with socio economical

rank correlation for when imd is just given in terms of rank?

In [None]:
m = lang.cells_geodf.join(np.log10(cat_per_cell), how='inner').explore('cat_freq_per_word')

In [13]:
mistake_per_cell = user_mistakes.loc[(slice(None), 'GONNA'), ['rel_freq']].join(user_residence['cell_id']).groupby('cell_id')['rel_freq'].mean()
mistake_per_cell.head()

cell_id
E02000001    0.001659
E02000002    0.001827
E02000003    0.001235
E02000004    0.004452
E02000005    0.002591
Name: rel_freq, dtype: Float64

In [44]:
lang.cells_geodf.explore()

In [26]:
lang.cells_geodf.join(mistake_per_cell, how='inner')['rel_freq']

cell_id
E02000001    0.001659
E02000002    0.001827
E02000003    0.001235
E02000004    0.004452
E02000005    0.002591
               ...   
W02000419    0.007166
W02000420    0.001559
W02000421    0.001858
W02000422    0.001654
W02000423    0.002619
Name: rel_freq, Length: 7028, dtype: Float64

correlation betwen rules freqs to recover categories, see if they match LT's

In [None]:
lang.cells_geodf.join(np.log10(mistake_per_cell), how='inner').explore('rel_freq')