# GENERATE NEW EVENTS DB FILE

In [1]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
os.listdir('data')

['2022-03-13_eventsDB.csv',
 '2022-03-16_eventsDB.csv',
 '2022-03-17_eventsDB.csv',
 'adds1548_1554.csv',
 'backup_eventsDB_.js',
 'bkups',
 'celebrity_deaths_4.csv',
 'df_main.csv',
 'economists_lifetimes.csv',
 'eventsDB.js',
 'eventsDB.test',
 'eventsDB_.js',
 'eventsWIP.xlsx',
 'events_db.csv',
 'events_master.csv',
 'list_of_presidents_of_france-836j.csv',
 'list_of_prime_ministers_of_uk-839j.csv',
 'pantheon.tsv',
 'philosophers.csv',
 'prev_events_master.csv',
 'README.md',
 'US_Presidents.json',
 'world_events.js',
 'world_events.json',
 'world_events.txt']

In [3]:
master_file = 'data/events_master.csv'
df = pd.read_csv(master_file, encoding = "ISO-8859-1")
df = df.replace(r'^\s+$', np.nan, regex=True) #get rid of unwanted blank spaces in cells...

In [4]:
df['event'] = df['event'].str.strip()
#df = df.drop(['Alternatives'], axis=1)

In [5]:
print(df.shape, df.columns)

(215, 29) Index(['eventID', 'stem', 'event', 'details', 'wikipedia', 'mnemonic',
       'itemDifficulty', 'categories', 'America', 'Europe', 'Asia', 'Africa',
       'China', 'Discovery', 'Britain', 'France', 'Greek', 'India',
       'Invention', 'MiddleEast', 'prehistory', 'Religion', 'Roman', 'Royalty',
       'Russia', 'Science', 'Wars', 'World', 'YearNum'],
      dtype='object')


In [6]:
df.sample(3)

Unnamed: 0,eventID,stem,event,details,wikipedia,mnemonic,itemDifficulty,categories,America,Europe,...,MiddleEast,prehistory,Religion,Roman,Royalty,Russia,Science,Wars,World,YearNum
60,60,622,Flight of Mohammed from Mecca to Madina.,,,,6,"[Islam, ]",,,...,,,1.0,,,,,,,622
167,167,1959,Invention of the silicon chip is the major tec...,,,,5,,1.0,,...,,,,,,,,,,1959
171,171,1961,Yuri Gagarin of USSR becomes the first spaceman.,,,,5,,,,...,,,,,,1.0,,,,1961


In [7]:
# might need to do this for other columns as well.
categories = ['Asia', 'China', 'Britain', 'Roman']
for c in categories:
    df = df.astype({c: 'float64'})

In [8]:
#verify
df.select_dtypes('object')

Unnamed: 0,stem,event,details,wikipedia,mnemonic,categories
0,c.3500 BC,Invention of the wheel and plough in Mesopotam...,,,,"[prehistory,]"
1,c.3200 BC,Invention of writing in Mesopotamia: the means...,,,,"[prehistory,]"
2,c.3000 BC,Founding of the first cities in Sumeria (prese...,,,,"[prehistory,]"
3,c.3000 BC,Building of the Great Pyramid.,,,,"[prehistory,]"
4,c.1600 BC,Modern alphabet invented: the essential means ...,,,,"[prehistory,]"
...,...,...,...,...,...,...
210,1996,Dolly the sheep becomes the first mammal to be...,Her birth proved that specialised cells could ...,https://en.wikipedia.org/wiki/Dolly_(sheep),,
211,1997,Tony Blair back in power in UK. Mohd. Khatami ...,,,,
212,1998,Indonesian President Suharto resigns. Pakistan...,,,,
213,1999,G-15 Summit ends,,,,


In [12]:
print(df.select_dtypes('object').shape) # should be 6

(215, 6)


In [13]:
col = "America"
df[col].unique()
#cond = (df[col]=="1")
#df[cond]

array([nan,  1.])

In [14]:
df.describe().columns

Index(['eventID', 'itemDifficulty', 'America', 'Europe', 'Asia', 'Africa',
       'China', 'Discovery', 'Britain', 'France', 'Greek', 'India',
       'Invention', 'MiddleEast', 'prehistory', 'Religion', 'Roman', 'Royalty',
       'Russia', 'Science', 'Wars', 'World', 'YearNum'],
      dtype='object')

In [15]:
df.sum()

eventID                                                       23005
stem              c.3500 BCc.3200 BCc.3000 BCc.3000 BCc.1600 BCc...
event             Invention of the wheel and plough in Mesopotam...
itemDifficulty                                                 1229
America                                                          25
Europe                                                           79
Asia                                                             32
Africa                                                            8
China                                                            14
Discovery                                                         6
Britain                                                          29
France                                                            8
Greek                                                            17
India                                                            14
Invention                                       

In [16]:
diffcat  = pd.cut(df['itemDifficulty'], bins=3, labels=['E',"M", 'H'])
df['diffCat'] = diffcat

## Adding 2 Date based Columns
### 1. YearNum
### 2. TimePeriod (pd. cut)

In [17]:
def getYearNum(x):
    mult = 1
    if "BC" in x:
        x = x.replace('BC', '')                
        mult = -1
    x = x.replace('c.','') #handle circa
    x = x.replace('AD','')
        
    try:
        yr = mult * int(x)
    except:
        print('CONVERSION ERROR while attempting...')
        print(x)
        yr = x
        
    return(yr)

In [18]:
df['YearNum'] = df['stem'].apply(getYearNum)

In [19]:
bins = [-15000, 0, 500, 1000, 1500, 1700, 1800, 1900, 2100]
labels = ['BCE', '0AD-500AD', '500AD-1000AD', '1000AD-1500AD', '1500AD-1700AD', '1700s', '1800s', '1900-Present']
df['timePeriod'] = pd.cut(df['YearNum'], bins = bins, labels = labels)


# Add 3 new columns for Alternatives ("Wrong Answers")
- altsEasy
- altsMed
- altsHard

In [20]:
def get_valid_alts(baseyr, perturbs):
    """
    returns a list of valid distractors
    
    baseyr is the correct year. 
    @perturbs is a list of numbers to be added or subtracted
    """    
    options = [baseyr + p for p in perturbs] + [baseyr + (-1*p) for p in perturbs]
    return [dte for dte in options if (dte>0 and dte<2000)]


def get_valid_BC_alts(baseyr, perturbs):
    """
    returns a list of valid distractors
    
    baseyr is the correct year. 
    @perturbs is a list of numbers to be added or subtracted
    """    
    perturbs = [1,2,5]
    options = [baseyr + p for p in perturbs] + [baseyr + (-1*p) for p in perturbs]
    return [dte for dte in options if (dte>0 and dte<2000)]

    
    
diff_perturbs = [100,50, 20,10, 5, 2, 1]
med_perturbs = [100, 20,10, 5]
easy_perturbs = [200, 100,50,20]

perturbationDict = {'Hard': diff_perturbs, 'Medium': med_perturbs, 'Easy': easy_perturbs}

In [21]:
def get_alternatives(_str, difflevel='Hard'):
    
    
    nums = [int(s) for s in _str.split() if s.isdigit()]
    numstr = [s for s in _str.split() if s.isdigit()]
    
    options = None
    perturbs = perturbationDict[difflevel]

    if "BC" in _str:
        if "c." in _str: #circa BC, so give only approx alternatives.
            _str = _str.replace('c.','')
            _str = _str.replace('BC', '')
            nums = [int(s) for s in _str.split() if s.isdigit()]
            numstr = [s for s in _str.split() if s.isdigit()]
            if len(nums)==1:
                nstr = numstr[0]
                if nstr[-3:] == '000':
                    part = int(nstr[:-3])
                    options = get_valid_BC_alts(part, perturbs)
                    return [f' c. {x}000 BC' for x in options if x>0]

                if nstr[-2:] == '00':
                    part = int(nstr[:-2])
                    options = get_valid_BC_alts(part, perturbs)
                    return [f' c. {x}00 BC' for x in options if x >0]                
                
                if nstr[-1] == '0':
                    part = int(nstr[:-1])
                    options = get_valid_BC_alts(part, perturbs)
                    return [f' c. {x}0 BC' for x in options if x >0]                            
                else:
                    numpart = int(nstr)
                    options = get_valid_BC_alts(numpart, perturbs)
                    return [f' c. {x} BC' for x in options if x >0]                            
                    
            
        #case where there is BC but not circa
        #Exact BC year is presumed known
        elif len(nums)==1:
            part = int(nums[0])
            options = get_valid_alts(part, perturbs)
            return [f'{x} BC' for x in options if x>0]        
        
        else: #something not right
            print(f'something not right for {_str}')
            return _str
    
    
    elif 'c.AD' in _str:
        if len(nums)==1:
            a = nums[0]
            options = get_valid_alts(a, perturbs)
            return [f'c. {x} AD' for x in options]
        else:
            print(f'Error in {_str} {nums}')
        
    else: #AD single number event
        nums = [int(s) for s in _str.split() if s.isdigit()]
        if len(nums)==1:
            numpart = nums[0]
            options = get_valid_alts(numpart, perturbs)            
                
        return options

In [22]:
altuples = [('altsMid', 'Medium'), ('altsHard', 'Hard'), ('altsEasy', 'Easy') ]

for a in altuples:
    col, lvl = a
    print(col, lvl)
    df[col] = df['stem'].apply(get_alternatives, difflevel=lvl)
    print(f'{df[col].isnull().sum()} missing')
    fewest_options = df[col].apply(len).min()
    print(f' fewest options are {fewest_options}')
    
print('df now has 3 new columns with Alternatives')

altsMid Medium
0 missing
 fewest options are 4
altsHard Hard
0 missing
 fewest options are 5
altsEasy Easy
0 missing
 fewest options are 4
df now has 3 new columns with Alternatives


In [23]:
pd.set_option('display.max_colwidth', 120)
df[['stem', col]].sample(5)

Unnamed: 0,stem,altsEasy
13,588 BC,"[788 BC, 688 BC, 638 BC, 608 BC, 388 BC, 488 BC, 538 BC, 568 BC]"
103,1707,"[1907, 1807, 1757, 1727, 1507, 1607, 1657, 1687]"
60,622,"[822, 722, 672, 642, 422, 522, 572, 602]"
7,995 BC,"[1195 BC, 1095 BC, 1045 BC, 1015 BC, 795 BC, 895 BC, 945 BC, 975 BC]"
10,753 BC,"[953 BC, 853 BC, 803 BC, 773 BC, 553 BC, 653 BC, 703 BC, 733 BC]"


In [24]:
# Just for visual inspections
cond = (df[col].apply(len)==fewest_options)
df[cond][col]

188    [1780, 1880, 1930, 1960]
189    [1780, 1880, 1930, 1960]
190    [1782, 1882, 1932, 1962]
191    [1783, 1883, 1933, 1963]
192    [1785, 1885, 1935, 1965]
193    [1785, 1885, 1935, 1965]
194    [1785, 1885, 1935, 1965]
195    [1786, 1886, 1936, 1966]
196    [1787, 1887, 1937, 1967]
197    [1788, 1888, 1938, 1968]
198    [1789, 1889, 1939, 1969]
199    [1789, 1889, 1939, 1969]
200    [1790, 1890, 1940, 1970]
201    [1791, 1891, 1941, 1971]
202    [1793, 1893, 1943, 1973]
203    [1794, 1894, 1944, 1974]
204    [1795, 1895, 1945, 1975]
205    [1796, 1896, 1946, 1976]
206    [1796, 1896, 1946, 1976]
207    [1796, 1896, 1946, 1976]
208    [1796, 1896, 1946, 1976]
209    [1796, 1896, 1946, 1976]
210    [1796, 1896, 1946, 1976]
211    [1797, 1897, 1947, 1977]
212    [1798, 1898, 1948, 1978]
213    [1799, 1899, 1949, 1979]
214    [1799, 1899, 1949, 1979]
Name: altsEasy, dtype: object

-------
# Save the events DB file

In [25]:
from datetime import date

today = date.today()
print(f"Today's date: {today}")

Today's date: 2022-04-04


In [26]:
csvfilename = f'data/{today}_eventsDB.csv'
df.to_csv(csvfilename, index=False);
print(f'wrote file {csvfilename}')

wrote file data/2022-04-04_eventsDB.csv


In [27]:
if 0:
    row = df.sample(10).index
    new = df.loc[row]['Date'].apply(get_alternatives)
    pd.DataFrame(np.column_stack([df.loc[row]['Date'], new]))

In [28]:
#cut and paste this output to the file called: eventsDB.js
df.T.to_json('data/backup_eventsDB_.js')

In [29]:
bigstring = df.T.to_json()

In [33]:
outJSfile = f'data/eventsDB.js'
with open(outJSfile, "w") as text_file:
    text_file.write('events = ' + bigstring)

print(f'{outJSfile} written')

data/eventsDB.js written


In [34]:
added_columns = ['YearNum', 'timePeriod',
       'altsMid', 'altsHard', 'altsEasy', 'stem', 'diffCat']
for c in added_columns:
    print(c in df.columns)

True
True
True
True
True
True
True


In [32]:
df.shape

(215, 34)

----
# Plotting

In [None]:
df['YearNum'].diff().hist()

In [None]:
diffcat.value_counts()

In [None]:
diffcat.hist()

In [None]:
#fig, ax = plt.subplot(figsize=(10,10))
plt.figure(figsize=(12,8))
df['itemDifficulty'].hist()