# AESOP - participant

Účastníci seminářů MFF UK, propagačních akcí, olympiád, soutěží...
Nástup na MFF 2007-2015

In [1]:
import re
import pandas as pd
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 10)
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
import numpy as np
%matplotlib notebook
from sklearn import cross_validation, svm
from sklearn import metrics
from sklearn.metrics import classification_report

from __future__ import division




## Konkrétní data
- region - 0 pokud neznámý, jinak CZxxx - NUTS kód, hierarchický (levá cifra je nejvýznamnější)
- student_until - očekávaný rok maturity
- school-XXX - 1 pokud student studoval na škole typu XXX - G_Y jsou Y-letá gymnázia
- AKCE-ROK-SLOUPEC - Účast na akci AKCE v roce ROK. Sloupce jsou attended [0/1], rank a maxrank. Rank je pořadí z celkového počtu (maxrank).

In [2]:
dataset_diff= pd.read_csv('dataset_diff.csv', low_memory=False, index_col=False)
dataset_absolute= pd.read_csv('dataset_absolute.csv', low_memory=False, index_col=False)
dataset_diff

Unnamed: 0,region,student_until,school-G_8,school-G_6,school-G_5,school-G_4,school-SS_M,school-SS_VL,school-SV,school-ZS,akademia_vapac-0010-attended,akademia_vapac-0010-rank,...,vyfuk_rocnik_UNK-0011-attended,vyfuk_rocnik_UNK-0011-rank,vyfuk_rocnik_UNK-0011-maxrank,vyfuk_rocnik_UNK-0014-attended,vyfuk_rocnik_UNK-0014-rank,vyfuk_rocnik_UNK-0014-maxrank,vyfuk_tabor-0011-attended,vyfuk_tabor-0011-rank,vyfuk_tabor-0011-maxrank,vyfuk_tabor-0015-attended,vyfuk_tabor-0015-rank,vyfuk_tabor-0015-maxrank
0,0,2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
1,CZ063,2016,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
2,0,2015,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
3,CZ052,2014,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
4,0,2014,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12198,CZ020,2009,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
12199,0,2015,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
12200,0,2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0
12201,0,2013,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0


## Příprava dat

Oba datasety jsou zpracovány předzpracovány do několika kategorií featur.

Hlavní kategorie je "core" kde se nachází student_until a school_XXX jako jen kopie z původních dat. Mohl by zde byt i region, ale je na vás abyste vymysleli jeho vhodné zakódování.

Dále je potřeba zpracovat každý sloupec odpovídající akci. Akce je nejprve zařazena do některé z tříd akcí (viz funkce event_2_class). Pokud se jedná o cílovou třídu (nástup/přijetí/absolvování mff), nevznikají nové featury a ukládá se pouze sloupec se samotnou třídou. Jinak ke každé akci vznikají následující kategorie featur:

- event - celé jméno akce
- class - třída akce
- year - rok akce
- class_year - třída + rok akce

Pokud featura s daným jménem již byla vytvořena dříve, tak se sčítají.

Vždy je také vytvořena featura se suffixem '_ranked' kde je jako hodnota použito jak dobře se student v akci umístníl (v rozmezí 0-1, vetší hodnota~lepší pořadí).

Všechny featury jsou také k dispozici v kategorii 'all'.

Tyto featury nemusí být ty nejlepší, takže klidně zkoušejte i jiné způsoby extrakce featur :)

Sloupce featur je možné si prohlédnout např v datasests['diff']['features'] zobrazeným pod následujícím kódem:

In [3]:
def event_2_class(event):
    e = event.split('_')
    e0 = e[0].lower()
    if e0 in ['clo', 'naboj', 'maso']:
        return 'contest',None
    if e0 in ['dod', 'jdf', 'akademia', 'jdi', 'anketa', 'gaudeamus', 'pro']:
        return 'prop',None
    if e0 in ['smf', 'smfm', 'lmfs', 'matfyzfeat']:
        return 'camp',None
    if e0 in ['mam', 'vyfuk', 'fykos', 'pikomat', 'prase', 'ksp', 'pralinka']:
        return e0,None
    if e0 in ['mff']:
        e3 = e[3].lower()
        return e0,e3
    if e0 in ['olymp', 'lo', 'bioolymp', 'zo']:#soc?
        return 'olymp',None
    if e0 in ['mklokan', 'pklokan', 'klokan', 'soc', 'soutez', 'brkos', 'cmsprosos', 'tmf', 'todo', 'pythagoriáda']:
        return 'soutez',None
    print e0
    return 'other',0

def prepare(d):
    d_out = pd.DataFrame()
    feat = dict()
    def store_feature(colname_new, feature_name, col_value, divided, d_out, feat):
        colname_new_ranked = colname_new+'-ranked'
        feature_name_ranked = feature_name+'-ranked'
        if feature_name not in feat:
            feat[feature_name] = []
            feat[feature_name_ranked] = []
        if colname_new in d_out.columns.values:#akce jiz byla pridana do vystupniho datasetu
            d_out[colname_new] += col_value
            d_out[colname_new_ranked] += divided
        else:
            feat[feature_name].append(colname_new)
            feat[feature_name_ranked].append(colname_new_ranked)
            d_out[colname_new] = col_value
            d_out[colname_new_ranked] = divided

    feat['core'] = []
    targets = []
    for colname in d.columns.values:
        colname_split = colname.split('-')
        if len(colname_split) == 1: #region, student_until
            if colname != 'region':
                d_out[colname]=d[colname]
                feat['core'].append(colname)
        elif len(colname_split) == 2: #school-XXX
            d_out[colname]=d[colname]
            feat['core'].append(colname)
        elif len(colname_split) == 3: #AKCE-ROK-TYP
            (event, year, type_) = colname_split
            if type_ != 'attended':#rank a maxrank vyuzijeme naraz
                continue
            (class_, target) = event_2_class(event)
            if target is not None: #cilova trida
                if target in d_out.columns.values:
                    d_out[target]+=d[colname]
                else:
                    targets.append(target)
                    d_out[target]=d[colname]
            else: #akce co mame na vstupu
                colname_rank = '%s-%s-rank' %(event, year)
                colname_maxrank = '%s-%s-maxrank' %(event, year)
                divided = 1 - d[colname_rank]/d[colname_maxrank]
                divided[np.isnan(divided)] = 0

                #sloupce za akce bez let
                store_feature(event, 'event', d[colname], divided, d_out, feat)
                #sloupce za tridy akci bez let
                store_feature(class_, 'class', d[colname], divided, d_out, feat)
                #sloupce za roky bez akci
                store_feature(year, 'year', d[colname], divided, d_out, feat)
                #sloupce za roky a tridy
                class_year = '%s-%s' %(class_, year)
                store_feature(class_year, 'class_year', d[colname], divided, d_out, feat)
    d_out[np.isinf(d_out)] = 0
    feat['all'] = [f for fc in feat for f in feat[fc]]#flatten list of lists
    return (d_out, feat, targets)

(data_absolute, features_absolute, targets_absolute) = prepare(dataset_absolute)
(data_diff, features_diff, targets_diff) = prepare(dataset_diff)

datasets = {
    'absolute': {
        'data': data_absolute, 
        'features': features_absolute,
        'targets': targets_absolute,
    },
    'diff': {
        'data': data_diff, 
        'features': features_diff,
        'targets': targets_diff,
    },
}
datasets['diff']['features']

{'all': ['student_until',
  'school-G_8',
  'school-G_6',
  'school-G_5',
  'school-G_4',
  'school-SS_M',
  'school-SS_VL',
  'school-SV',
  'school-ZS',
  '0010-ranked',
  '0011-ranked',
  '0012-ranked',
  '0013-ranked',
  '0014-ranked',
  '0007-ranked',
  '0008-ranked',
  '0009-ranked',
  '0003-ranked',
  '0005-ranked',
  '0006-ranked',
  '0015-ranked',
  '0016-ranked',
  '0017-ranked',
  '0018-ranked',
  '0019-ranked',
  '0020-ranked',
  '0021-ranked',
  'prop-0010',
  'prop-0011',
  'prop-0012',
  'prop-0013',
  'soutez-0011',
  'soutez-0012',
  'soutez-0013',
  'soutez-0014',
  'contest-0007',
  'contest-0008',
  'contest-0010',
  'contest-0011',
  'contest-0012',
  'contest-0013',
  'contest-0014',
  'soutez-0007',
  'soutez-0010',
  'soutez-0008',
  'prop-0009',
  'fykos-0010',
  'fykos-0011',
  'fykos-0012',
  'fykos-0013',
  'fykos-0003',
  'fykos-0005',
  'fykos-0006',
  'fykos-0007',
  'fykos-0008',
  'fykos-0009',
  'fykos-0014',
  'fykos-0015',
  'fykos-0016',
  'ksp-0011

## Rozdělení na test/train/future

- train - studenti co odmaturovali pred rokem 2015
- test - studenti co odmaturovali v roce 2015
- future - studenti co si myslíme že budou maturovat v roce 2016

In [4]:
def split_(df):
    return df[df['student_until']<2015], df[df['student_until']==2015],df[df['student_until']>2015],

for key, value in datasets.iteritems():
    value['train'], value['test'], value['future'] = split_(value['data'])

In [10]:
for key, value in datasets.iteritems():
    (data, features, targets) = value['data'], value['features'], value['targets']
    for t in targets:
        print "Testing baseline with target "+t+" for dataset " + key
        y = data[t]>0#nekteri lide se prihlasili/byli prijati vicekrat
        fs = features['all']
        X = data[fs]
        
        p_none = y*0
        p_all = p_none+1
        print "none"
        print(classification_report(y, p_none))
        print "all"
        print(classification_report(y, p_all))
        print

Testing baseline with target absolvoval for dataset diff
none
             precision    recall  f1-score   support

      False       0.97      1.00      0.99     11848
       True       0.00      0.00      0.00       355

avg / total       0.94      0.97      0.96     12203

all
             precision    recall  f1-score   support

      False       0.00      0.00      0.00     11848
       True       0.03      1.00      0.06       355

avg / total       0.00      0.03      0.00     12203


Testing baseline with target prihlasen for dataset diff
none
             precision    recall  f1-score   support

      False       0.85      1.00      0.92     10419
       True       0.00      0.00      0.00      1784

avg / total       0.73      0.85      0.79     12203

all
             precision    recall  f1-score   support

      False       0.00      0.00      0.00     10419
       True       0.15      1.00      0.26      1784

avg / total       0.02      0.15      0.04     12203


Testing

  'precision', 'predicted', average, warn_for)


In [11]:
def my_f(corr, pred):
    success = pred ==corr
    tpS = pred*corr
    fpS = pred*(1-corr)
    fnS = (1-pred)*corr
    tnS = (1-pred)*(1-pred)
    tp = tpS.sum()
    fp = fpS.sum()
    fn = fnS.sum()
    tn = tnS.sum()
    print success.sum(), tp, fp, fn, tn
    print "Correct %f %%"%((success.sum()*1.0)/(0.0+success.sum()+fn+fp))
    precision = 0
    recall = 0
    f = 0
    if tp>0:
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f = 2*precision*recall/(precision+recall)
    print precision, recall, f
    return (precision, recall, f)

## Ukázka klasifikátorů:

otázky ke zkoumání:
- predikce jednotlivých kategorií
- jaké featury jsou vhodné?
- možnost predikce lidi co budou maturovat letos, minimálně co se týče přihlášení

In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

#targets = ['prihlasen', 'prijat']#, 'absolvoval']
#targets = ['prihlasen', 'prijat']
targets = ['prihlasen']
classifiers = dict(
    dectree=DecisionTreeClassifier(min_samples_split=20, random_state=99),
    #dectree2=DecisionTreeClassifier(min_samples_split=10, random_state=999),
    svm=svm.SVC(),
    rfc=RandomForestClassifier(max_depth=5, n_estimators=15, max_features=5),
    ada=AdaBoostClassifier(),
    #xtra=ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0),
)
results = dict()
for t in targets:
    results[t]=dict()
    for key, value in datasets.iteritems():
        results[t][key]=dict()
        for cls_name, cls in classifiers.iteritems():
            print "dataset " +key+ " target "+t+" cls " + cls_name
            d_train = value['train']
            d_test = value['train']
            y = d_train[t]>0
            y_test = d_test[t]>0
            #fs = value['features']['all']
            fs = value['features']['core']+value['features']['class']+value['features']['year']+value['features']['class_year']
            X = d_train[fs]
            
            cls.fit(X, y)
            pred = cls.predict(d_test[fs])
            results[t][key][cls] = my_f(y_test, pred)
            print 

dataset diff target prihlasen cls rfc
4858 151 25 1238 5945
Correct 0.793661 %
0.857954545455 0.108711303096 0.192971246006

dataset diff target prihlasen cls ada
4969 404 167 985 5550
Correct 0.811795 %
0.707530647986 0.290856731461 0.412244897959

dataset diff target prihlasen cls svm
4940 283 75 1106 5763
Correct 0.807058 %
0.790502793296 0.203743700504 0.323983972524

dataset diff target prihlasen cls dectree
5175 559 116 830 5446
Correct 0.845450 %
0.828148148148 0.402447804176 0.541666666667

dataset absolute target prihlasen cls rfc
4847 128 13 1261 5980
Correct 0.791864 %
0.90780141844 0.0921526277898 0.167320261438

dataset absolute target prihlasen cls ada
4954 378 156 1011 5587
Correct 0.809345 %
0.707865168539 0.272138228942 0.393135725429

dataset absolute target prihlasen cls svm
4915 257 74 1132 5790
Correct 0.802973 %
0.776435045317 0.185025197984 0.298837209302

dataset absolute target prihlasen cls dectree
5149 557 140 832 5424
Correct 0.841202 %
0.799139167862 0.4010

  unsupported[op_str]))


In [18]:
#datasets['diff']['train']