In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Read data

In [2]:
# Load the training table and relabel its first column so later cells can
# access it as `data_train.galactic_year`.
data_train = pd.read_csv('../data/train.csv')
data_train.columns = ['galactic_year', *data_train.columns[1:]]
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865 entries, 0 to 3864
Data columns (total 80 columns):
galactic_year                                                                              3865 non-null int64
galaxy                                                                                     3865 non-null object
existence expectancy index                                                                 3864 non-null float64
existence expectancy at birth                                                              3864 non-null float64
Gross income per capita                                                                    3837 non-null float64
Income Index                                                                               3837 non-null float64
Expected years of education (galactic years)                                               3732 non-null float64
Mean years of education (galactic years)                                                   3502 non-null 

In [3]:
# Load the test table and relabel its first column the same way as in train.
data_test = pd.read_csv('../data/test.csv')
data_test.columns = ['galactic_year', *data_test.columns[1:]]
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 79 columns):
galactic_year                                                                              890 non-null int64
galaxy                                                                                     890 non-null object
existence expectancy index                                                                 885 non-null float64
existence expectancy at birth                                                              885 non-null float64
Gross income per capita                                                                    885 non-null float64
Income Index                                                                               885 non-null float64
Expected years of education (galactic years)                                               885 non-null float64
Mean years of education (galactic years)                                                   882 non-null float64
In

In [4]:
# Compare (rows, columns) of the train and test tables side by side.
data_train.shape, data_test.shape

((3865, 80), (890, 79))

In [5]:
# Load the sample submission file and inspect its columns and dtypes.
sample_sub = pd.read_csv('../data/sample_submit.csv')
sample_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 3 columns):
index       890 non-null int64
pred        890 non-null float64
opt_pred    890 non-null int64
dtypes: float64(1), int64(2)
memory usage: 21.0 KB


In [6]:
# Preview the first rows of the sample submission.
sample_sub.head()

Unnamed: 0,index,pred,opt_pred
0,0,0.08,100
1,1,0.08,100
2,2,0.08,100
3,3,0.08,100
4,4,0.08,100


In [7]:
# Total of the opt_pred column over the whole sample submission.
sample_sub.opt_pred.sum()

45900

## Basic exploration

In [8]:
# First two training rows (the training table carries the target column y).
data_train.head(2)

Unnamed: 0,galactic_year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,...,"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
0,990025,Large Magellanic Cloud (LMC),0.628657,63.1252,27109.23431,0.646039,8.240543,,,,...,,,,,,,,,,0.05259
1,990025,Camelopardalis B,0.818082,81.004994,30166.793958,0.852246,10.671823,4.74247,0.833624,0.467873,...,,,,,,19.177926,,22.785018,,0.059868


In [9]:
# First two test rows, for comparison with the training preview.
data_test.head(2)

Unnamed: 0,galactic_year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,...,Current health expenditure (% of GGP),"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII)
0,1007012,KK98 77,0.456086,51.562543,12236.576447,0.593325,10.414164,10.699072,0.547114,0.556267,...,,,,,,,,,,
1,1007012,Reticulum III,0.529835,57.228262,3431.883825,0.675407,7.239485,5.311122,0.497688,0.409969,...,,,,,,,,,,


In [10]:
# Distinct galactic_year values present in the training table.
data_train.galactic_year.unique()

array([ 990025,  991020,  992016,  993012,  994009,  995006,  996004,
        997002,  998001,  999000, 1000000, 1001000, 1002001, 1003002,
       1004004, 1005006, 1006009, 1007012, 1008016, 1009020, 1010025,
       1011030, 1012036, 1013042, 1014049, 1015056])

In [11]:
# Range (min, max) of the target column y in the training table.
data_train.y.min(), data_train.y.max()

(0.013036451766586963, 0.6838126553011508)

In [12]:
# Galaxy names found in each table, and how many of them the two tables share.
galaxies_train = set(data_train['galaxy'])
galaxies_test = set(data_test['galaxy'])
len(galaxies_train), len(galaxies_test), len(galaxies_train.intersection(galaxies_test))

(181, 172, 172)

In [13]:
# Galactic years found in each table: counts per table, shared years, and all years combined.
years_train = set(data_train['galactic_year'])
years_test = set(data_test['galactic_year'])

(len(years_train), len(years_test),
 len(years_train.intersection(years_test)), len(years_test.union(years_train)))

(26, 10, 9, 27)

In [14]:
# Years that occur in test but never in train.
years_test - years_train

{1016064}

In [15]:
# Uniqueness check: if the row count here equals len(data_train), every
# (galaxy, galactic_year) pair occurs exactly once in train.
data_train[['galaxy', 'galactic_year']].drop_duplicates().shape

(3865, 2)

In [16]:
# Inner join on (galaxy, galactic_year): zero rows means no pair appears in both train and test.
data_train.merge(data_test, on=['galaxy', 'galactic_year']).shape

(0, 157)

In [17]:
# Map every galactic year seen in train or test to a 1-based ordinal code.
# years_list is sorted ascending, so the encoder's 0-based labels follow that order.
le = LabelEncoder()
years_list = sorted(set(data_train.galactic_year).union(data_test.galactic_year))
year_codes = le.fit_transform(years_list)
years_dict = dict(zip(years_list, year_codes + 1))

In [18]:
def _galaxy_year_stats(frame, prefix):
    """Summarise, per galaxy, which encoded years `frame` covers.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns 'galaxy' and 'year'.
    prefix : str
        Label prepended to the statistic column names (e.g. 'train').

    Returns
    -------
    pd.DataFrame
        One row per galaxy with columns 'galaxy', '<prefix>_nunique',
        '<prefix>_min' and '<prefix>_max'.
    """
    stat_names = ['nunique', 'min', 'max']
    # Built-in aggregator names replace the original `lambda x: x.nunique()`.
    stats = frame.groupby('galaxy')['year'].agg(stat_names).reset_index()
    stats.columns = ['galaxy'] + ['{}_{}'.format(prefix, stat) for stat in stat_names]
    return stats


# Attach the ordinal year code (from years_dict) to both tables in place.
data_train['year'] = data_train.galactic_year.map(years_dict)
data_test['year'] = data_test.galactic_year.map(years_dict)

galaxy_stat_train = _galaxy_year_stats(data_train, 'train')
galaxy_stat_test = _galaxy_year_stats(data_test, 'test')

# Inner join: only galaxies present in both tables are kept.
galaxy_stat = galaxy_stat_train.merge(galaxy_stat_test, on='galaxy')

In [19]:
# Stack train and test into one table; test rows receive the sentinel target y = -1.
data_test['y'] = -1
data = pd.concat([data_train, data_test], axis=0, sort=True).reset_index(drop=True)
data = data.sort_values(['galaxy', 'year'])

# validation is 0 where y > 0 and 1 otherwise (which includes the sentinel rows).
data['validation'] = np.where(data.y > 0, 0, 1)

# With rows ordered by year inside each galaxy, a non-zero diff marks a switch
# between validation == 0 and validation == 1; the first row of a galaxy counts as 0.
validation_steps = data.groupby('galaxy')['validation'].diff()
data['val_diff'] = validation_steps.fillna(0).abs().astype(int)

# Number of such switches per galaxy, attached to the per-galaxy summary table.
switches_per_galaxy = data.groupby('galaxy')['val_diff'].sum()
galaxy_stat['val_diff'] = galaxy_stat['galaxy'].map(switches_per_galaxy)
galaxy_stat.head(10)

Unnamed: 0,galaxy,train_nunique,train_min,train_max,test_nunique,test_min,test_max,val_diff
0,Andromeda Galaxy (M31),18,1,26,8,18,25,2
1,Andromeda I,18,1,26,8,18,25,2
2,Andromeda II,25,1,25,2,26,27,1
3,Andromeda III,18,1,26,8,18,25,2
4,Andromeda IX,18,1,26,8,18,25,2
5,Andromeda V,18,1,26,8,18,25,2
6,Andromeda VIII,25,1,25,2,26,27,1
7,Andromeda X,17,1,17,10,18,27,1
8,Andromeda XI,18,1,26,8,18,25,2
9,Andromeda XV,25,1,25,2,26,27,1


In [22]:
# Shape of the subset of galaxies whose val_diff equals 1.
galaxy_stat[galaxy_stat.val_diff == 1].shape

(44, 8)

In [23]:
# Galaxies whose val_diff equals 2.
galaxy_stat[galaxy_stat.val_diff == 2]

Unnamed: 0,galaxy,train_nunique,train_min,train_max,test_nunique,test_min,test_max,val_diff
0,Andromeda Galaxy (M31),18,1,26,8,18,25,2
1,Andromeda I,18,1,26,8,18,25,2
3,Andromeda III,18,1,26,8,18,25,2
4,Andromeda IX,18,1,26,8,18,25,2
5,Andromeda V,18,1,26,8,18,25,2
...,...,...,...,...,...,...,...,...
163,UGCA 438 (ESO 407-018),21,1,26,5,18,22,2
164,UGCA 86,24,1,26,2,23,24,2
167,Ursa Major II Dwarf,24,1,26,2,24,25,2
168,Ursa Minor Dwarf,24,1,26,2,24,25,2


In [103]:
# Number of distinct galaxies in the summary table.
galaxy_stat.galaxy.nunique()

172

In [104]:
# How many galaxies have each val_diff value.
galaxy_stat.val_diff.value_counts()

2    68
3    57
1    44
4     3
Name: val_diff, dtype: int64

In [109]:
# Distribution of the first encoded year at which each galaxy appears in test.
galaxy_stat.test_min.value_counts()

18    90
23    37
26    25
24    20
Name: test_min, dtype: int64

In [111]:
# Distribution of the first encoded year at which each galaxy appears in train.
galaxy_stat.train_min.value_counts()

1    172
Name: train_min, dtype: int64

In [110]:
# Distribution of the last encoded year at which each galaxy appears in train.
galaxy_stat.train_max.value_counts()

26    82
25    49
24    22
22    13
17     6
Name: train_max, dtype: int64

## Idea 1: organize train in "sales prediction" style

In [114]:
# Training table size after the 'year' column was added in an earlier cell.
data_train.shape

(3865, 81)

In [113]:
# Training rows that belong to galaxies which also appear in test.
data_train[data_train.galaxy.isin(galaxies_test)].shape

(3664, 81)

In [141]:
# Split the labelled rows per galaxy at test_min, the first encoded year at which
# that galaxy appears in test: earlier rows go to df_train, the rest to df_check.
# The merge is an inner join, so galaxies absent from galaxy_stat are dropped.
df = data_train.merge(galaxy_stat[['galaxy', 'test_min']], on='galaxy')
before_test_start = df.year < df.test_min
from_test_start = df.year >= df.test_min
df_train = df[before_test_start].reset_index(drop=True).drop('test_min', axis=1)
df_check = df[from_test_start].reset_index(drop=True).drop('test_min', axis=1)
# Explicit copy: pd.DataFrame(data_test) does not copy the underlying data, so
# changes made through df_test could leak back into data_test.
df_test = data_test.copy()
df_train.shape, df_check.shape, df_test.shape

((3429, 81), (235, 81), (890, 81))

# Energy distribution (open question)

In [142]:
def increase_coef(y):
    """Return ``(3 - ln(0.01 + y)) ** 2 / 1000``.

    Parameters
    ----------
    y : float or array-like
        Input value(s); the computation is element-wise via ``np.log``.

    Returns
    -------
    Same shape as ``y``. Smaller ``y`` gives a larger result, because the
    logarithm term shrinks as ``y`` decreases.
    """
    log_term = np.log(y + 0.01)
    scaled_square = (3 - log_term) ** 2
    return scaled_square / 1000

In [144]:
# Re-display the submission template (index, pred, opt_pred) for reference.
sample_sub.head()

Unnamed: 0,index,pred,opt_pred
0,0,0.08,100
1,1,0.08,100
2,2,0.08,100
3,3,0.08,100
4,4,0.08,100


In [154]:
# List the columns available in the training split.
df_train.columns

Index(['galactic_year', 'galaxy', 'existence expectancy index',
       'existence expectancy at birth', 'Gross income per capita',
       'Income Index', 'Expected years of education (galactic years)',
       'Mean years of education (galactic years)',
       'Intergalactic Development Index (IDI)', 'Education Index',
       'Intergalactic Development Index (IDI), Rank',
       'Population using at least basic drinking-water services (%)',
       'Population using at least basic sanitation services (%)',
       'Gross capital formation (% of GGP)', 'Population, total (millions)',
       'Population, urban (%)',
       'Mortality rate, under-five (per 1,000 live births)',
       'Mortality rate, infant (per 1,000 live births)',
       'Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))',
       'Population, ages 15–64 (millions)',
       'Population, ages 65 and older (millions)',
       'Life expectancy at birth, male (galactic years)',
       'Life expect

In [155]:
# Build a submission-shaped frame from the training split: a positional index,
# the true target as 'pred', the existence expectancy index as 'eei', and the
# increase coefficient derived from 'pred'.
train_sub = df_train[['y', 'existence expectancy index']].rename(
    columns={'y': 'pred', 'existence expectancy index': 'eei'}
)
train_sub.insert(0, 'index', list(range(len(df_train))))
train_sub['likely_increase'] = increase_coef(train_sub['pred'])
train_sub.head()

Unnamed: 0,index,pred,eei,likely_increase
0,0,0.05259,0.628657,0.033306
1,1,0.052115,0.627245,0.033394
2,2,0.052006,0.662135,0.033414
3,3,0.051675,0.687776,0.033476
4,4,0.051334,0.727717,0.033541


In [172]:
## calc optimal energy dist
# Greedy allocation: rank rows by likely_increase (highest first), give the top
# N_FUNDED rows ENERGY_PER_ROW each and every other row 0.
ENERGY_PER_ROW = 100
N_FUNDED = 500  # 500 rows * 100 = 50000 allocated in total

opt_dist = train_sub[['index', 'likely_increase']].set_index('index')\
    .sort_values('likely_increase', ascending=False)
# Guard: with fewer than N_FUNDED rows the original fixed-length list raised a
# length-mismatch ValueError; fund every row in that case instead.
n_funded = min(N_FUNDED, len(opt_dist))
opt_dist['opt_pred'] = [ENERGY_PER_ROW] * n_funded + [0] * (len(opt_dist) - n_funded)
# Bring the allocation back to the original row order via the 'index' key.
train_sub['opt_pred'] = train_sub['index'].map(opt_dist.opt_pred)

train_sub.head()

Unnamed: 0,index,pred,eei,likely_increase,opt_pred
0,0,0.05259,0.628657,0.033306,0
1,1,0.052115,0.627245,0.033394,0
2,2,0.052006,0.662135,0.033414,0
3,3,0.051675,0.687776,0.033476,0
4,4,0.051334,0.727717,0.033541,0


In [173]:
# Allocation that went to rows with eei < 0.7, next to the total allocation.
train_sub.loc[train_sub.eei < 0.7, 'opt_pred'].sum(), train_sub.opt_pred.sum()

(26000, 50000)

In [157]:
# How often each opt_pred level occurs in the sample submission.
sample_sub.opt_pred.value_counts()

100    100
80     100
70     100
60     100
50     100
40     100
30     100
20     100
10      90
Name: opt_pred, dtype: int64