In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()


import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
df = pd.concat([train, test], sort=False).drop(columns='target')
df.shape

(500000, 24)

In [5]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8


# Data Exploration

In [6]:
train.keys()

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')

In [7]:
test.keys()

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month'],
      dtype='object')

In [8]:
cat_features = ['nom_0','nom_1','nom_2','nom_3','nom_4']
ordinal_features = ['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
binary_features = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
features_tobeHashed = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
categorical_features = ['nom_0','nom_1','nom_2','nom_3','nom_4','nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

## Missing values

In [9]:
print('Training data contains: ',train.isnull().sum().sum(),'Missing values', 'While test data contains: ',test.isnull().sum().sum(),
      'Missing values')

Training data contains:  0 Missing values While test data contains:  0 Missing values


## Exploration & Enconding

### Binary features

In [11]:
# fig, ax = plt.subplots(1,5, figsize=(40, 10))
# fig.suptitle('Training Data',fontsize = 30)
# for i in range(5): 
#     sns.countplot(train[binary_features[i]], data= train, ax=ax[i])
#     ax[i].set_ylim([0, len(train)])
# plt.show()

In [10]:
train[binary_features].describe()

Unnamed: 0,bin_0,bin_1,bin_2
count,300000.0,300000.0,300000.0
mean,0.127503,0.256607,0.383223
std,0.333537,0.436761,0.486173
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,1.0,1.0
max,1.0,1.0,1.0


In [11]:
train[binary_features].head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,T,Y
1,0,1,0,T,Y
2,0,0,0,F,Y
3,0,1,0,F,Y
4,0,0,0,F,N


In [18]:
def getBinEncoded(data):
    mapping = {'F':0, 'T':1, 'N':0, 'Y':1}
    data.bin_3 = data.bin_3.map(mapping)
    data.bin_4 = data.bin_4.map(mapping)
    return data 

### Categorical features

In [52]:
train[categorical_features].describe(include='all')

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9
count,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000
unique,3,6,6,6,4,222,522,1220,2215,11981
top,Green,Trapezoid,Lion,Russia,Oboe,f7821e391,d173ac7ca,3a114adea,c720f85ca,163cc60fa
freq,127341,101181,101295,101123,92293,2801,1218,534,299,72


In [42]:
def getHashedFeatures(data,column):
    fh = FeatureHasher(n_features=10,input_type='string')
    hashed_features = fh.fit_transform(data[column])
    hashed_features = hashed_features.toarray()
    data =  pd.DataFrame(hashed_features,columns=[column +'_'+ str(i) for i in range(0,10)])
    return data

In [24]:
cat_features = pd.get_dummies(df[['nom_0','nom_1','nom_2','nom_3','nom_4']])

In [45]:
results = []
for i in features_tobeHashed:
    results.append(getHashedFeatures(df,i))

In [51]:
# pd.concat(results[1],results[2])

results[1].values

array([[ 2., -1., -1., ..., -3.,  0.,  0.],
       [ 1., -2.,  0., ..., -3.,  0., -1.],
       [ 1.,  2.,  0., ..., -1.,  0.,  0.],
       ...,
       [ 3.,  0., -2., ...,  0.,  1.,  0.],
       [ 3.,  0.,  0., ..., -1.,  0.,  2.],
       [ 2.,  0.,  0., ..., -3.,  2.,  1.]])

In [39]:
fh = FeatureHasher(n_features=10,input_type='string')
hashed_features = fh.fit_transform(df.nom_5)
hashed_features = hashed_features.toarray()
data =  pd.DataFrame(hashed_features,columns=['nom_5' +'_'+ str(i) for i in range(0,10)])

In [36]:
data

Unnamed: 0,nom_7_0,nom_7_1,nom_7_2,nom_7_3,nom_7_4,nom_7_5,nom_7_6,nom_7_7,nom_7_8,nom_7_9
0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0
2,6.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0
4,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...
499995,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
499996,1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,4.0
499997,1.0,2.0,0.0,2.0,0.0,0.0,0.0,-1.0,0.0,1.0
499998,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


### Ordinal features

In [14]:
train[ordinal_features].head(10)

Unnamed: 0,ord_1,ord_2,ord_3,ord_4,ord_5
0,Grandmaster,Cold,h,D,kr
1,Grandmaster,Hot,a,A,bF
2,Expert,Lava Hot,h,R,Jc
3,Grandmaster,Boiling Hot,i,D,kW
4,Grandmaster,Freezing,a,R,qP
5,Novice,Freezing,j,E,PZ
6,Grandmaster,Lava Hot,g,P,wy
7,Novice,Lava Hot,j,K,Ed
8,Novice,Boiling Hot,e,V,qo
9,Expert,Freezing,h,Q,CZ


In [15]:
def getReadyOrdinalFeatures(data):
    map_ord1 = {'Novice':1, 'Contributor':2, 'Expert':4, 'Master':5, 'Grandmaster':6}
    map_ord2 = {'Freezing':1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot':5, 'Lava Hot':6}
    
    data.ord_1 = data.ord_1.map(map_ord1)
    data.ord_2 = data.ord_2.map(map_ord2)
    
    # The ord() function returns an integer representing the Unicode character.
    data.ord_3 = data.ord_3.map(ord)
    data.ord_4 = data.ord_4.map(ord)
    ord_5_coded_1 = data.ord_5.map(lambda st: ord(st[0]))
    ord_5_coded_2 = data.ord_5.map(lambda st: ord(st[1]))
    
    data.ord_5 = ord_5_coded_1 + ord_5_coded_2
    
    return data 

In [16]:
train['ord_5_1'] = train.ord_5.map(lambda string: ord(string[0]), na_action='ignore')
train['ord_5_2'] = train.ord_5.map(lambda string: ord(string[1]), na_action='ignore')

map_ord5 = {key:value for value,key in enumerate(sorted(train.ord_5.unique()))} 
train.ord_5 = train.ord_5.map(map_ord5)

In [19]:
'''Covert dataframe to spare matrix'''
train_ohe = train_ohe.sparse.to_coo().tocsr()
test_ohe = test_ohe.sparse.to_coo().tocsr()
type(train_ohe)

scipy.sparse.csr.csr_matrix

In [68]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
ordinal = encoder.fit_transform(train['ord_1'].values.reshape(-1,1))

In [59]:
cat_features = ['nom_0','nom_1','nom_2','nom_3','nom_4']
ordinal_features = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
binary_features = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
features_tobeHashed = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

In [None]:
counter = pd.DataFrame(train['nom_9'].value_counts())
# pd.options.display.max_rows = None
# len(counter)

In [30]:
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(train['nom_0'])

In [31]:
text_tfidf.toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [19]:
# g = sns.FacetGrid(train, col="nom_5", col_wrap=3, height=4.5)
# g = g.map(sns.countplot, "target").add_legend()