In [1]:
%matplotlib inline
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns
from __future__ import division

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.model_selection import cross_val_score , StratifiedKFold

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Load the training and test tables from CSV files in the working directory.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
df.info()

In [4]:
# keep=False removes *every* row whose Word occurs more than once (no representative is kept).
df = df.drop_duplicates(subset=['Word'], keep=False)

In [5]:
# Stack train and test rows so each feature below is computed once for both.
# The original row labels are kept (no ignore_index), so labels coming from the two
# frames may repeat; the final submission cell uses new_test.index as the Id column.
all_data = pd.concat([df, df_test])

In [None]:
all_data.head()

In [6]:
# Character n-gram counts (2 to 5 characters, built inside word boundaries via 'char_wb'),
# capped at 70 features by max_features. Despite the variable name this is not bigram-only.
# NOTE(review): the vectorizer is fitted on train and test words together.
bigram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,5), max_features = 70, encoding='utf8')

# Lower-case every word before counting, then re-weight the counts with TF-IDF.
for_tf_idf = bigram_vectorizer.fit_transform(all_data.Word.apply(str.lower).tolist())
tfidf_sparse = TfidfTransformer().fit_transform(for_tf_idf)

In [7]:
# Dense TF-IDF matrix as a frame sharing all_data's index; columns are tf_idf_0, tf_idf_1, ...
tfidf = pd.DataFrame(
    tfidf_sparse.toarray(),
    index=all_data.index,
    columns=['tf_idf_{}'.format(col) for col in range(tfidf_sparse.shape[1])],
)

In [None]:
tfidf

In [None]:
all_data

In [26]:
# Append the TF-IDF columns to the feature table. Any tf_idf_* columns left over from an
# earlier run of this cell are dropped first, so re-running does not duplicate them.
all_data = pd.concat([all_data.drop(columns=tfidf.columns, errors='ignore'), tfidf], axis=1)

In [8]:
# 1 when the first character of the word is upper-case, otherwise 0.
all_data['fl_is_ip'] = all_data.Word.apply(lambda word: int(word[0].isupper()))

In [9]:
# Number of upper-case characters in the word.
all_data['cnt_up'] = all_data.Word.apply(lambda word: sum(1 for ch in word if ch.isupper()))

In [None]:
all_data

In [10]:
def is_upcase(word):
    """Return 1 if the second character of ``word`` is an upper-case letter, else 0.

    Words shorter than two characters yield 0.  ``str.isupper`` is used rather than
    comparing with ``upper()`` so that caseless characters (digits, hyphens, ...) are
    not reported as upper-case.
    """
    return int(len(word) > 1 and word[1].isupper())

In [11]:
all_data['sl_is_ip'] = all_data.Word.map(is_upcase)
#df_test['sl_is_ip'] = df_test.Word.map(is_upcase)

In [12]:
# 1 when str.istitle() is true for the word, otherwise 0.
all_data['title'] = all_data.Word.apply(lambda word: int(word.istitle()))

In [13]:
def get_syllables(line):
    """Count the Russian vowel letters in ``line``, ignoring case.

    The count is what the notebook stores as the syllable feature of a word.
    Returns an int (0 for an empty string or a string without Russian vowels).
    """
    vowel_list = ('А', 'Е', 'Ё', 'И', 'О', 'У', 'Ы', 'Э', 'Ю', 'Я')
    # Upper-case each character so lower- and upper-case vowels are both matched.
    return sum(1 for symbol in line if symbol.upper() in vowel_list)

In [14]:
all_data['slog'] = all_data.Word.map(get_syllables)
#df_test['slog'] = df_test.Word.map(get_syllables)

In [15]:
# Unicode code point of the word's last character, stored as a numeric feature.
all_data['lastsym'] = all_data.Word.apply(lambda x: ord(x[-1]))
#df_test['lastsym'] = df_test['Word'].apply(lambda x: ord(x[-1]))

In [25]:
# Code point of the second-to-last character; one-character words get the placeholder 100.
# NOTE(review): 100 is also ord('d'), so the placeholder cannot be told apart from a real
# character (the commented-out variant below used -1) — confirm this is intended.
all_data['prelastsym'] = all_data.Word.apply(lambda x: ord(x[-2]) if len(x)>1 else 100)
#df_test['prelastsym'] = df_test['Word'].apply(lambda x: ord(x[-2]) if len(x)>1 else -1)

In [None]:
#!pip install natasha

In [17]:
from natasha import NamesExtractor
extractor = NamesExtractor()

def has_name(text):
    """Return 1 when the module-level ``extractor`` reports at least one match in ``text``, else 0."""
    found = extractor(text).as_json
    if found == []:
        return 0
    return 1

In [18]:
# Russian vowel letters in both cases; used by prepare_features for membership tests.
vowels = [
    'а', 'я', 'ё', 'у', 'е', 'о', 'э', 'ю', 'и', 'ы',  # lower-case
    'Ё', 'У', 'Е', 'Ы', 'А', 'О', 'Э', 'Ю', 'И', 'Я',  # upper-case
]

In [19]:
def prepare_features(df):
    """Return a copy of ``df`` extended with length and vowel/consonant features.

    Columns added, all derived from ``df['Word']``:
      * ``Lenght``     - number of characters (spelling kept: it is the column name);
      * ``Vowels``     - number of characters present in the module-level ``vowels`` list;
      * ``Consonants`` - ``Lenght - Vowels`` (every non-vowel character is counted);
      * ``Vow/Conson`` - ``Vowels`` divided by ``Consonants``, where a consonant count
        of 0 is replaced by 0.001 to avoid division by zero;
      * ``is_lower``   - 1 if the first character equals its lower-cased form, else 0.

    The input frame is left untouched: the work is done on a copy, so no helper
    column is written into the caller's frame.
    """
    df = df.copy()
    vowel_set = set(vowels)  # constant-time membership instead of scanning the list

    df['Lenght'] = df['Word'].apply(len)
    df['Vowels'] = df['Word'].apply(lambda word: sum(1 for ch in word if ch in vowel_set))
    df['Consonants'] = df['Lenght'] - df['Vowels']

    # Local denominator instead of a temporary column that has to be dropped again.
    safe_consonants = df['Consonants'].apply(lambda n: 0.001 if n == 0 else n)
    df['Vow/Conson'] = df['Vowels'] / safe_consonants

    df['is_lower'] = df['Word'].apply(lambda word: 1 if word[0] == word[0].lower() else 0)
    return df

In [20]:
all_data = prepare_features(all_data)
#test_all = prepare_features(df_test)

In [None]:
#nltk.download('punkt')

In [21]:
import nltk
import pymorphy2

# probability score threshold
prob_thresh = 0.4

morph = pymorphy2.MorphAnalyzer()

In [22]:
def is_name_pymorphy(text):
    """Return the pymorphy2 score of the first qualifying ``Name`` parse, or 100.

    ``text`` is tokenised with ``nltk.word_tokenize``; every parse returned by
    ``morph.parse`` for every token is examined.  A parse qualifies when its tag
    contains ``'Name'`` and its score is at least ``prob_thresh``.

    100 is the "no name found" placeholder.  It is returned only after all tokens
    and parses have been checked, which also covers text that yields no tokens.
    """
    for word in nltk.word_tokenize(text):
        for p in morph.parse(word):
            if 'Name' in p.tag and p.score >= prob_thresh:
                return p.score
    # The fallback must sit outside both loops: returning from an ``else`` inside the
    # inner loop would stop after the very first parse of the first token.
    return 100

In [23]:
# pymorphy2-based name score per word (see is_name_pymorphy).
all_data['mrph'] = all_data.Word.apply(is_name_pymorphy)

In [24]:
# 0/1 flag per word produced by has_name.
all_data['has_name'] = all_data['Word'].apply(has_name)

In [None]:
all_data.head()

In [27]:
# Feature columns are all columns except the raw text and the target.
columns_transformed = {'Word'}
target_column = {'Label'}
train_columns = set(all_data.columns) - columns_transformed - target_column
train_columns

{'Consonants',
 'Lenght',
 'Vow/Conson',
 'Vowels',
 'cnt_up',
 'fl_is_ip',
 'has_name',
 'is_lower',
 'lastsym',
 'mrph',
 'prelastsym',
 'sl_is_ip',
 'slog',
 'tf_idf_0',
 'tf_idf_1',
 'tf_idf_10',
 'tf_idf_11',
 'tf_idf_12',
 'tf_idf_13',
 'tf_idf_14',
 'tf_idf_15',
 'tf_idf_16',
 'tf_idf_17',
 'tf_idf_18',
 'tf_idf_19',
 'tf_idf_2',
 'tf_idf_20',
 'tf_idf_21',
 'tf_idf_22',
 'tf_idf_23',
 'tf_idf_24',
 'tf_idf_25',
 'tf_idf_26',
 'tf_idf_27',
 'tf_idf_28',
 'tf_idf_29',
 'tf_idf_3',
 'tf_idf_30',
 'tf_idf_31',
 'tf_idf_32',
 'tf_idf_33',
 'tf_idf_34',
 'tf_idf_35',
 'tf_idf_36',
 'tf_idf_37',
 'tf_idf_38',
 'tf_idf_39',
 'tf_idf_4',
 'tf_idf_40',
 'tf_idf_41',
 'tf_idf_42',
 'tf_idf_43',
 'tf_idf_44',
 'tf_idf_45',
 'tf_idf_46',
 'tf_idf_47',
 'tf_idf_48',
 'tf_idf_49',
 'tf_idf_5',
 'tf_idf_50',
 'tf_idf_51',
 'tf_idf_52',
 'tf_idf_53',
 'tf_idf_54',
 'tf_idf_55',
 'tf_idf_56',
 'tf_idf_57',
 'tf_idf_58',
 'tf_idf_59',
 'tf_idf_6',
 'tf_idf_60',
 'tf_idf_61',
 'tf_idf_62',
 'tf_id

In [28]:
# Turn the set into a sorted list so the feature order is deterministic.
train_columns = sorted(train_columns)

In [29]:
# Rows that have a Label form the training part; rows without one form the test part.
new_train = all_data[all_data['Label'].notnull()]
new_test = all_data[all_data['Label'].isnull()]

In [None]:
# (no code in this cell)
# X_test and X_test_scaled are built in the next two cells, right after `imputer` and
# `scaler` are fitted. Transforming here raised NameError on a fresh kernel because
# neither object exists yet.

In [30]:
# sklearn.preprocessing.Imputer no longer exists in recent scikit-learn releases;
# SimpleImputer is its replacement. Fall back to the old class on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Fill missing feature values with the per-column median learned on the training rows only.
imputer = Imputer(strategy='median')
X_train = imputer.fit_transform(new_train[train_columns])
X_test = imputer.transform(new_test[train_columns])

In [31]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training matrix only, then apply the same transform to the test matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
#X = df[train_columns]
y = new_train['Label']

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with a fixed seed; oob_score=True exposes the out-of-bag score
# shown as this cell's output. n_jobs=-1 uses every available core instead of a
# hard-coded count of 24.
RF = RandomForestClassifier(random_state=42, oob_score=True, n_jobs=-1)
RF.fit(X_train_scaled, y)
RF.oob_score_

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Print the per-fold ROC AUC of the random forest (default cv splitting of cross_val_score).
print( cross_val_score(RF, X=X_train_scaled, y=y, scoring='roc_auc'))

In [None]:
# Column 1 of predict_proba is the probability of the second entry of RF.classes_.
probas = RF.predict_proba(X_test_scaled)[:,1]

In [None]:
# RandomForestClassifier has no pickle() method, so the former `RF.pickle()` call raised
# AttributeError. The fitted model is persisted with joblib.dump in a later cell.

In [None]:
# Take the ids from new_test (the rows the probabilities were computed for) so that
# Id and Prediction always have the same length and order.
df_to_save = pd.DataFrame(data={"Id": new_test.index, "Prediction": probas})
df_to_save.to_csv("submission.csv", sep=',', index=False)

In [None]:
# sklearn.externals.joblib no longer exists in recent scikit-learn releases; prefer the
# standalone joblib package and fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted random forest to disk.
joblib.dump(RF, 'filename.pkl')

In [None]:
import xgboost
# NOTE(review): this fits a regressor directly on the class label, so xr.predict returns
# regression outputs rather than class probabilities — confirm this experiment is intended.
xr = xgboost.XGBRegressor()
xr.fit(X_train_scaled, y)
#xr.oob_score_

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
print( cross_val_score(xr, X=X_train_scaled, y=y, scoring='roc_auc'))

In [None]:
probasxr = xr.predict(X_test_scaled)
#probas = RF.predict_proba(X_test_scaled)[:,1]

In [None]:
probasxr

In [34]:
from xgboost import XGBClassifier
def validate(x , y):
    """Cross-validate the XGBoost classifier and print the mean and std of its ROC AUC.

    x is the feature matrix and y the labels.  Scoring uses 4 shuffled stratified
    folds with a fixed seed; nothing is returned.
    """
    folds = StratifiedKFold(4, shuffle=True, random_state=99)
    estimator = XGBClassifier(
        max_depth=10,
        n_estimators=670,
        learning_rate=0.09,
        colsample_bytree=0.9,
        colsample_bylevel=0.6,
    )
    auc_per_fold = cross_val_score(estimator, x, y, scoring='roc_auc', cv=folds)
    print (auc_per_fold.mean() , auc_per_fold.std() , '\n')

In [None]:
validate(X_train_scaled, y)

In [35]:
# Final classifier: same hyper-parameters as in validate(), fitted on all training rows.
model = XGBClassifier(max_depth = 10 , n_estimators=670 , learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)
model.fit(X_train_scaled , y)
#sample['Prediction'] = model.predict_proba(X_train_scaled)[:,0]
#sample.to_csv('submit.csv' , index=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
       colsample_bytree=0.9, gamma=0, learning_rate=0.09, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=670,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
probasxr = model.predict_proba(X_test_scaled)

In [38]:
# predict_proba columns follow model.classes_, so for 0/1 labels column 1 is the
# probability of the positive class (the random-forest cell also takes [:, 1]).
# Column 0 would submit P(negative class) and invert the ranking that ROC AUC measures.
df_to_save = pd.DataFrame(data={"Id": new_test.index, "Prediction": probasxr[:, 1]})
df_to_save.to_csv("submission.csv", sep=',', index=False)