In [176]:
import datetime
import gc
import numpy as np
import os
import pandas as pd
import random
import timeit

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import log_loss,confusion_matrix,roc_curve,auc
from sklearn.model_selection import GridSearchCV

20 newsgroups challenge problemi, machine learning açısından yıllarca kullanılan bilindik bir problemdir. Bu problemde 20 adet konu başlığı bulunmaktadır. Bu farklı gazete yazılarının hangi konu başlığına ait olduğunu bulmaya çalışılır. 20 newsgroups ciphertext challenge da ise başlıkta da anlaşılacağı üzere gazete yazıları şifrenlenmiştir. 4 farklı zorluk derecesine göre şifrelenen gazete yazıları sırasıyla 1,2,3,4 zorluk derecesine sahiptir. 1 zorluk derecesindeki yazılar 1 adet şifreleme geçirmiş, 2 zorluk derecesine sahip yazılar ise art arda 2 adet şifreleme geçirmiştir. Diğerleride aynı şekilde şifrelenmişlerdir.

Machine learning alanında dil ile ilgili olan problemlere NLP (Neuro-linguistic programming) denir. Bu tür problemlerde yapılan başlıca adımlar vardır. Bunlardan bazıları şunlardır;

1-)Yazıyı Temizlemek: 
- Alakasız karakterleri yazıdan silmek. Bunlara bazı gereksiz sayılar, noktalama işaretleri veya İngilizcedeki "am", "is", "are" gibi kelimeler dahildir.
- Yazılardaki bütün harfleri küçüük harfe çevirmek çünkü “hello”, “Hello”, ve “HELLO” gibi kelimelerin hepsi aynı şeyi ifade ediyor.

2-)Modellemek için kelimeleri birbirinden ayırmak
- Modellemede kullanmak için yazı içindeki her kelimeyi ayrı bir eleman gibi düşünmek. Buna ingilizcede "Tokenize" deniyor. 

3-)Vectore çevirmek
- Machine learning de kullanılan modeller kelime veya karakter bazlı olmadığı için yazıları sayıya çevirmek gerekiyor. Bunu yapmak içinde kelimeleri veya belli sayıda karakterleri çeşitli fonksiyonlardan geçirerek onların vektor karşılıklarıyla işlem yapılmalıdır. 

4-)Özellik çoğaltma
- NLP problemlerinde genelde yazının yanında herhangi bir özellik verilmez. Vectore çevirme aslında bir nevi yazının özelliğini ortaya çıkartma olarak algılanabilir. Bunların yanında yazıların özelliklerini çıkartmak için python da kullanılan bazı yöntemler vardır. Bu yöntemler fuzzy,Levenshtein gibi yardımcı araçlarla ortaya çıkartılabilir.

4-)Model bulmak
- NLP için sıklıkla kullanılan bazı modelleme yöntemleri vardır. Bunlardan bazıları Naive-Bayes, SVM, Logistic Regression, Ensemble gibi modellemelerdir. Bunların her biri denenip hangisinin daha iyi sonuç veridiği bulunmalıdır.

Adım 1:

"c1|FaAO120O'8ovfoy1W#atvGs1[1s1[1/1]O-a8o1" yazısı zorluk derecesi 1 olan şifrenlenmiş bir yazı örneğidir. Burada ilk olarak herhangi bir temizleme işlemi yapmadan yazılar vectorlere çevrilerek Naive-Bayes, SVM, Logistic Regression, RandomForest ve XGBClassifier modelleri denenerek sonuçlar analiz edilmiştir. Burada verinin büyüklüğü nedeniyle sadece zorluk derecesi 1 olan verilerle çalışılmıştır. 

Vectore çevirmek için ilk başta CountVectorizer fonksiyonu kullanılmıştır. Şifrelenmiş verilerde büyük küçük harf önemli olduğu için büyük harfler küçük harfe çevrilmemiştir. Tokinizer olarak ngram kullanılmıştır. Fonksiyon çıkışında her kelime ayrılıp bir vector olarak ifade edilmiştir. Vectorizer da en iyi parametreleri bulmak için GridSearch yöntemi kullanılmıştır. Burada analyzer parametresi olarak ('word','char'), ngram parametresi olarak birçok değer denemiştir. Bulunan en iyi sonuç ile yazılar vectorlere çevrilmiştir. Daha sonra modellere uygulanarak sırasıyla sonuçlar gözlemlenmiştir.

In [198]:
# DUZ DENEME
df = pd.read_csv('20-newsgroups-ciphertext-challenge/train.csv')
data_1 = df.query('difficulty==1')
X = data_1.iloc[:,-2]
y = data_1.iloc[:,-1]

parameters = {
    #'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
    #'clf__max_iter': (10, 50, 80),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'tfidf__use_idf': (True, False),
    #'tfidf__sublinear_tf': (True, False),
    #'tfidf__lowercase': (True,False),
    #'tfidf__strip_accents': ('ascii','unicode'),
    #'tfidf__analyzer': ('word','char'),
    #'tfidf__ngram_range': ((1,4),(1,5)),
    #'tfidf__max_features': (7500,5000),
    #'svd__n_components': (500,1000),
    #'vect__ngram_range': ((1, 6), (1, 7)),
    #'vect__analyzer': ('word','char'),
    #'vect__min_df': (0.001,0.01),
}

# build TFIDF Vectorizer
tokens= ((u'(?ui)\\b\\w*[a-z]+\\w*\\b')) # makes sure it matches a word but contains at least one letter

nb = Pipeline([('vect', CountVectorizer(analyzer = 'char',ngram_range=(1, 6))),
               #('tfidf', TfidfVectorizer(strip_accents='ascii',analyzer='char',token_pattern=tokens,
               #                          ngram_range=(1,4),lowercase=False,dtype=np.uint32,max_features=7500)),
               #('svd', TruncatedSVD(algorithm='arpack')),
               ('clf', MultinomialNB(alpha=1.0e-10)),])
#grid_search = GridSearchCV(nb, parameters, cv=5,n_jobs=-1, verbose=1)

lr = Pipeline([#('vect', CountVectorizer(analyzer = 'word',ngram_range=(1, 2))),
               ('tfidf', TfidfVectorizer(strip_accents='ascii',analyzer='word',token_pattern=tokens,
                                         ngram_range=(1,2),lowercase=False,dtype=np.float32,max_features=7500)),
               ('clf', LogisticRegression(solver='saga', n_jobs=-1, C=1e5, multi_class='auto')),])
#grid_search = GridSearchCV(lr, parameters, cv=5,n_jobs=-1, verbose=1)

svm = Pipeline([#('vect', CountVectorizer(analyzer = 'char',ngram_range=(1, 6))),
                ('tfidf', TfidfVectorizer(strip_accents='ascii',analyzer='char',token_pattern=tokens,
                                         ngram_range=(1,4),lowercase=False,dtype=np.float32,max_features=7500)),
                ('clf', SGDClassifier(loss='hinge',penalty='l2',alpha=1e-5,random_state=0,max_iter=1000, 
                                     tol=1e-3,n_jobs=-1)),])
#grid_search = GridSearchCV(svm, parameters, cv=5,n_jobs=-1, verbose=1)

rf = Pipeline([#('vect', CountVectorizer(analyzer = 'char',ngram_range=(1, 6))),
               ('tfidf', TfidfVectorizer(strip_accents='ascii',analyzer='char',token_pattern=tokens,
                                         ngram_range=(1,4),lowercase=False,dtype=np.float32,max_features=7500)), 
               ('clf', RandomForestClassifier(n_estimators=500,max_features='log2',min_samples_split=4)),])
#grid_search = GridSearchCV(rf, parameters, cv=5,n_jobs=-1, verbose=1)

xgb = Pipeline([#('vect', CountVectorizer(analyzer = 'char',ngram_range=(1, 6))),
                ('tfidf', TfidfVectorizer(strip_accents='ascii',analyzer='char',token_pattern=tokens,
                                         ngram_range=(1,4),lowercase=False,dtype=np.float32,max_features=7500)),
                ('clf', XGBClassifier()),])
#grid_search = GridSearchCV(xgb, parameters, cv=5,n_jobs=-1, verbose=1)

1 -)Naive-Bayes train score= 0.9938674321503131
1 -)Naive-Bayes test score= 0.6046753246753247
2 -)Naive-Bayes train score= 0.9937426671881111
2 -)Naive-Bayes test score= 0.6235662148070907
3 -)Naive-Bayes train score= 0.9934827945776851
3 -)Naive-Bayes test score= 0.6061554512258738
4 -)Naive-Bayes train score= 0.9930926625830835
4 -)Naive-Bayes test score= 0.6132567849686847
5 -)Naive-Bayes train score= 0.9936164669098488
5 -)Naive-Bayes test score= 0.6121275483533717
Elapsed time 114.89954035500705

1 -)Naive-Bayes train score= 0.6860647181628392
1 -)Naive-Bayes test score= 0.41818181818181815
2 -)Naive-Bayes train score= 0.6782688045887107
2 -)Naive-Bayes test score= 0.4191866527632951
3 -)Naive-Bayes train score= 0.6904327424400417
3 -)Naive-Bayes test score= 0.41366718831507565
4 -)Naive-Bayes train score= 0.6881271992701681
4 -)Naive-Bayes test score= 0.43893528183716074
5 -)Naive-Bayes train score= 0.6827774882751433
5 -)Naive-Bayes test score= 0.42812336644014637
Elapsed time 58.729096541996114

In [195]:
# trigger timer
start_time = timeit.default_timer()
i=1
# create and train model
skf = StratifiedKFold(n_splits=5)
for train, test in skf.split(X, y):
    # Naive-Bayes
    nb.fit(X.iloc[train], y.iloc[train])
    y_pred = nb.predict(X.iloc[test])
    print(i,"-)Naive-Bayes train score=", nb.score(X.iloc[train],y.iloc[train]))
    print(i,"-)Naive-Bayes test score=", accuracy_score(y_pred, y.iloc[test]))  
    i=i+1

#print("Best score: %0.3f" % grid_search.best_score_)
#print("Best parameters set:")
#best_parameters = grid_search.best_estimator_.get_params()
#for param_name in sorted(parameters.keys()):
#    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# calculate time interval
elapsed = timeit.default_timer() - start_time
print("Elapsed time", elapsed)

1 -)Naive-Bayes train score= 0.9821242171189979
1 -)Naive-Bayes test score= 0.49246753246753244
2 -)Naive-Bayes train score= 0.9813583626645809
2 -)Naive-Bayes test score= 0.4984358706986444


KeyboardInterrupt: 

1 -)Logistic Regression Classifier train score= 0.9973387601753287
1 -)Logistic Regression Classifier test score= 0.42018119337706966
2 -)Logistic Regression Classifier train score= 0.9976533166458073
2 -)Logistic Regression Classifier test score= 0.4350954019393181
3 -)Logistic Regression Classifier train score= 0.9970303219756174
3 -)Logistic Regression Classifier test score= 0.4261986837981824
Elapsed time 65.17152920400258

1 -)Logistic Regression Classifier train score= 0.993894802755166
1 -)Logistic Regression Classifier test score= 0.44204935957513275
2 -)Logistic Regression Classifier train score= 0.9937421777221527
2 -)Logistic Regression Classifier test score= 0.44541757898029405
3 -)Logistic Regression Classifier train score= 0.9931228508909034
3 -)Logistic Regression Classifier test score= 0.43685365089313694
Elapsed time 32.350123803000315

In [158]:
# trigger timer
start_time = timeit.default_timer()
i=1
# create and train model
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    # Logistic Regression Classifier
    lr.fit(X.iloc[train], y.iloc[train])
    y_pred = lr.predict(X.iloc[test])
    print(i,"-)Logistic Regression Classifier train score=", lr.score(X.iloc[train],y.iloc[train]))
    print(i,"-)Logistic Regression Classifier test score=", accuracy_score(y_pred, y.iloc[test]))
    i=i+1
    
# calculate time interval
elapsed = timeit.default_timer() - start_time
print("Elapsed time", elapsed)



1 -)Logistic Regression Classifier train score= 0.993894802755166
1 -)Logistic Regression Classifier test score= 0.44204935957513275
2 -)Logistic Regression Classifier train score= 0.9937421777221527
2 -)Logistic Regression Classifier test score= 0.44541757898029405
3 -)Logistic Regression Classifier train score= 0.9931228508909034
3 -)Logistic Regression Classifier test score= 0.43685365089313694
Elapsed time 32.350123803000315


1 -)SVM train score= 0.9616468378209142
1 -)SVM test score= 0.5029678225554515
2 -)SVM train score= 0.9586983729662077
2 -)SVM test score= 0.5176728182671254
3 -)SVM train score= 0.9477961863082213
3 -)SVM test score= 0.4992165465371357
Elapsed time 141.1674568860035

1 -)SVM train score= 0.9575767063243582
1 -)SVM test score= 0.5301468291159013
2 -)SVM train score= 0.949468085106383
2 -)SVM test score= 0.5317485142320926
3 -)SVM train score= 0.9554548296342608
3 -)SVM test score= 0.5352554058288937
Elapsed time 43.7149073959954

In [159]:
# trigger timer
start_time = timeit.default_timer()
i=1
# create and train model
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    # SGDClassifier
    svm.fit(X.iloc[train], y.iloc[train])
    y_pred = svm.predict(X.iloc[test])
    print(i,"-)SVM train score=", svm.score(X.iloc[train],y.iloc[train]))
    print(i,"-)SVM test score=", accuracy_score(y_pred, y.iloc[test]))
    i=i+1
    
# calculate time interval
elapsed = timeit.default_timer() - start_time
print("Elapsed time", elapsed)

1 -)SVM train score= 0.9575767063243582
1 -)SVM test score= 0.5301468291159013
2 -)SVM train score= 0.949468085106383
2 -)SVM test score= 0.5317485142320926
3 -)SVM train score= 0.9554548296342608
3 -)SVM test score= 0.5352554058288937
Elapsed time 43.7149073959954


1 -)Random Forest train score= 0.9993738259236068
1 -)Random Forest test score= 0.4957825679475164
2 -)Random Forest train score= 0.9992177722152691
2 -)Random Forest test score= 0.5032843290584923
3 -)Random Forest train score= 0.9990622069396686
3 -)Random Forest test score= 0.5155123785647132
Elapsed time 514.6145224819993

1 -)Random Forest train score= 0.9996869129618033
1 -)Random Forest test score= 0.4126835363948766
2 -)Random Forest train score= 0.9998435544430538
2 -)Random Forest test score= 0.4119487019080388
3 -)Random Forest train score= 0.9996874023132228
3 -)Random Forest test score= 0.4202444374804137
Elapsed time 131.42851219599834

In [160]:
# trigger timer
start_time = timeit.default_timer()
i=1
# create and train model
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    # Random Forest
    rf.fit(X.iloc[train], y.iloc[train])
    y_pred = rf.predict(X.iloc[test])
    print(i,"-)Random Forest train score=", rf.score(X.iloc[train],y.iloc[train]))
    print(i,"-)Random Forest test score=", accuracy_score(y_pred, y.iloc[test]))
    i=i+1
    
# calculate time interval
elapsed = timeit.default_timer() - start_time
print("Elapsed time", elapsed)

1 -)Random Forest train score= 0.9996869129618033
1 -)Random Forest test score= 0.4126835363948766
2 -)Random Forest train score= 0.9998435544430538
2 -)Random Forest test score= 0.4119487019080388
3 -)Random Forest train score= 0.9996874023132228
3 -)Random Forest test score= 0.4202444374804137
Elapsed time 131.42851219599834


In [128]:
# trigger timer
start_time = timeit.default_timer()
i=1
# create and train model
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    # XGBClassifier
    xgb.fit(X.iloc[train], y.iloc[train])
    y_pred = xgb.predict(X.iloc[test])
    print(i,"-)XGBClassifier train score=", xgb.score(X.iloc[train],y.iloc[train]))
    print(i,"-)XGBClassifier test score=", accuracy_score(y_pred, y.iloc[test]))
    i=i+1
    
# calculate time interval
elapsed = timeit.default_timer() - start_time
print("Elapsed time", elapsed)

KeyboardInterrupt: 

In [199]:
X = X.astype(str)

def tokenize(text): 
    return text.split("1")

def trimm(text):
    return ' '.join([i for i in text if len(i) > 3])

token_data = [tokenize(i) for i in X]
X = [trimm(i) for i in token_data]

# build TFIDF Vectorizer
tokens= ((u'(?ui)\\b\\w*[a-z]+\\w*\\b')) # makes sure it matches a word but contains at least one letter

word_vectorizer = TfidfVectorizer(sublinear_tf=True,strip_accents='ascii',analyzer='word',token_pattern=tokens,
                                  ngram_range=(1,2),dtype=np.float32,max_features=7500)

# Character Stemmer
char_vectorizer = TfidfVectorizer(sublinear_tf=True,strip_accents='ascii',analyzer='char',token_pattern=tokens,
                                  ngram_range=(2, 4),dtype=np.float32,max_features=12000)

word_vectorizer.fit(X)
char_vectorizer.fit(X)

train_word_features = word_vectorizer.transform(X)
train_char_features = char_vectorizer.transform(X)

train_features = hstack([train_char_features,train_word_features])

%time
print("Modeling..")

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(train_features, y, test_size=0.33)

lr = LogisticRegression(solver="sag", max_iter=100,class_weight='balanced',C=2.65,penalty='l2')
lr.fit(train_features,y)
lr_pred=lr.predict(X_test_tfidf)

accuracy_tfidf =accuracy_score(y_test_tfidf,lr_pred)

print(accuracy_tfidf)
print(classification_report(y_test_tfidf,lr_pred))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs
Modeling..
0.9311216429699842


NameError: name 'cross_val_score' is not defined

In [79]:
# load the dataframe
df = pd.read_csv('20-newsgroups-ciphertext-challenge/train.csv')
#test = pd.read_csv('20-newsgroups-ciphertext-challenge/test.csv')

# dataframe for only difficulty=1
data1 = df.query('difficulty==1')
# how many elements do have diffuculty class
print(df['difficulty'].value_counts()) 
# count repeated chars in ciphertext class
alp = pd.Series(Counter(''.join(data1['ciphertext'])))  
print(pd.crosstab(df['difficulty'], df['target']))
alp.head(10)

2    10024
4     9970
1     9589
3     9469
Name: difficulty, dtype: int64
target       0    1    2    3    4    5    6    7    8    9    10   11   12  \
difficulty                                                                    
1           420  465  360  346  320  466  361  331  380  486  540  563  344   
2           455  528  652  391  315  550  310  405  387  301  580  651  396   
3           394  567  695  386  313  524  293  422  331  460  479  534  338   
4           465  366  940  382  326  653  272  437  386  413  437  622  429   

target       13   14   15   16   17   18   19  
difficulty                                     
1           482  576  765  471  834  675  404  
2           500  566  692  536  912  547  350  
3           593  470  513  506  714  553  384  
4           599  463  582  589  693  539  377  


c     53752
1    505426
|      7490
F      6134
a    121319
A     58440
O    134813
2      8502
0     75960
'     34490
dtype: int64

In [None]:
def tokenize(text): 
    return text.split("1")

def trimm(text):
    return ' '.join([i for i in text if len(i) > 1])

token_data = [tokenize(i) for i in X]
X = [trimm(i) for i in token_data]

In [11]:
# PCA 
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

a=cosine_similarity(train_word_features, train_word_features)

pca = PCA(n_components=10)

principalComponents = pca.fit_transform(a)

principalDf = pd.DataFrame(data = principalComponents)
print(principalDf.shape)
#finalDf = pd.concat([principalDf, y], axis = 1)
#finalDf

(9589, 10)


In [1]:
# Feature extraction
import datetime
import gc
import numpy as np
import os
import pandas as pd
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import skew, kurtosis
import lightgbm as lgb

import Levenshtein
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

from tqdm import tqdm

'''
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])
xtrain_count =  count_vect.transform(train_x)

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)

ensemble.RandomForestClassifier()

diff1['ciphertext'] = diff1['ciphertext'].apply(lambda x: x.replace('1', ' '))
diff2['ciphertext'] = diff2['ciphertext'].apply(lambda x: x.replace('8', ' '))
diff3['ciphertext'] = diff3['ciphertext'].apply(lambda x: x.replace('8', ' '))
diff4['ciphertext'] = diff4['ciphertext'].apply(lambda x: x.replace('8', ' '))
'''

def extract_features(df):
    #df['nunique'] = df['ciphertext'].apply(lambda x: len(np.unique(x)))
    #df['len'] = df['ciphertext'].apply(lambda x: len(x))

    def count_chars(x):
        n_l = 0 # count letters
        n_n = 0 # count numbers
        n_s = 0 # count symbols
        n_ul = 0 # count upper letters
        n_ll = 0 # count lower letters
        for i in range(0, len(x)):
            if x[i].isalpha():
                n_l += 1
                if x[i].isupper():
                    n_ul += 1
                elif x[i].islower():
                    n_ll += 1
            elif x[i].isdigit():
                n_n += 1
            else:
                n_s += 1

        return pd.Series([n_l, n_n, n_s, n_ul, n_ll])

    cols = ['n_l', 'n_n', 'n_s', 'n_ul', 'n_ll']
    for c in cols:
        df[c] = 0
    tqdm.pandas(desc='count_chars')
    df[cols] = df['ciphertext'].progress_apply(lambda x: count_chars(x))
    for c in cols:
        df[c] /= df['len']

    tqdm.pandas(desc='distances')
    df['Levenshtein_distance'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.distance(x, x[::-1]))
    df['Levenshtein_ratio'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.ratio(x, x[::-1]))
    df['Levenshtein_jaro'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.jaro(x, x[::-1]))
    df['Levenshtein_hamming'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.hamming(x, x[::-1]))

    for m in range(1, 5):
        df['Levenshtein_distance_m{}'.format(m)] = df['ciphertext'].progress_apply(lambda x: Levenshtein.distance(x[:-m], x[m:]))
        df['Levenshtein_ratio_m{}'.format(m)] = df['ciphertext'].progress_apply(lambda x: Levenshtein.ratio(x[:-m], x[m:]))
        df['Levenshtein_jaro_m{}'.format(m)] = df['ciphertext'].progress_apply(lambda x: Levenshtein.jaro(x[:-m], x[m:]))
        df['Levenshtein_hamming_m{}'.format(m)] = df['ciphertext'].progress_apply(lambda x: Levenshtein.hamming(x[:-m], x[m:]))
    
    df['Levenshtein_distance_h'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.distance(x[:len(x)//2], x[len(x)//2:]))
    df['Levenshtein_ratio_h'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.ratio(x[:len(x)//2], x[len(x)//2:]))
    df['Levenshtein_jaro_h'] = df['ciphertext'].progress_apply(lambda x: Levenshtein.jaro(x[:len(x)//2], x[len(x)//2:]))
    
    # All symbols stats
    def strstat(x):
        r = np.array([ord(c) for c in x])
        return pd.Series([
            np.sum(r), 
            np.mean(r), 
            np.std(r), 
            np.min(r), 
            np.max(r),
            skew(r), 
            kurtosis(r),
            ])
    cols = ['str_sum', 'str_mean', 'str_std', 'str_min', 'str_max', 'str_skew', 'str_kurtosis']
    for c in cols:
        df[c] = 0
    tqdm.pandas(desc='strstat')
    df[cols] = df['ciphertext'].progress_apply(lambda x: strstat(x))
    
    # Digit stats
    def str_digit_stat(x):
        r = np.array([ord(c) for c in x if c.isdigit()])
        if len(r) == 0:
            r = np.array([0])
        return pd.Series([
            np.sum(r), 
            np.mean(r), 
            np.std(r), 
            np.min(r), 
            np.max(r),
            skew(r), 
            kurtosis(r),
            ])
    cols = ['str_digit_sum', 'str_digit_mean', 'str_digit_std', 'str_digit_min', 
        'str_digit_max', 'str_digit_skew', 'str_digit_kurtosis']
    for c in cols:
        df[c] = 0
    tqdm.pandas(desc='str_digit_stat')
    df[cols] = df['ciphertext'].progress_apply(lambda x: str_digit_stat(x))

print('Extracting features for train:')
extract_features(train)
train.head()

Extracting features for train:


count_chars: 100%|██████████| 39052/39052 [00:09<00:00, 4323.13it/s]
distances: 100%|██████████| 39052/39052 [00:06<00:00, 5994.40it/s]
distances: 100%|██████████| 39052/39052 [00:05<00:00, 7667.39it/s]
distances: 100%|██████████| 39052/39052 [00:01<00:00, 32674.29it/s]
distances: 100%|██████████| 39052/39052 [00:00<00:00, 339877.12it/s]
distances: 100%|██████████| 39052/39052 [00:06<00:00, 5800.36it/s]
distances: 100%|██████████| 39052/39052 [00:04<00:00, 7963.75it/s]
distances: 100%|██████████| 39052/39052 [00:01<00:00, 32945.49it/s]
distances: 100%|██████████| 39052/39052 [00:00<00:00, 594070.61it/s]
distances: 100%|██████████| 39052/39052 [00:07<00:00, 5549.92it/s]
distances: 100%|██████████| 39052/39052 [00:04<00:00, 8632.10it/s]
distances: 100%|██████████| 39052/39052 [00:01<00:00, 34764.83it/s]
distances: 100%|██████████| 39052/39052 [00:00<00:00, 613254.41it/s]
distances: 100%|██████████| 39052/39052 [00:06<00:00, 6188.96it/s]
distances: 100%|██████████| 39052/39052 [00:04<00:0

Unnamed: 0,Id,difficulty,ciphertext,target,nunique,len,n_l,n_n,n_s,n_ul,...,str_max,str_skew,str_kurtosis,str_digit_sum,str_digit_mean,str_digit_std,str_digit_min,str_digit_max,str_digit_skew,str_digit_kurtosis
0,ID_88b9bbd73,4,"ob|IK?zzhX*L{83B3Z,FuL*Pusm$83L\t@r$$*38,8s...",10,1,300,0.473333,0.17,0.356667,0.246667,...,127.0,-0.09165,-0.950683,2664.0,52.235294,3.299366,48.0,57.0,-0.001394,-1.549887
1,ID_f489bd59f,1,c1|FaAO120O'8ovfoy1W#atvGs1[1s1[1/1]O-a8o1-...,13,1,300,0.383333,0.24,0.376667,0.103333,...,124.0,-0.047954,-1.14433,3568.0,49.555556,2.146631,48.0,56.0,2.564841,4.938975
2,ID_f90fee9c7,2,1*e4N8$f$0ccOuihkHek$k*V*hoeV$Hj8VhH8...,19,1,300,0.43,0.146667,0.423333,0.176667,...,127.0,-0.063138,-1.231967,2406.0,54.681818,2.274091,48.0,57.0,-1.587334,1.29993
3,ID_8303ced65,1,O8v^10O#to1'#^'^tv1^]s111t01Otaq>-ata_1...,17,1,300,0.42,0.246667,0.333333,0.153333,...,125.0,-0.077411,-0.983794,3682.0,49.756757,2.276857,48.0,56.0,2.245122,3.317398
4,ID_72abc2cb7,2,eV}H}khfe4b8'S.Vc}{A .#VikV.fV?{$f7$Hjb8...,0,1,300,0.433333,0.113333,0.453333,0.133333,...,126.0,-0.241807,-1.200268,1875.0,55.147059,1.477923,51.0,56.0,-1.675256,1.324605
