In [76]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides deprecation and
# input-validation warnings raised by the libraries used below — consider
# narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')

In [4]:
# pandas is imported in this cell because no earlier cell brings `pd` into
# scope; without it a fresh-kernel "Run All" stops here with a NameError.
import pandas as pd

# Read the password dataset from a CSV file given by a relative path.
data = pd.read_csv('passwords.csv')

In [12]:
import pandas as pd

def password_features(pwd):
    """Compute character-composition statistics for a single password.

    Args:
        pwd: The password string to describe.

    Returns:
        dict: Integer values under the keys, in this order: 'length',
        'digits', 'uppers', 'lowers', 'specials' (characters for which
        ``str.isalnum`` is false), 'has_upper' and 'has_special' (0/1 flags).
    """
    length = len(pwd)
    digits = uppers = lowers = specials = 0
    # One pass over the characters; the predicates are evaluated independently
    # so every counter equals its own per-predicate tally.
    for ch in pwd:
        digits += ch.isdigit()
        uppers += ch.isupper()
        lowers += ch.islower()
        specials += not ch.isalnum()
    return {
        'length': length,
        'digits': digits,
        'uppers': uppers,
        'lowers': lowers,
        'specials': specials,
        'has_upper': int(uppers > 0),
        'has_special': int(specials > 0),
    }

# Run the extractor over every password; each returned dict becomes one row
features_df = pd.DataFrame(
    [password_features(pwd) for pwd in data['password']]
)

In [21]:
# Display the dataset dimensions as (rows, columns).
data.shape

(100000, 2)

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Inputs: raw password text, handcrafted numeric features, and the target label
X_text = data['password']
X_feat = features_df  # handcrafted features: length, digit count, etc.
y = data['strength']

# Stratified 80/20 split; text, features and labels are split in one call so
# their rows stay aligned
X_text_train, X_text_test, X_feat_train, X_feat_test, y_train, y_test = train_test_split(
    X_text, X_feat, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF over character n-grams (1 to 4 characters), capped at 5000 terms;
# the vocabulary is learned from the training split only
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 4), max_features=5000)
X_text_train_vec = tfidf.fit_transform(X_text_train)
X_text_test_vec = tfidf.transform(X_text_test)

# Standardize the handcrafted features with statistics from the training split
scaler = StandardScaler()
X_feat_train_scaled = scaler.fit_transform(X_feat_train)
X_feat_test_scaled = scaler.transform(X_feat_test)

# Place the TF-IDF columns and the scaled handcrafted columns side by side
X_train_combined = hstack([X_text_train_vec, X_feat_train_scaled])
X_test_combined = hstack([X_text_test_vec, X_feat_test_scaled])

# XGBoost model.
# 'multi:softprob' is used instead of 'multi:softmax' because later cells call
# predict_proba() and need per-class probabilities; predict() still returns
# class labels.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    eval_metric='mlogloss',
    random_state=42
)

# Fit on the combined training matrix
xgb_model.fit(X_train_combined, y_train)

# Predict labels for the held-out split
y_pred = xgb_model.predict(X_test_combined)

# Per-class precision / recall / F1 on the held-out split.
# NOTE(review): the recorded output shows 1.00 for every metric — worth
# confirming that the label is not a direct function of an input feature
# (for example password length).
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2686
           1       1.00      1.00      1.00     14855
           2       1.00      1.00      1.00      2459

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [25]:
from sklearn.metrics import accuracy_score

# Predict on both splits with the fitted model (training matrix first)
y_train_pred, y_test_pred = (
    xgb_model.predict(matrix) for matrix in (X_train_combined, X_test_combined)
)

# Accuracy on each split; comparing the two is a quick overfitting check
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")


Train accuracy: 1.000
Test accuracy: 1.000


In [27]:
# Count password strings that occur in both the training and the test split
set_train, set_test = set(X_text_train), set(X_text_test)
intersection = set_train & set_test
print("Пересекающиеся пароли:", len(intersection))


Пересекающиеся пароли: 0


In [29]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from xgboost import XGBClassifier

# Custom transformer that joins character TF-IDF with the handcrafted features
class CombinedFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Turn (password, feature-vector) rows into one sparse feature matrix.

    The input ``X`` is indexed as a two-column array: column 0 is passed to a
    character-level ``TfidfVectorizer`` and column 1 holds one numeric vector
    per row, which is stacked into a matrix and passed to a ``StandardScaler``.
    """

    def __init__(self):
        self.tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,4), max_features=5000)
        self.scaler = StandardScaler()

    @staticmethod
    def _split_columns(X):
        """Return column 0 of ``X`` and column 1 stacked into a 2-D matrix."""
        # np.vstack turns the object column of per-row vectors into a matrix
        return X[:, 0], np.vstack(X[:, 1])

    def fit(self, X, y=None):
        """Fit the vectorizer on column 0 and the scaler on column 1; return self."""
        passwords, manual = self._split_columns(X)
        self.tfidf.fit(passwords)
        self.scaler.fit(manual)
        return self

    def transform(self, X):
        """Return TF-IDF columns followed by the scaled numeric columns."""
        passwords, manual = self._split_columns(X)
        text_part = self.tfidf.transform(passwords)
        numeric_part = self.scaler.transform(manual)
        return hstack([text_part, numeric_part])

# Pack each password together with its handcrafted feature row into a
# two-column object array, the input layout CombinedFeaturesTransformer indexes
X_combined = np.array(list(zip(X_text, X_feat.values)), dtype=object)

# Estimator used only for cross-validation.
# It has its own name so the already-fitted `xgb_model` is not replaced:
# cross_val_score fits clones of the pipeline, so rebinding `xgb_model` to this
# unfitted object would make later `xgb_model.predict(...)` cells fail.
# `use_label_encoder` is omitted, matching the training cell where it is
# commented out.
cv_model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    eval_metric='mlogloss',
    random_state=42
)

# Pipeline: feature construction is re-fitted inside every fold
pipeline = Pipeline([
    ('features', CombinedFeaturesTransformer()),
    ('clf', cv_model)
])

# 5-fold cross-validation scored by accuracy
scores = cross_val_score(pipeline, X_combined, y, cv=5, scoring='accuracy', n_jobs=-1)

print("Кросс-валидация accuracy по фолдам:", scores)
print("Средняя accuracy:", scores.mean())
print("Стандартное отклонение:", scores.std())


Кросс-валидация accuracy по фолдам: [1. 1. 1. 1. 1.]
Средняя accuracy: 1.0
Стандартное отклонение: 0.0


In [78]:
# Check: sample passwords to classify with the trained model
new_passwords = [
    'MyNewP@ssw0rd123',
    '123456',
    'password',
    'Qwerty!@#',
]

# Build the handcrafted feature matrix for a batch of passwords
def extract_manual_features(passwords):
    """Return the handcrafted features of each password as array rows.

    Args:
        passwords: Iterable of password strings.

    Returns:
        numpy.ndarray: One row per password with the values of
        ``password_features`` in the order length, digits, uppers, lowers,
        specials, has_upper, has_special.
    """
    column_order = (
        'length', 'digits', 'uppers', 'lowers',
        'specials', 'has_upper', 'has_special',
    )
    rows = []
    for pwd in passwords:
        stats = password_features(pwd)
        rows.append([stats[key] for key in column_order])
    return np.array(rows)

# Character TF-IDF representation from the already-fitted vectorizer
new_text_vec = tfidf.transform(new_passwords)

# Handcrafted features, scaled with the already-fitted scaler
new_features = extract_manual_features(new_passwords)
new_feat_scaled = scaler.transform(new_features)

# Same column layout as the training matrix: TF-IDF first, handcrafted last
new_combined = hstack([new_text_vec, new_feat_scaled])

# Predicted class and per-class probabilities for every password
predictions = xgb_model.predict(new_combined)
probabilities = xgb_model.predict_proba(new_combined)

# Human-readable label for each class index
quality_map = dict(enumerate(('слабый', 'средний', 'сильный')))

# Report: password, predicted label, probabilities, separator line
for pwd, pred, prob in zip(new_passwords, predictions, probabilities):
    print(
        f"Пароль: {pwd}",
        f"Класс: {quality_map[pred]}",
        f"Вероятности по классам: {prob}",
        '---',
        sep='\n',
    )


Пароль: MyNewP@ssw0rd123
Класс: сильный
Вероятности по классам: [2.9556475e-05 4.3769745e-05 9.9992669e-01]
---
Пароль: 123456
Класс: слабый
Вероятности по классам: [9.9992692e-01 4.4119235e-05 2.8983539e-05]
---
Пароль: password
Класс: средний
Вероятности по классам: [2.4809564e-05 9.9995041e-01 2.4772489e-05]
---
Пароль: Qwerty!@#
Класс: средний
Вероятности по классам: [2.4607960e-05 9.9995089e-01 2.4571189e-05]
---


In [80]:
import numpy as np

def extract_manual_features(passwords):
    """Return the handcrafted features of each password as array rows.

    Args:
        passwords: Iterable of password strings.

    Returns:
        numpy.ndarray: One row per password with the values of
        ``password_features`` in the order length, digits, uppers, lowers,
        specials, has_upper, has_special.
    """
    column_order = (
        'length', 'digits', 'uppers', 'lowers',
        'specials', 'has_upper', 'has_special',
    )
    rows = []
    for pwd in passwords:
        stats = password_features(pwd)
        rows.append([stats[key] for key in column_order])
    return np.array(rows)


In [82]:
# Training: fit the model on the combined training matrix
xgb_model.fit(X_train_combined, y_train)

In [84]:
# Classify one more password with the fitted vectorizer, scaler and model
new_passwords = ["1qrftgSeda!5Nz"]

# Handcrafted features (scaled) and character TF-IDF for the same input
new_features = extract_manual_features(new_passwords)
new_features_scaled = scaler.transform(new_features)
new_text_vec = tfidf.transform(new_passwords)

# TF-IDF columns first, handcrafted columns last
new_combined = hstack([new_text_vec, new_features_scaled])

probs = xgb_model.predict_proba(new_combined)
predictions = xgb_model.predict(new_combined)

for pwd, pred, prob in zip(new_passwords, predictions, probs):
    print(f"Пароль: {pwd}\nПредсказанный класс: {pred}\nВероятности: {prob}\n")


Пароль: 1qrftgSeda!5Nz
Предсказанный класс: 2
Вероятности: [2.9556475e-05 4.3769745e-05 9.9992669e-01]



In [86]:
import pandas as pd

def password_features(pwd):
    """Compute character-composition statistics for a single password.

    Args:
        pwd: The password string to describe.

    Returns:
        dict: Integer values under the keys, in this order: 'length',
        'digits', 'uppers', 'lowers', 'specials' (characters for which
        ``str.isalnum`` is false), 'has_upper' and 'has_special' (0/1 flags).
    """
    length = len(pwd)
    digits = uppers = lowers = specials = 0
    # One pass over the characters; the predicates are evaluated independently
    # so every counter equals its own per-predicate tally.
    for ch in pwd:
        digits += ch.isdigit()
        uppers += ch.isupper()
        lowers += ch.islower()
        specials += not ch.isalnum()
    return {
        'length': length,
        'digits': digits,
        'uppers': uppers,
        'lowers': lowers,
        'specials': specials,
        'has_upper': int(uppers > 0),
        'has_special': int(specials > 0),
    }

# Run the extractor over every password; each returned dict becomes one row
features_df = pd.DataFrame(
    [password_features(pwd) for pwd in data['password']]
)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Inputs: raw password text, handcrafted numeric features, and the target label
X_text = data['password']
X_feat = features_df  # handcrafted features: length, digit count, etc.
y = data['strength']

# Stratified 80/20 split; text, features and labels are split in one call so
# their rows stay aligned
X_text_train, X_text_test, X_feat_train, X_feat_test, y_train, y_test = train_test_split(
    X_text, X_feat, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF over character n-grams (1 to 4 characters), capped at 5000 terms;
# the vocabulary is learned from the training split only
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 4), max_features=5000)
X_text_train_vec = tfidf.fit_transform(X_text_train)
X_text_test_vec = tfidf.transform(X_text_test)

# Standardize the handcrafted features with statistics from the training split
scaler = StandardScaler()
X_feat_train_scaled = scaler.fit_transform(X_feat_train)
X_feat_test_scaled = scaler.transform(X_feat_test)

# Place the TF-IDF columns and the scaled handcrafted columns side by side
X_train_combined = hstack([X_text_train_vec, X_feat_train_scaled])
X_test_combined = hstack([X_text_test_vec, X_feat_test_scaled])

# XGBoost model.
# 'multi:softprob' is used instead of 'multi:softmax' because other cells call
# predict_proba() and need per-class probabilities; predict() still returns
# class labels.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    eval_metric='mlogloss',
    random_state=42
)

# Fit on the combined training matrix
xgb_model.fit(X_train_combined, y_train)

# Predict labels for the held-out split
y_pred = xgb_model.predict(X_test_combined)

# Per-class precision / recall / F1 on the held-out split.
# NOTE(review): the recorded output shows 1.00 for every metric — worth
# confirming that the label is not a direct function of an input feature
# (for example password length).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2686
           1       1.00      1.00      1.00     14855
           2       1.00      1.00      1.00      2459

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

