In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [2]:
# Load the raw dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('2dataset.csv')


In [3]:
# Custom kNN classifier
class CustomKNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples that vote. Must be >= 1.
    distance_metric : str, default 'euclidean'
        Either 'euclidean' or 'manhattan'. Any other value makes
        ``_distance`` (and therefore ``predict``) raise ``ValueError``.

    Attributes are ``X_train`` / ``y_train`` (``None`` until ``fit`` is called).
    """

    def __init__(self, k=5, distance_metric='euclidean'):
        # k < 1 would otherwise surface later as an IndexError on an empty vote.
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store copies of the training samples and labels.

        Raises ``ValueError`` when X and y differ in length or are empty.
        Returns ``self`` so calls can be chained.
        """
        X_arr = np.array(X)
        y_arr = np.array(y)
        if len(X_arr) != len(y_arr):
            raise ValueError("X and y must contain the same number of samples")
        if len(X_arr) == 0:
            raise ValueError("Cannot fit CustomKNN on an empty training set")
        self.X_train = X_arr
        self.y_train = y_arr
        return self

    def _distance(self, x1, x2):
        """Distance between ``x1`` and ``x2``, reduced over the last axis.

        For two 1-D vectors this is a scalar; when one argument is a 2-D
        matrix the other is broadcast against every row and a vector of
        per-row distances is returned.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2, axis=-1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(x1 - x2), axis=-1)
        else:
            raise ValueError("Поддерживаются только 'euclidean' и 'manhattan'")

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Raises ``RuntimeError`` if called before ``fit``. Returns a NumPy
        array with one predicted label per input sample.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("CustomKNN.predict() called before fit()")
        X = np.array(X)
        # Flatten each training sample to one row so that distances to all
        # training points are computed in a single vectorised call per query.
        train = self.X_train.reshape(self.X_train.shape[0], -1)
        predictions = []
        for x in X:
            distances = self._distance(train, np.ravel(x))
            # Stable sort keeps equidistant neighbours in training order,
            # which makes the result deterministic.
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # Labels are counted nearest-first; on a tied vote the label
            # seen first (i.e. the closer neighbour's) is returned.
            most_common = Counter(k_nearest_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return np.array(predictions)

In [4]:
# Custom metric functions
def confusion_matrix_custom(y_true, y_pred):
    """Build a binary confusion matrix for labels encoded as 0 and 1.

    Parameters
    ----------
    y_true, y_pred : iterables of labels
        Paired element-wise with ``zip`` (extra items of the longer one are
        ignored). Pairs containing a label other than 0 or 1 are not counted.

    Returns
    -------
    list[list[int]]
        ``[[TN, FP], [FN, TP]]``.
    """
    tn = fp = fn = tp = 0
    # Single pass: the inputs are consumed once, so one-shot iterators work
    # and the data is not traversed four times.
    for t, p in zip(y_true, y_pred):
        if t == 1 and p == 1:
            tp += 1
        elif t == 0 and p == 0:
            tn += 1
        elif t == 0 and p == 1:
            fp += 1
        elif t == 1 and p == 0:
            fn += 1
    return [[tn, fp], [fn, tp]]

def accuracy_custom(y_true, y_pred):
    """Return the share of positions where the prediction equals the true label.

    The number of matches is divided by ``len(y_true)``; an empty ``y_true``
    yields 0.
    """
    hits = 0
    for actual, predicted in zip(y_true, y_pred):
        hits = hits + (actual == predicted)
    n_samples = len(y_true)
    if n_samples > 0:
        return hits / n_samples
    return 0

def precision_custom(y_true, y_pred):
    """Return TP / (TP + FP) from the custom confusion matrix, or 0 when nothing was predicted positive."""
    (_, false_pos), (_, true_pos) = confusion_matrix_custom(y_true, y_pred)
    predicted_pos = true_pos + false_pos
    if predicted_pos > 0:
        return true_pos / predicted_pos
    return 0

def recall_custom(y_true, y_pred):
    """Return TP / (TP + FN) from the custom confusion matrix, or 0 when there are no actual positives."""
    false_neg, true_pos = confusion_matrix_custom(y_true, y_pred)[1]
    actual_pos = true_pos + false_neg
    if actual_pos > 0:
        return true_pos / actual_pos
    return 0

def f1_custom(y_true, y_pred):
    """Return the harmonic mean of custom precision and recall, or 0 when both are zero."""
    p = precision_custom(y_true, y_pred)
    r = recall_custom(y_true, y_pred)
    denominator = p + r
    if denominator > 0:
        return 2 * (p * r) / denominator
    return 0

In [5]:
# Data preparation: the target is 'RainTomorrow'; 'Date' and 'Location' are dropped from the features.
X = data.drop(['RainTomorrow', 'Date', 'Location'], axis=1)
y = data['RainTomorrow'].astype(int)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Class balancing: only the training part is oversampled, the test part is left as is.
# NOTE(review): SMOTE runs on unscaled features here; confirm this ordering is intended.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Feature scaling: the scaler is fitted on the balanced training data and reused for the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Classifiers to compare; dictionary order is the order in which they are fitted.
svm_kernels = {
    'SVM Linear': {'kernel': 'linear'},
    'SVM Polynomial': {'kernel': 'poly', 'degree': 3},
    'SVM RBF': {'kernel': 'rbf'},
    'SVM Sigmoid': {'kernel': 'sigmoid'},
}
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Custom kNN (k=5)': CustomKNN(k=5, distance_metric='euclidean'),
    'Naive Bayes': GaussianNB(),
}
for svm_name, svm_params in svm_kernels.items():
    classifiers[svm_name] = SVC(probability=True, random_state=42, **svm_params)

# Metric name -> custom metric function, evaluated in this order for every model.
metric_functions = (
    ('Accuracy', accuracy_custom),
    ('Precision', precision_custom),
    ('Recall', recall_custom),
    ('F1', f1_custom),
    ('Confusion Matrix', confusion_matrix_custom),
)

# Fit every model, score it on the test split and collect ROC data.
results, roc_data = {}, {}
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train_balanced)
    y_pred = clf.predict(X_test_scaled)

    # Custom metrics
    scores = {label: metric(y_test, y_pred) for label, metric in metric_functions}
    results[name] = scores

    # ROC AUC is only available for models that expose predict_proba.
    if not hasattr(clf, 'predict_proba'):
        continue
    y_prob = clf.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    scores['ROC AUC'] = roc_auc
    roc_data[name] = (fpr, tpr, roc_auc)

In [None]:
# Visualisation: Custom kNN accuracy as a function of k
# NOTE(review): accuracy is measured on the test split, so picking k from this
# plot tunes a hyperparameter on test data; confirm that is acceptable here.
k_values = range(1, 11)
knn_accuracies = []
for k in k_values:
    knn = CustomKNN(k=k, distance_metric='euclidean')
    knn.fit(X_train_scaled, y_train_balanced)
    y_pred = knn.predict(X_test_scaled)
    knn_accuracies.append(accuracy_custom(y_test, y_pred))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, knn_accuracies, marker='o')
ax.set_xlabel('k')
ax.set_ylabel('Точность')
ax.set_title('Зависимость точности Custom kNN от k')
ax.set_xticks(list(k_values))  # k is an integer, so tick at every evaluated value
ax.grid(True)
# Render the figure before releasing it; closing without showing or saving
# would discard the plot and the cell would produce no output.
plt.show()
plt.close(fig)

In [None]:
# Visualisation: ROC curves for every model that produced probabilities
fig, ax = plt.subplots(figsize=(10, 8))
for name, (fpr, tpr, roc_auc) in roc_data.items():
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC-кривые для классификаторов')
ax.legend(loc='lower right')
ax.grid(True)
# Render the figure before releasing it; closing without showing or saving
# would discard the plot and the cell would produce no output.
plt.show()
plt.close(fig)

In [None]:
# Visualisation: one confusion-matrix heatmap per classifier
for name, metrics in results.items():
    cm = metrics['Confusion Matrix']
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    # Show each heatmap, then close it so figures do not accumulate in the loop.
    plt.show()
    plt.close(fig)

In [None]:
# Print the collected metrics for every classifier
print("Метрики качества классификации:")
scalar_metrics = (
    ('Accuracy', 'Accuracy'),
    ('Precision', 'Precision'),
    ('Recall', 'Recall'),
    ('F1 Score', 'F1'),
)
for name, metrics in results.items():
    print(f"\n{name}:")
    for label, key in scalar_metrics:
        print(f"{label}: {metrics[key]:.3f}")
    # ROC AUC exists only for models that exposed predict_proba.
    if 'ROC AUC' in metrics:
        print(f"ROC AUC: {metrics['ROC AUC']:.3f}")
    print("Confusion Matrix:")
    (tn, fp), (fn, tp) = metrics['Confusion Matrix']
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")

In [None]:
# Conclusion about the best model
# NOTE(review): the text below is hard-coded and is not derived from `results`;
# verify it still matches the metrics printed above after any re-run.
conclusion_lines = (
    "\nВывод о лучшей модели:",
    "SVM с RBF-ядром выбрана лучшей моделью благодаря:",
    "- Высокому ROC AUC, что указывает на отличную разделяющую способность.",
    "- Гибкости в моделировании нелинейных зависимостей в погодных данных.",
    "- Хорошему балансу между Precision и Recall, что важно для минимизации пропусков дождя (FN).",
    "Рекомендации: использовать SMOTE для балансировки и GridSearchCV для настройки гиперпараметров.",
)
for line in conclusion_lines:
    print(line)