Загрузка необходимых библиотек

In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Загрузка и подготовка данных

In [97]:
df = pd.read_csv('possum.csv')
df = df.dropna()
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [98]:
df.dtypes

case          int64
site          int64
Pop          object
sex          object
age         float64
hdlngth     float64
skullw      float64
totlngth    float64
taill       float64
footlgth    float64
earconch    float64
eye         float64
chest       float64
belly       float64
dtype: object

In [99]:
df['sex'].value_counts()

sex
m    59
f    42
Name: count, dtype: int64

Разделение данных на признаки и целевую переменную. Разделение на обучающую и тестовую выборки.

In [100]:
Y = df['sex']
X = df.loc[:, (df.columns !='sex') & (df.columns !='Pop')] 

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

Стандартизация данных

In [101]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Понижение размерности

In [102]:
nums = np.arange(df.shape[1]-1)
var_ratio = dict()

for num in nums:
    pca = PCA(n_components=num)
    pca.fit(X_train_scaled)
    var_ratio[num] = (np.sum(pca.explained_variance_ratio_), pca)

var_ratio

{0: (0.0, PCA(n_components=0)),
 1: (0.3915418707080796, PCA(n_components=1)),
 2: (0.6380370045668703, PCA(n_components=2)),
 3: (0.7257213002817198, PCA(n_components=3)),
 4: (0.800231176199247, PCA(n_components=4)),
 5: (0.8597023204773573, PCA(n_components=5)),
 6: (0.9030541842844486, PCA(n_components=6)),
 7: (0.9384494139592252, PCA(n_components=7)),
 8: (0.9608970568762618, PCA(n_components=8)),
 9: (0.9780803605616347, PCA(n_components=9)),
 10: (0.9894926473820839, PCA(n_components=10)),
 11: (0.9982080882848396, PCA(n_components=11)),
 12: (1.0000000000000002, PCA(n_components=12))}

In [104]:
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)

In [105]:
# Создание и обучение моделей
log_reg = LogisticRegression(penalty='l2')
log_reg.fit(X_train_scaled, y_train)

rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)

# Предсказание на тестовых данных
y_pred_log_reg = log_reg.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test_scaled)

# Оценка моделей
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_log_reg = classification_report(y_test, y_pred_log_reg)
report_rf = classification_report(y_test, y_pred_rf)

print(f"Accuracy of logistic regression: {accuracy_log_reg}")
print(report_log_reg)
print(f"Accuracy of random forest: {accuracy_rf}")
print(report_rf)

Accuracy of logistic regression: 0.7419354838709677
              precision    recall  f1-score   support

           f       0.65      0.85      0.73        13
           m       0.86      0.67      0.75        18

    accuracy                           0.74        31
   macro avg       0.75      0.76      0.74        31
weighted avg       0.77      0.74      0.74        31

Accuracy of random forest: 0.6129032258064516
              precision    recall  f1-score   support

           f       0.55      0.46      0.50        13
           m       0.65      0.72      0.68        18

    accuracy                           0.61        31
   macro avg       0.60      0.59      0.59        31
weighted avg       0.61      0.61      0.61        31

