In [33]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import std
from collections import Counter
from numpy import mean
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the data set from a text file. Each non-empty line is expected to look like
#   <label> <feature_id>:<value> <feature_id>:<value> ...
with open('data.txt', 'r') as file:
    lines = file.readlines()

# Collect one dict per row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration (quadratic in the number of rows).
records = []
for line in lines:
    # Split the line on whitespace.
    parts = line.split()
    if not parts:
        # Blank line (e.g. a trailing newline at the end of the file): nothing to parse.
        continue

    # The first token is the class label; every other token is an id:value pair.
    row_data = {'Label': int(parts[0])}
    for part in parts[1:]:
        feature_number, feature_value = part.split(':')
        row_data[int(feature_number)] = float(feature_value)
    records.append(row_data)

# Fix the column order explicitly: the label first, then feature ids ascending.
# A feature id missing from a given row becomes NaN in that row.
feature_ids = sorted({key for row in records for key in row if key != 'Label'})
diabetes = pd.DataFrame(records, columns=['Label', *feature_ids])

# Show the first few rows.
print(diabetes.head())


   Label         1         2         3         4         5         6  \
0   -1.0 -0.294118  0.487437  0.180328 -0.292929 -1.000000  0.001490   
1    1.0 -0.882353 -0.145729  0.081967 -0.414141 -1.000000 -0.207153   
2   -1.0 -0.058824  0.839196  0.049180 -1.000000 -1.000000 -0.305514   
3    1.0 -0.882353 -0.105528  0.081967 -0.535354 -0.777778 -0.162444   
4   -1.0 -1.000000  0.376884 -0.344262 -0.292929 -0.602837  0.284650   

          7         8  
0 -0.531170 -0.033333  
1 -0.766866 -0.666667  
2 -0.492741 -0.633333  
3 -0.923997 -1.000000  
4  0.887276 -0.600000  


In [34]:
# Drop every row that has a NaN in any of the eight feature columns.
# dropna defaults to how='any', so one call over the full subset removes the
# same rows as chaining one call per column, in a single pass.
diabetes = diabetes.dropna(subset=[1, 2, 3, 4, 5, 6, 7, 8])

In [35]:
import numpy as np

class Perceptron:
    """Single-layer perceptron for binary classification.

    ``predict`` returns -1 or 1, so training labels are expected to use the
    same two values. ``weights[0]`` is the bias term and ``weights[1:]`` hold
    one weight per input feature. ``history`` stores, for each training epoch,
    the number of samples that triggered a weight update.
    """

    def __init__(self, input_size, learning_rate=0.1, epochs=100):
        """Create an untrained perceptron.

        Args:
            input_size: Number of features in each sample.
            learning_rate: Factor applied to every weight correction.
            epochs: Number of full passes ``train`` makes over the data.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = np.zeros(input_size + 1)  # Additional weight for the bias
        self.history = []

    def predict(self, inputs):
        """Classify a single sample.

        Args:
            inputs: One sample as an array-like of ``input_size`` numbers
                (list, tuple or 1-D numpy array).

        Returns:
            1 if the weighted sum plus bias is strictly positive, otherwise -1.
        """
        features = np.asarray(inputs, dtype=float)
        summation = np.dot(features, self.weights[1:]) + self.weights[0]
        return 1 if summation > 0 else -1

    def train(self, training_data, labels):
        """Fit the weights with the perceptron learning rule.

        Runs ``epochs`` passes over the data and appends the number of
        updated (misclassified) samples of each pass to ``history``.

        Args:
            training_data: Iterable of samples, each an array-like of
                ``input_size`` numbers.
            labels: Iterable of target labels, paired with ``training_data``
                by position.
        """
        for _ in range(self.epochs):
            errors = 0
            for inputs, label in zip(training_data, labels):
                # Coerce to an ndarray so plain lists/tuples work too: a float
                # times a Python list raises TypeError instead of scaling it.
                features = np.asarray(inputs, dtype=float)
                update = self.learning_rate * (label - self.predict(features))
                if update != 0:
                    # Misclassified sample: move the boundary towards it.
                    self.weights[1:] += update * features
                    self.weights[0] += update
                    errors += 1
            self.history.append(errors)



In [36]:
          
if __name__ == '__main__':

    # Separate the eight feature columns from the class label, as plain arrays.
    feature_columns = [1, 2, 3, 4, 5, 6, 7, 8]
    X = diabetes[feature_columns].values
    y = diabetes['Label'].values

    # Hold out 20% of the rows for testing, with a fixed seed for the split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the perceptron on the training rows.
    perceptron = Perceptron(input_size=8)
    perceptron.train(X_train, y_train)

    # The perceptron classifies one sample per call, so predict row by row.
    y_pred = [perceptron.predict(sample) for sample in X_test]

    # target_names are matched to the sorted labels, so -1 is reported as
    # 'non-diabetes' and 1 as 'diabetes'.
    # NOTE(review): confirm this matches the data set's label convention.
    print(classification_report(y_test, y_pred, target_names=['non-diabetes', 'diabetes']))
 

              precision    recall  f1-score   support

non-diabetes       0.88      0.15      0.25        47
    diabetes       0.72      0.99      0.84       105

    accuracy                           0.73       152
   macro avg       0.80      0.57      0.54       152
weighted avg       0.77      0.73      0.66       152



In [37]:
# Logistic regression model

# Feature matrix (columns 1-8) and target vector.
feature_columns = [1, 2, 3, 4, 5, 6, 7, 8]
X = diabetes[feature_columns]
y = diabetes['Label']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit logistic regression with the Newton-CG solver.
lr = LogisticRegression(solver="newton-cg")
lr.fit(X_train, y_train)

# Predict the held-out rows and report the accuracy.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy：", accuracy)

Model accuracy： 0.8355263157894737


In [38]:
# Per-class precision / recall / F1 for the logistic regression predictions.
# target_names are matched to the sorted labels (-1 first, then 1).
# NOTE(review): confirm that -1 really is the 'non-diabetes' class in this data set.
class_names = ['non-diabetes', 'diabetes']
print(classification_report(y_test, y_pred, target_names=class_names))

              precision    recall  f1-score   support

non-diabetes       0.79      0.64      0.71        47
    diabetes       0.85      0.92      0.89       105

    accuracy                           0.84       152
   macro avg       0.82      0.78      0.80       152
weighted avg       0.83      0.84      0.83       152



In [39]:
# Random forest model

# Feature matrix (columns 1-8) and target vector.
X = diabetes[[1,2,3,4,5,6,7,8]]
y = diabetes['Label']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well as the split: without random_state the bootstrap
# samples and feature choices differ on every run, so the reported accuracy
# could not be reproduced by re-running the notebook.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate the model on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy：", accuracy)

Model accuracy： 0.7763157894736842


In [40]:
# Per-class precision / recall / F1 for the random forest predictions.
# target_names are matched to the sorted labels (-1 first, then 1).
# NOTE(review): confirm that -1 really is the 'non-diabetes' class in this data set.
class_names = ['non-diabetes', 'diabetes']
print(classification_report(y_test, y_pred, target_names=class_names))

              precision    recall  f1-score   support

non-diabetes       0.66      0.57      0.61        47
    diabetes       0.82      0.87      0.84       105

    accuracy                           0.78       152
   macro avg       0.74      0.72      0.73       152
weighted avg       0.77      0.78      0.77       152

