In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import load_model

# 读取数据
data = pd.read_csv('data.csv')

# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(data['name'], data['label'], test_size=0.2, random_state=42)

# 特征提取
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 保存vectorizer
joblib.dump(vectorizer, './parameter/tfidf_vectorizer.pkl')

# 定义自定义神经网络模型
def create_nn_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X_train_tfidf.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrapping the Keras model so it can be used with scikit-learn
nn_model = KerasClassifier(build_fn=create_nn_model, epochs=10, batch_size=32, verbose=0)

# 定义模型
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Neural Network": nn_model
}

# 训练和评估模型
for model_name, model in models.items():
    print(f"Training {model_name}...")
    # 训练模型
    model.fit(X_train_tfidf, y_train)
    # 保存模型
    # if model_name == "Neural Network":
        # model.model.save(f'{model_name}.h5')
    # else:
    joblib.dump(model, f'./parameter/{model_name}.pkl')
    # 预测
    y_pred = model.predict(X_test_tfidf)
    # 计算精确度
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    # 打印分类报告
    print(f"{model_name} Classification Report:\n")
    print(classification_report(y_test, y_pred))



Training Logistic Regression...
Logistic Regression Accuracy: 0.7662
Logistic Regression Classification Report:

              precision    recall  f1-score   support

         Bot       0.98      0.37      0.54       155
   Community       0.00      0.00      0.00        16
     Company       0.00      0.00      0.00         2
      Domain       0.75      0.99      0.85       460
  Foundation       0.00      0.00      0.00        11
     Project       0.50      0.67      0.57         6
        Tech       0.80      0.13      0.23        30

    accuracy                           0.77       680
   macro avg       0.43      0.31      0.31       680
weighted avg       0.77      0.77      0.71       680

Training SVM...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Accuracy: 0.7735
SVM Classification Report:

              precision    recall  f1-score   support

         Bot       0.98      0.39      0.56       155
   Community       0.00      0.00      0.00        16
     Company       0.00      0.00      0.00         2
      Domain       0.75      0.99      0.86       460
  Foundation       0.00      0.00      0.00        11
     Project       0.86      1.00      0.92         6
        Tech       0.67      0.13      0.22        30

    accuracy                           0.77       680
   macro avg       0.47      0.36      0.37       680
weighted avg       0.77      0.77      0.72       680

Training Random Forest...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Accuracy: 0.7647
Random Forest Classification Report:

              precision    recall  f1-score   support

         Bot       0.87      0.42      0.57       155
   Community       0.00      0.00      0.00        16
     Company       0.00      0.00      0.00         2
      Domain       0.76      0.97      0.85       460
  Foundation       0.00      0.00      0.00        11
     Project       0.86      1.00      0.92         6
        Tech       0.45      0.17      0.24        30

    accuracy                           0.76       680
   macro avg       0.42      0.36      0.37       680
weighted avg       0.74      0.76      0.72       680

Training Neural Network...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  X, y = self._initialize(X, y)


INFO:tensorflow:Assets written to: ram://05245fac18454da5822ff182e35fd1e6/assets


INFO:tensorflow:Assets written to: ram://05245fac18454da5822ff182e35fd1e6/assets


Neural Network Accuracy: 0.2279
Neural Network Classification Report:

              precision    recall  f1-score   support

         Bot       0.23      1.00      0.37       155
   Community       0.00      0.00      0.00        16
     Company       0.00      0.00      0.00         2
      Domain       0.00      0.00      0.00       460
  Foundation       0.00      0.00      0.00        11
     Project       0.00      0.00      0.00         6
        Tech       0.00      0.00      0.00        30

    accuracy                           0.23       680
   macro avg       0.03      0.14      0.05       680
weighted avg       0.05      0.23      0.08       680



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# 读取保存的模型参数并进行分类任务
def load_model_and_predict(model_name, new_data):
    vectorizer = joblib.load('./parameter/tfidf_vectorizer.pkl')
    X_new_tfidf = vectorizer.transform(new_data)

    if model_name == "Neural Network":
        model = load_model(f'./parameter/{model_name}.h5')
        y_pred = model.predict(X_new_tfidf).flatten()
        y_pred = [1 if y > 0.5 else 0 for y in y_pred]
    else:
        model = joblib.load(f'./parameter/{model_name}.pkl')
        y_pred = model.predict(X_new_tfidf)
    
    return y_pred

# 新的分类任务数据
new_data = pd.Series(["cbdddbot", "新的名字2", "open/asbsdv"])

# 加载模型并进行分类
# 可以选择不同的模型
# SVM
# Random Forest
# Logistic Regression
model_name = "SVM" 
predictions = load_model_and_predict(model_name, new_data)

# 展示分类结果
for name, label in zip(new_data, predictions):
    print(f"Name: {name}, Predicted Label: {label}")



Name: cbdddbot, Predicted Label: Domain
Name: 新的名字2, Predicted Label: Domain
Name: open/asbsdv, Predicted Label: Tech
