In [2]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# 加载数据
data = pd.read_csv('new_output.csv')

# 数据预处理
# 删除不用于预测的列
data = data.drop(columns=['jobSalaryMax', 'jobSalaryMin', 'term', 'salary_max_range', 'jobSalaryMaxLevel', 'jobSalaryMinLevel'])

# 编码分类特征
label_encoders = {}
categorical_features = ['jobType', 'workAreaCode', 'degree', 'industryType1', 'companyType', 'companySizeCode']
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    data[feature] = label_encoders[feature].fit_transform(data[feature])

# 准备目标变量
target = 'jobSalaryMaxLevel'
y = data[target]
y = to_categorical(y)

# 准备特征变量
X = data.drop(columns=[target])

# 标准化特征
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 为CNN输入重塑数据
X = np.expand_dims(X, axis=2)

KeyError: 'jobSalaryMaxLevel'

In [None]:
# 训练集和测试集划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 构建CNN模型
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

# 编译模型
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 训练模型
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

In [None]:
# 评估模型
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# 输出分类报告和准确率
print("Classification Report:\n", classification_report(y_true, y_pred_classes))
print("Accuracy:", accuracy_score(y_true, y_pred_classes))

In [None]:
# 可视化训练过程
import matplotlib.pyplot as plt

# 绘制训练和验证的准确率曲线
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy Curve')

# 绘制训练和验证的损失曲线
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss Curve')

plt.show()