In [4]:
import datetime
import pytz
def export_csv(df):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y-%m-%d %H時%M分%S秒')
  df.to_csv(formatted_time + "predict.csv", index=False,encoding="utf_8_sig")
  # df.to_csv(corpus_root + formatted_time + "predict.csv", index=False,encoding="utf_8_sig")

### 架設模型

In [None]:
import re

# 情緒字典標記函數
emotion_keywords = {
    1: ['boy', 'girlfriend', 'man','handsome','87'],  # 正面關鍵詞列表
    -1: ['girl', 'pretty','beautiful']  # 負面關鍵詞列表
}

def emotion_label(text):
    # 检查是否为有效字符串或字节对象
    if isinstance(text, str) or isinstance(text, bytes):
        # 中文检查
        if re.search(r'[\u4e00-\u9fff]+', text):
            return 1

        # 分词
        words = re.findall(r'\b\w+\b', text)

        # 检查正面情绪关键词
        if any(word in emotion_keywords[1] for word in words):
            return 1

        # 检查负面情绪关键词
        if any(word in emotion_keywords[-1] for word in words):
            return -1

    # 如果不是有效的字符串或字节对象，返回中性情绪标签
    return 0

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
import numpy as np

# 讀取訓練資料集
train_data = pd.read_csv(r"..\KNN\dataset\KNN_without_outlier.csv")

# 套用情緒字典
train_data['self_intro'] = train_data['self_intro'].str.lower()
train_data['emotion'] = np.vectorize(emotion_label)(train_data['self_intro'])

# 將self_intro欄位從訓練資料中移除，因為這裡不打算使用該欄位作為特徵
train_data = train_data.drop(columns=['self_intro'])

# 將性別標籤設置為0和1，其中1代表男性，2代表女性
train_data['gender'] = train_data['gender'].apply(lambda x: 1 if x == 1 else 0)

# 將資料集分為特徵（X）和標籤（y）
X = train_data.drop(columns=['gender'])
y = train_data['gender']

# 將資料集分為訓練集和驗證集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化標準化器
scaler = StandardScaler()

# 對訓練集和驗證集進行標準化
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 初始化隨機森林分類器
rf = RandomForestClassifier(random_state=42)

# 定義超參數範圍
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 使用 RapidSearch (GridSearchCV) 進行超參數優化
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 獲取最佳模型
best_rf = grid_search.best_estimator_

# 在驗證集上進行預測
val_predictions = best_rf.predict(X_val_scaled)

# 計算模型在驗證集上的準確率
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", accuracy)

# 進行測試資料集的預測
test_data = pd.read_csv(r"..\KNN\dataset\test_KNN_without_outlier.csv")

# 套用情緒字典
test_data['self_intro'] = test_data['self_intro'].astype(str).str.lower()
test_data['emotion'] = np.vectorize(emotion_label)(test_data['self_intro'])

test_data = test_data.drop(columns=['self_intro', 'id', 'gender'])
test_data_scaled = scaler.transform(test_data)
test_predictions = best_rf.predict(test_data_scaled)

# 建立新的 DataFrame 來存放預測結果
result_df = pd.DataFrame({'ID': range(1, len(test_predictions) + 1), 'gender': [2 if pred == 0 else pred for pred in test_predictions]})

# 將結果存入新的 CSV 檔案中
result_df.to_csv('prediction_result_RF_rapid.csv', index=False)

# 輸出預測結果
print(result_df)


Validation Accuracy: 0.9518072289156626
      ID  gender
0      1       1
1      2       1
2      3       2
3      4       1
4      5       1
..   ...     ...
390  391       2
391  392       1
392  393       2
393  394       1
394  395       1

[395 rows x 2 columns]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 假设 preprocess_data 是您预处理后的数据
X = preprocess_data.drop(['gender', 'id', 'self_intro'], axis=1)
y = preprocess_data['gender']  # 标签

# 分割数据为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化SVM模型
svm_model = SVC(kernel='linear')  # 使用线性核，您可以根据需要调整参数

# 训练模型
svm_model.fit(X_train, y_train)

# 预测测试集
y_pred = svm_model.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.3f}')

In [8]:
X_test['gender'] = y_test
X_test['pred'] = y_pred
X_test

Unnamed: 0,height,weight,sleepiness,iq,fb_friends,yt,star_sign_天秤座,star_sign_天蠍座,star_sign_射手座,star_sign_巨蟹座,...,star_sign_獅子座,star_sign_處女座,star_sign_金牛座,star_sign_雙子座,star_sign_雙魚座,phone_os_Android,phone_os_Apple,emotion,gender,pred
118,-0.044873,-1.470766,1.0,122.0,1029.0,9.0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
300,-0.044873,0.396164,5.0,120.0,1300.0,5.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
247,0.797630,1.771796,1.0,180.0,1.0,100.0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,1
58,0.095544,0.003126,3.0,120.0,400.0,0.2,0,0,0,0,...,0,0,0,1,0,0,0,-1,2,2
75,-0.044873,-0.389912,4.0,120.0,650.0,5.0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,0.797630,0.052256,3.0,100.0,400.0,2.0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
365,0.095544,0.052256,3.0,120.0,600.0,2.0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,1
140,0.095544,0.887461,2.0,180.0,400.0,2.0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
3,-0.044873,-0.193393,4.0,100.0,173.0,5.0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1


In [9]:
filtered_data = X_test[X_test['gender'] != X_test['pred']]
filtered_data

Unnamed: 0,height,weight,sleepiness,iq,fb_friends,yt,star_sign_天秤座,star_sign_天蠍座,star_sign_射手座,star_sign_巨蟹座,...,star_sign_獅子座,star_sign_處女座,star_sign_金牛座,star_sign_雙子座,star_sign_雙魚座,phone_os_Android,phone_os_Apple,emotion,gender,pred
105,0.095544,-0.389912,2.0,135.0,400.0,2.0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,1
205,-0.325708,-1.470766,1.0,120.0,2000.0,2.0,0,0,0,0,...,1,0,0,0,0,0,1,0,2,1
87,0.095544,-1.372506,3.0,100.0,50.0,3.0,0,0,0,0,...,0,0,0,0,0,1,0,0,2,1
357,0.095544,-0.881209,3.0,120.0,600.0,1.0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,1
22,-0.044873,-0.68469,4.0,100.0,1200.0,2.0,0,0,0,0,...,0,0,0,1,0,1,0,0,2,1
365,0.095544,0.052256,3.0,120.0,600.0,2.0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,1


### 測試集
# 全部為395
#全男生1為0.65989 約 261
#全女生2為0.34010 約 134

In [15]:
test_data

Unnamed: 0,id,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,1,0,天蠍座,Apple,,100.0,1.0,87.000,87.0,87.0,GOod
1,2,0,金牛座,Apple,175.0,80.0,3.0,130.000,2000.0,30.0,Easygoing
2,3,0,雙子座,Apple,155.0,45.0,3.0,150.000,400.0,9.0,I LOVE INTEL
3,4,0,處女座,Apple,173.0,85.0,4.0,100.000,2000.0,15.0,"I'm a hard-work man, just do my best to finish..."
4,5,0,射手座,Android,164.0,57.0,4.0,130.000,505.0,2.0,I'm smart
...,...,...,...,...,...,...,...,...,...,...,...
390,391,0,處女座,Android,160.0,48.0,3.0,75.000,98.0,2.0,Starting by Starting
391,392,0,,Apple,170.0,,,105.000,510.0,,A little bit smart
392,393,0,金牛座,Apple,160.0,45.0,4.0,100.000,600.0,2000.0,Hi
393,394,0,巨蟹座,,180.0,,,199.999,,60.0,"I'm not beautiful, but smart 😀"


In [31]:
# 假设 test_data 是您的测试数据

# 与训练数据相同的预处理步骤
test_data_processed = test_data.copy()  # 复制测试数据

## 删除手機極端值

test_data_processed['star_sign'] = test_data_processed['star_sign'].astype('category')
test_data_processed['phone_os'] = test_data_processed['phone_os'].astype('category')
test_data_processed = pd.get_dummies(test_data_processed, columns=['star_sign', 'phone_os'])
test_data_processed.replace({True: 1, False: 0}, inplace=True)

## 处理缺失值
test_data_processed['height'] = test_data_processed['height'].fillna(preprocess_data['height'].median())
test_data_processed['weight'] = test_data_processed['weight'].fillna(preprocess_data['weight'].median())
test_data_processed['sleepiness'] = test_data_processed['sleepiness'].fillna(preprocess_data['sleepiness'].median())
test_data_processed['iq'] = test_data_processed['iq'].fillna(preprocess_data['iq'].median())
test_data_processed['fb_friends'] = test_data_processed['fb_friends'].fillna(preprocess_data['fb_friends'].median())
test_data_processed['yt'] = pd.to_numeric(test_data_processed['yt'], errors='coerce')
test_data_processed['yt'] = test_data_processed['yt'].fillna(preprocess_data['yt'].median())


## 标准化
test_data_processed[['height', 'weight']] = scaler.transform(test_data_processed[['height', 'weight']])

test_data_processed

  test_data_processed.replace({True: 1, False: 0}, inplace=True)


Unnamed: 0,id,gender,height,weight,sleepiness,iq,fb_friends,yt,self_intro,star_sign_天秤座,...,star_sign_水瓶座,star_sign_牡羊座,star_sign_獅子座,star_sign_處女座,star_sign_金牛座,star_sign_雙子座,star_sign_雙魚座,phone_os_Android,phone_os_Apple,phone_os_Windows phone
0,1,0,-23.902397,3.540466,1.0,87.000,87.0,87.0,GOod,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,0.657213,1.575277,3.0,130.000,2000.0,30.0,Easygoing,0,...,0,0,0,0,1,0,0,0,1,0
2,3,0,-2.151133,-1.863803,3.0,150.000,400.0,9.0,I LOVE INTEL,0,...,0,0,0,0,0,1,0,0,1,0
3,4,0,0.376378,2.066574,4.0,100.000,2000.0,15.0,"I'm a hard-work man, just do my best to finish...",0,...,0,0,0,1,0,0,0,0,1,0
4,5,0,-0.887377,-0.684690,4.0,130.000,505.0,2.0,I'm smart,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,391,0,-1.449046,-1.569025,3.0,75.000,98.0,2.0,Starting by Starting,0,...,0,0,0,1,0,0,0,1,0,0
391,392,0,-0.044873,-6.280343,3.0,105.000,510.0,2.0,A little bit smart,0,...,0,0,0,0,0,0,0,0,1,0
392,393,0,-1.449046,-1.863803,4.0,100.000,600.0,2000.0,Hi,0,...,0,0,0,0,1,0,0,0,1,0
393,394,0,1.359300,-6.280343,3.0,199.999,400.0,60.0,"I'm not beautiful, but smart 😀",0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
import numpy as np  # 导入 numpy 库

def emotion_label(text):
    # 检查是否为有效字符串或字节对象
    if isinstance(text, str) or isinstance(text, bytes):
        # 中文检查
        if re.search(r'[\u4e00-\u9fff]+', text):
            return 1

        # 分词
        words = re.findall(r'\b\w+\b', text)

        # 检查正面情绪关键词
        if any(word in emotion_keywords[1] for word in words):
            return 1

        # 检查负面情绪关键词
        if any(word in emotion_keywords[-1] for word in words):
            return -1

    # 如果不是有效的字符串或字节对象，返回中性情绪标签
    return 0

# 应用函数标记情绪
test_data_processed['self_intro'] = test_data_processed['self_intro'].str.lower()
# 使用 np.vectorize 将函数应用到 Series，忽略错误
test_data_processed['emotion'] = np.vectorize(emotion_label)(test_data_processed['self_intro'])
test_data_processed

Unnamed: 0,id,gender,height,weight,sleepiness,iq,fb_friends,yt,self_intro,star_sign_天秤座,...,star_sign_牡羊座,star_sign_獅子座,star_sign_處女座,star_sign_金牛座,star_sign_雙子座,star_sign_雙魚座,phone_os_Android,phone_os_Apple,phone_os_Windows phone,emotion
0,1,0,-23.902397,3.540466,1.0,87.000,87.0,87.0,good,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0,0.657213,1.575277,3.0,130.000,2000.0,30.0,easygoing,0,...,0,0,0,1,0,0,0,1,0,0
2,3,0,-2.151133,-1.863803,3.0,150.000,400.0,9.0,i love intel,0,...,0,0,0,0,1,0,0,1,0,0
3,4,0,0.376378,2.066574,4.0,100.000,2000.0,15.0,"i'm a hard-work man, just do my best to finish...",0,...,0,0,1,0,0,0,0,1,0,1
4,5,0,-0.887377,-0.684690,4.0,130.000,505.0,2.0,i'm smart,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,391,0,-1.449046,-1.569025,3.0,75.000,98.0,2.0,starting by starting,0,...,0,0,1,0,0,0,1,0,0,0
391,392,0,-0.044873,-6.280343,3.0,105.000,510.0,2.0,a little bit smart,0,...,0,0,0,0,0,0,0,1,0,0
392,393,0,-1.449046,-1.863803,4.0,100.000,600.0,2000.0,hi,0,...,0,0,0,1,0,0,0,1,0,0
393,394,0,1.359300,-6.280343,3.0,199.999,400.0,60.0,"i'm not beautiful, but smart 😀",0,...,0,0,0,0,0,0,0,0,0,-1


In [33]:
test_data_processed = test_data_processed.drop(['gender', 'id', 'self_intro'], axis=1)
test_data_processed = test_data_processed[X_train.columns]
test_data_processed

# 使用训练好的 SVM 模型进行预测
y_pred_test = svm_model.predict(test_data_processed)

# 输出预测结果
print(y_pred_test)

[2 1 2 1 2 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1
 1 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 1 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 2 2 1 1 2
 2 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1
 1 1 2 1 2 1 1 1 1 2 2 2 1 2 1 2 1 1 2 2 1 2 2 1 1 1 2 1 1 2 1 1 1 2 1 1 2
 1 1 1 1 2 2 1 2 1 1 2 2 2 1 2 1 2 1 1 2 1 1 1 2 2 2 2 1 2 1 1 1 2 2 1 1 2
 2 1 1 2 2 1 1 2 2 1 2 1 1 1 1 2 2 1 1 1 2 1 2 2 1 1 1 1 2 1 1 1 1 1 2 1 1
 1 2 1 2 1 2 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1
 1 1 1 1 2 2 1 2 2 1 1 2 2 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1
 2 1 1 2 2 2 2 1 1 1 1 2 2 1 2 1 1 2 1 2 2 1 1 1 2 1 2 2 2 2 1 1 1 2 1 2 1
 2 1 1 1 2 2 1 2 1 1 2 1 1 1 1 1 2 1 1 1 2 2 1 2 1 1 2 2 2 1 2 1 1 1 1 1 2
 1 2 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 2 1 2 2 2 2 2 1]


In [37]:
test_data['gender'] = y_pred_test
test_data['gender']

0      2
1      1
2      2
3      1
4      2
      ..
390    2
391    2
392    2
393    2
394    1
Name: gender, Length: 395, dtype: int64

In [38]:
output = test_data[['id', 'gender']]
output

Unnamed: 0,id,gender
0,1,2
1,2,1
2,3,2
3,4,1
4,5,2
...,...,...
390,391,2
391,392,2
392,393,2
393,394,2


In [40]:
export_csv(output)