In [1]:
import pandas as pd

In [23]:
df = pd.read_csv("twidata.csv")
df

Unnamed: 0,author_id,created_at,in_reply_to_user_id,possibly_sensitive,retweet_count,like_count,text,label
0,2.902546e+09,2016-09-01 01:06:22+00:00,,False,22,73,"And now, the time has come for us to sign off....",human
1,2.902546e+09,2016-08-31 23:56:32+00:00,,False,6,7,"Meanwhile back in Yemen, reports continue to f...",human
2,2.919637e+09,2022-02-20 10:46:50+00:00,,False,0,1,Mudavadi is telling Kenyans that Hon Raila is ...,bot
3,2.919637e+09,2022-02-18 17:43:39+00:00,,False,0,1,Kenyans have already chosen Raila as the best ...,bot
4,2.919637e+09,2022-02-18 17:39:40+00:00,,False,0,0,Mr DP Ruto resign now!!!. Why becoming a doubl...,bot
...,...,...,...,...,...,...,...,...
29995,1.624546e+07,2022-02-22 02:55:08+00:00,16245462.0,False,1,1,"...and no, I refuse to hot link these shit sta...",human
29996,4.552087e+07,2022-02-21 05:52:44+00:00,,False,0,0,I need Cassie in a horror movie,human
29997,4.552087e+07,2022-02-21 05:41:01+00:00,,False,0,1,East highland high a mess,human
29998,4.552087e+07,2022-02-21 04:17:39+00:00,,False,0,0,Okay but luna lowkey eating kat up,human


# twibot_tweet0_new.csvを用いた学習

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline as ImbPipeline
import re
from textblob import TextBlob
from tqdm import tqdm
import time
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb

def extract_features(df):
    df['text_length'] = df['text'].str.len()
    df['has_url'] = df['text'].str.contains('http').astype(int)
    df['hour'] = pd.to_datetime(df['created_at']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
    df['is_reply'] = df['in_reply_to_user_id'].notna().astype(int)
    df['hashtag_count'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
    df['mention_count'] = df['text'].apply(lambda x: len(re.findall(r'@\w+', x)))
    df['retweet_like_ratio'] = df['retweet_count'] / (df['like_count'] + 1)
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['unique_words_ratio'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['tweet_frequency'] = df.groupby('author_id')['created_at'].transform('count')
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in '.,;:!?'))
    df['emoji_count'] = df['text'].apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', x)))
    df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text'].apply(lambda x: x.count('?'))
    df['uppercase_word_count'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
    df['url_count'] = df['text'].apply(lambda x: len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)))
    df['digit_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['special_char_count'] = df['text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', x)))
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_char_ratio'] = df['text'].apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)
    df['tweet_hour_bin'] = pd.cut(df['hour'], bins=4, labels=[0,1,2,3])
    df['tweet_day_bin'] = pd.cut(df['day_of_week'], bins=3, labels=[0,1,2])
    df['text_complexity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    return df

def get_bert_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

class XGBoostProgressCallback(xgb.callback.TrainingCallback):
    def __init__(self, display_interval=100, total_iterations=1000):
        self.display_interval = display_interval
        self.total_iterations = total_iterations

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.display_interval == 0 or (epoch + 1) == self.total_iterations:
            print(f'XGBoost progress: {epoch + 1}/{self.total_iterations} iterations')
        return False

print("データの読み込みと前処理を開始します...")
df = pd.read_csv('twibot_tweet0_new.csv')
df = extract_features(df)
print("前処理が完了しました。")

X_text = df['text']
numeric_features = [col for col in df.columns if col not in ['text', 'label', 'author_id', 'created_at', 'in_reply_to_user_id']]
X_numeric = df[numeric_features]
y = df['label'].map({'human': 0, 'bot': 1})

X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=25000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

print("BERTモデルを使用して文章の埋め込みを生成します...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
X_train_bert = get_bert_embeddings(X_train_text, model, tokenizer)
X_test_bert = get_bert_embeddings(X_test_text, model, tokenizer)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_numeric_scaled, X_train_bert])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_numeric_scaled, X_test_bert])

variance_threshold = VarianceThreshold(threshold=0.005)
X_train_var_selected = variance_threshold.fit_transform(X_train_combined)
X_test_var_selected = variance_threshold.transform(X_test_combined)

selector = SelectKBest(f_classif, k="all")
X_train_selected = selector.fit_transform(X_train_var_selected, y_train)
X_test_selected = selector.transform(X_test_var_selected)

# SMOTEとENNの比率を調整
smote = SMOTE(sampling_strategy=0.8, random_state=42)
enn = EditedNearestNeighbours() 
steps = [('over', smote), ('under', enn)]
pipeline = ImbPipeline(steps=steps)



X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_selected, y_train)
# モデルのハイパーパラメータを調整
models = {
    'RandomForest': RandomForestClassifier(n_estimators=1500, max_depth=40, min_samples_split=5, min_samples_leaf=2, class_weight='balanced', random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=1500, learning_rate=0.03, max_depth=8, min_child_weight=2, subsample=0.8, colsample_bytree=0.8, random_state=42, callbacks=[XGBoostProgressCallback(display_interval=200, total_iterations=1500)], n_jobs=-1),
    'SVM': SVC(kernel='rbf', C=50, gamma='auto', probability=True, class_weight='balanced', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(300, 150, 75), max_iter=1500, alpha=0.0005, learning_rate='adaptive', random_state=42)
}

print("モデルの学習を開始します...")
for name, model in tqdm(models.items()):
    print(f"{name}の学習を開始します...")
    start_time = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    print(f"{name}の学習が完了しました。学習時間: {end_time - start_time:.2f}秒")

print("アンサンブル予測を行います...")
y_pred_rf = models['RandomForest'].predict_proba(X_test_selected)[:, 1]
y_pred_xgb = models['XGBoost'].predict_proba(X_test_selected)[:, 1]
y_pred_svm = models['SVM'].predict_proba(X_test_selected)[:, 1]
y_pred_mlp = models['MLP'].predict_proba(X_test_selected)[:, 1]

# アンサンブルの重みを調整
y_pred_ensemble = 0.35 * y_pred_rf + 0.35 * y_pred_xgb + 0.15 * y_pred_svm + 0.15 * y_pred_mlp
y_pred_final = (y_pred_ensemble > 0.2).astype(int)  # 閾値を0.3から0.2に下げてボット検出を改善

# 閾値の最適化
from sklearn.metrics import f1_score

def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 1.0, 0.05)
    f1_scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
    return thresholds[np.argmax(f1_scores)]

optimal_threshold = find_optimal_threshold(y_test, y_pred_ensemble)
y_pred_final = (y_pred_ensemble > optimal_threshold).astype(int)

print(f"最適な閾値: {optimal_threshold}")
print("最終的な予測結果:")
print(classification_report(y_test, y_pred_final, target_names=['human', 'bot']))

データの読み込みと前処理を開始します...
前処理が完了しました。
BERTモデルを使用して文章の埋め込みを生成します...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [07:10<00:00, 18.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:54<00:00, 17.40it/s]


モデルの学習を開始します...


  0%|                                                                                                         | 0/4 [00:00<?, ?it/s]

RandomForestの学習を開始します...


 25%|████████████████████████▎                                                                        | 1/4 [01:09<03:29, 69.79s/it]

RandomForestの学習が完了しました。学習時間: 69.78秒
XGBoostの学習を開始します...
XGBoost progress: 200/1500 iterations
XGBoost progress: 400/1500 iterations
XGBoost progress: 600/1500 iterations
XGBoost progress: 800/1500 iterations
XGBoost progress: 1000/1500 iterations
XGBoost progress: 1200/1500 iterations
XGBoost progress: 1400/1500 iterations


 50%|████████████████████████████████████████████████▌                                                | 2/4 [02:08<02:07, 63.53s/it]

XGBoost progress: 1500/1500 iterations
XGBoostの学習が完了しました。学習時間: 59.15秒
SVMの学習を開始します...


 75%|████████████████████████████████████████████████████████████████████████▊                        | 3/4 [02:57<00:56, 56.86s/it]

SVMの学習が完了しました。学習時間: 48.92秒
MLPの学習を開始します...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:12<00:00, 48.14s/it]

MLPの学習が完了しました。学習時間: 14.71秒
アンサンブル予測を行います...





最適な閾値: 0.8500000000000002
最終的な予測結果:
              precision    recall  f1-score   support

       human       0.93      0.78      0.85      1796
         bot       0.20      0.50      0.29       204

    accuracy                           0.75      2000
   macro avg       0.57      0.64      0.57      2000
weighted avg       0.86      0.75      0.79      2000



In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
import re
from textblob import TextBlob
from tqdm import tqdm
import time
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb

def extract_features(df):
    df['text_length'] = df['text'].str.len()
    df['has_url'] = df['text'].str.contains('http').astype(int)
    df['hour'] = pd.to_datetime(df['created_at']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
    df['is_reply'] = df['in_reply_to_user_id'].notna().astype(int)
    df['hashtag_count'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
    df['mention_count'] = df['text'].apply(lambda x: len(re.findall(r'@\w+', x)))
    df['retweet_like_ratio'] = df['retweet_count'] / (df['like_count'] + 1)
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['unique_words_ratio'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['tweet_frequency'] = df.groupby('author_id')['created_at'].transform('count')
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in '.,;:!?'))
    df['emoji_count'] = df['text'].apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', x)))
    df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text'].apply(lambda x: x.count('?'))
    df['uppercase_word_count'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
    df['url_count'] = df['text'].apply(lambda x: len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)))
    df['digit_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['special_char_count'] = df['text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', x)))
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_char_ratio'] = df['text'].apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)
    df['tweet_hour_bin'] = pd.cut(df['hour'], bins=4, labels=[0,1,2,3])
    df['tweet_day_bin'] = pd.cut(df['day_of_week'], bins=3, labels=[0,1,2])
    df['text_complexity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['char_count'] = df['text'].apply(len)
    df['word_density'] = df['text'].apply(lambda x: len(x.split()) / (len(x) + 1))
    return df

def get_bert_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

class XGBoostProgressCallback(xgb.callback.TrainingCallback):
    def __init__(self, display_interval=100, total_iterations=1000):
        self.display_interval = display_interval
        self.total_iterations = total_iterations

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.display_interval == 0 or (epoch + 1) == self.total_iterations:
            print(f'XGBoost progress: {epoch + 1}/{self.total_iterations} iterations')
        return False

print("データの読み込みと前処理を開始します...")
df = pd.read_csv('twibot_tweet0_new.csv')
df = extract_features(df)
print("前処理が完了しました。")

X_text = df['text']
numeric_features = [col for col in df.columns if col not in ['text', 'label', 'author_id', 'created_at', 'in_reply_to_user_id']]
X_numeric = df[numeric_features]
y = df['label'].map({'human': 0, 'bot': 1})

X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

print("BERTモデルを使用して文章の埋め込みを生成します...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
X_train_bert = get_bert_embeddings(X_train_text, model, tokenizer)
X_test_bert = get_bert_embeddings(X_test_text, model, tokenizer)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_numeric_scaled, X_train_bert])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_numeric_scaled, X_test_bert])

variance_threshold = VarianceThreshold(threshold=0.001)
X_train_var_selected = variance_threshold.fit_transform(X_train_combined)
X_test_var_selected = variance_threshold.transform(X_test_combined)

selector = SelectKBest(f_classif, k="all")
X_train_selected = selector.fit_transform(X_train_var_selected, y_train)
X_test_selected = selector.transform(X_test_var_selected)


smote = SMOTE(sampling_strategy='auto', random_state=42)
enn = EditedNearestNeighbours()
X_smote, y_smote = smote.fit_resample(X_train_selected, y_train)
X_train_resampled, y_train_resampled = enn.fit_resample(X_smote, y_smote)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=2000, max_depth=50, min_samples_split=4, min_samples_leaf=2, class_weight='balanced', random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=2000, learning_rate=0.02, max_depth=10, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, random_state=42, callbacks=[XGBoostProgressCallback(display_interval=200, total_iterations=2000)], n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, min_samples_split=4, min_samples_leaf=2, random_state=42),
    'SVM': SVC(kernel='rbf', C=100, gamma='scale', probability=True, class_weight='balanced', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(400, 200, 100), max_iter=2000, alpha=0.0001, learning_rate='adaptive', random_state=42)
}

print("モデルの学習を開始します...")
for name, model in tqdm(models.items()):
    print(f"{name}の学習を開始します...")
    start_time = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    print(f"{name}の学習が完了しました。学習時間: {end_time - start_time:.2f}秒")

print("アンサンブル予測を行います...")
y_pred_rf = models['RandomForest'].predict_proba(X_test_selected)[:, 1]
y_pred_xgb = models['XGBoost'].predict_proba(X_test_selected)[:, 1]
y_pred_gb = models['GradientBoosting'].predict_proba(X_test_selected)[:, 1]
y_pred_svm = models['SVM'].predict_proba(X_test_selected)[:, 1]
y_pred_mlp = models['MLP'].predict_proba(X_test_selected)[:, 1]

y_pred_ensemble = 0.3 * y_pred_rf + 0.3 * y_pred_xgb + 0.2 * y_pred_gb + 0.1 * y_pred_svm + 0.1 * y_pred_mlp

def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 1.0, 0.01)
    f1_scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
    return thresholds[np.argmax(f1_scores)]

optimal_threshold = find_optimal_threshold(y_test, y_pred_ensemble)
y_pred_final = (y_pred_ensemble > optimal_threshold).astype(int)

print(f"最適な閾値: {optimal_threshold}")
print("最終的な予測結果:")
print(classification_report(y_test, y_pred_final, target_names=['human', 'bot']))


データの読み込みと前処理を開始します...
前処理が完了しました。
BERTモデルを使用して文章の埋め込みを生成します...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [08:05<00:00, 16.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:58<00:00, 16.94it/s]


モデルの学習を開始します...


  0%|                                                                                                         | 0/5 [00:00<?, ?it/s]

RandomForestの学習を開始します...


 20%|███████████████████▏                                                                            | 1/5 [03:52<15:30, 232.54s/it]

RandomForestの学習が完了しました。学習時間: 232.53秒
XGBoostの学習を開始します...
XGBoost progress: 200/2000 iterations
XGBoost progress: 400/2000 iterations
XGBoost progress: 600/2000 iterations
XGBoost progress: 800/2000 iterations
XGBoost progress: 1000/2000 iterations
XGBoost progress: 1200/2000 iterations
XGBoost progress: 1400/2000 iterations
XGBoost progress: 1600/2000 iterations
XGBoost progress: 1800/2000 iterations


 40%|██████████████████████████████████████▍                                                         | 2/5 [07:44<11:36, 232.14s/it]

XGBoost progress: 2000/2000 iterations
XGBoostの学習が完了しました。学習時間: 231.87秒
GradientBoostingの学習を開始します...


 60%|██████████████████████████████████████████████████████▌                                    | 3/5 [1:56:27<1:43:29, 3104.81s/it]

GradientBoostingの学習が完了しました。学習時間: 6523.27秒
SVMの学習を開始します...


 80%|██████████████████████████████████████████████████████████████████████████▍                  | 4/5 [2:02:31<33:42, 2022.59s/it]

SVMの学習が完了しました。学習時間: 363.54秒
MLPの学習を開始します...


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [2:03:42<00:00, 1484.56s/it]

MLPの学習が完了しました。学習時間: 71.54秒
アンサンブル予測を行います...





最適な閾値: 0.20999999999999996
最終的な予測結果:
              precision    recall  f1-score   support

       human       0.92      0.97      0.95      1796
         bot       0.54      0.30      0.39       204

    accuracy                           0.90      2000
   macro avg       0.73      0.64      0.67      2000
weighted avg       0.89      0.90      0.89      2000



In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline as ImbPipeline
import re
from textblob import TextBlob
from tqdm import tqdm
import time
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb

def extract_features(df):
    df['text_length'] = df['text'].str.len()
    df['has_url'] = df['text'].str.contains('http').astype(int)
    df['hour'] = pd.to_datetime(df['created_at']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
    df['is_reply'] = df['in_reply_to_user_id'].notna().astype(int)
    df['hashtag_count'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
    df['mention_count'] = df['text'].apply(lambda x: len(re.findall(r'@\w+', x)))
    df['retweet_like_ratio'] = df['retweet_count'] / (df['like_count'] + 1)
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['unique_words_ratio'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['tweet_frequency'] = df.groupby('author_id')['created_at'].transform('count')
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in '.,;:!?'))
    df['emoji_count'] = df['text'].apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', x)))
    df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text'].apply(lambda x: x.count('?'))
    df['uppercase_word_count'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
    df['url_count'] = df['text'].apply(lambda x: len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)))
    df['digit_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['special_char_count'] = df['text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', x)))
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_char_ratio'] = df['text'].apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)
    df['tweet_hour_bin'] = pd.cut(df['hour'], bins=4, labels=[0,1,2,3])
    df['tweet_day_bin'] = pd.cut(df['day_of_week'], bins=3, labels=[0,1,2])
    df['text_complexity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['char_count'] = df['text'].apply(len)
    df['word_density'] = df['text'].apply(lambda x: len(x.split()) / (len(x) + 1))
    df['punctuation_ratio'] = df['punctuation_count'] / df['char_count']
    df['url_to_text_ratio'] = df['url_count'] / df['text_length']
    df['hashtag_to_text_ratio'] = df['hashtag_count'] / df['text_length']
    df['mention_to_text_ratio'] = df['mention_count'] / df['text_length']
    return df

def get_bert_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

class XGBoostProgressCallback(xgb.callback.TrainingCallback):
    def __init__(self, display_interval=100, total_iterations=1000):
        self.display_interval = display_interval
        self.total_iterations = total_iterations

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.display_interval == 0 or (epoch + 1) == self.total_iterations:
            print(f'XGBoost progress: {epoch + 1}/{self.total_iterations} iterations')
        return False

print("データの読み込みと前処理を開始します...")
df = pd.read_csv('twibot_tweet0_new.csv')
df = extract_features(df)
print("前処理が完了しました。")

X_text = df['text']
numeric_features = [col for col in df.columns if col not in ['text', 'label', 'author_id', 'created_at', 'in_reply_to_user_id']]
X_numeric = df[numeric_features]
y = df['label'].map({'human': 0, 'bot': 1})

X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=35000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

print("BERTモデルを使用して文章の埋め込みを生成します...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
X_train_bert = get_bert_embeddings(X_train_text, model, tokenizer)
X_test_bert = get_bert_embeddings(X_test_text, model, tokenizer)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_numeric_scaled, X_train_bert])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_numeric_scaled, X_test_bert])

variance_threshold = VarianceThreshold(threshold=0.0005)
X_train_var_selected = variance_threshold.fit_transform(X_train_combined)
X_test_var_selected = variance_threshold.transform(X_test_combined)

selector = SelectKBest(f_classif, k="all")
X_train_selected = selector.fit_transform(X_train_var_selected, y_train)
X_test_selected = selector.transform(X_test_var_selected)

smote = SMOTE(sampling_strategy=0.8, random_state=42)
enn = EditedNearestNeighbours(sampling_strategy='auto')
steps = [('over', smote), ('under', enn)]
pipeline = ImbPipeline(steps=steps)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_selected, y_train)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=2500, max_depth=60, min_samples_split=3, min_samples_leaf=1, class_weight='balanced', random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=2500, learning_rate=0.01, max_depth=12, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=2, random_state=42, callbacks=[XGBoostProgressCallback(display_interval=250, total_iterations=2500)], n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=800, learning_rate=0.03, max_depth=10, min_samples_split=3, min_samples_leaf=1, random_state=42),
    'SVM': SVC(kernel='rbf', C=150, gamma='scale', probability=True, class_weight='balanced', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(500, 250, 125), max_iter=2500, alpha=0.00005, learning_rate='adaptive', random_state=42)
}

print("モデルの学習を開始します...")
for name, model in tqdm(models.items()):
    print(f"{name}の学習を開始します...")
    start_time = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    print(f"{name}の学習が完了しました。学習時間: {end_time - start_time:.2f}秒")

print("アンサンブル予測を行います...")
y_pred_rf = models['RandomForest'].predict_proba(X_test_selected)[:, 1]
y_pred_xgb = models['XGBoost'].predict_proba(X_test_selected)[:, 1]
y_pred_gb = models['GradientBoosting'].predict_proba(X_test_selected)[:, 1]
y_pred_svm = models['SVM'].predict_proba(X_test_selected)[:, 1]
y_pred_mlp = models['MLP'].predict_proba(X_test_selected)[:, 1]

y_pred_ensemble = 0.3 * y_pred_rf + 0.3 * y_pred_xgb + 0.2 * y_pred_gb + 0.1 * y_pred_svm + 0.1 * y_pred_mlp

def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 1.0, 0.01)
    f1_scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
    return thresholds[np.argmax(f1_scores)]

optimal_threshold = find_optimal_threshold(y_test, y_pred_ensemble)
y_pred_final = (y_pred_ensemble > optimal_threshold).astype(int)

print(f"最適な閾値: {optimal_threshold}")
print("最終的な予測結果:")
print(classification_report(y_test, y_pred_final, target_names=['human', 'bot']))


データの読み込みと前処理を開始します...
前処理が完了しました。
BERTモデルを使用して文章の埋め込みを生成します...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [07:31<00:00, 17.72it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:43<00:00, 19.31it/s]


モデルの学習を開始します...


  0%|                                                                                                         | 0/5 [00:00<?, ?it/s]

RandomForestの学習を開始します...


 20%|███████████████████▏                                                                            | 1/5 [01:52<07:28, 112.11s/it]

RandomForestの学習が完了しました。学習時間: 112.10秒
XGBoostの学習を開始します...
XGBoost progress: 250/2500 iterations
XGBoost progress: 500/2500 iterations
XGBoost progress: 750/2500 iterations
XGBoost progress: 1000/2500 iterations
XGBoost progress: 1250/2500 iterations
XGBoost progress: 1500/2500 iterations
XGBoost progress: 1750/2500 iterations
XGBoost progress: 2000/2500 iterations
XGBoost progress: 2250/2500 iterations


 40%|██████████████████████████████████████▍                                                         | 2/5 [05:12<08:11, 163.77s/it]

XGBoost progress: 2500/2500 iterations
XGBoostの学習が完了しました。学習時間: 199.93秒
GradientBoostingの学習を開始します...


 60%|██████████████████████████████████████████████████████▌                                    | 3/5 [1:44:58<1:34:05, 2822.54s/it]

GradientBoostingの学習が完了しました。学習時間: 5986.47秒
SVMの学習を開始します...


 80%|██████████████████████████████████████████████████████████████████████████▍                  | 4/5 [1:45:46<28:46, 1726.99s/it]

SVMの学習が完了しました。学習時間: 47.52秒
MLPの学習を開始します...


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [1:46:15<00:00, 1275.03s/it]

MLPの学習が完了しました。学習時間: 29.09秒
アンサンブル予測を行います...





最適な閾値: 0.8899999999999996
最終的な予測結果:
              precision    recall  f1-score   support

       human       0.93      0.82      0.87      1796
         bot       0.22      0.44      0.29       204

    accuracy                           0.78      2000
   macro avg       0.57      0.63      0.58      2000
weighted avg       0.86      0.78      0.81      2000



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline as ImbPipeline
import re
from textblob import TextBlob
from tqdm import tqdm
import time
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb

def extract_features(df):
    df['text_length'] = df['text'].str.len()
    df['has_url'] = df['text'].str.contains('http').astype(int)
    df['hour'] = pd.to_datetime(df['created_at']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
    df['is_reply'] = df['in_reply_to_user_id'].notna().astype(int)
    df['hashtag_count'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
    df['mention_count'] = df['text'].apply(lambda x: len(re.findall(r'@\w+', x)))
    df['retweet_like_ratio'] = df['retweet_count'] / (df['like_count'] + 1)
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['unique_words_ratio'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['tweet_frequency'] = df.groupby('author_id')['created_at'].transform('count')
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in '.,;:!?'))
    df['emoji_count'] = df['text'].apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', x)))
    df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text'].apply(lambda x: x.count('?'))
    df['uppercase_word_count'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
    df['url_count'] = df['text'].apply(lambda x: len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)))
    df['digit_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['special_char_count'] = df['text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', x)))
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_char_ratio'] = df['text'].apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)
    df['tweet_hour_bin'] = pd.cut(df['hour'], bins=4, labels=[0,1,2,3])
    df['tweet_day_bin'] = pd.cut(df['day_of_week'], bins=3, labels=[0,1,2])
    df['text_complexity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['char_count'] = df['text'].apply(len)
    df['word_density'] = df['text'].apply(lambda x: len(x.split()) / (len(x) + 1))
    df['punctuation_ratio'] = df['punctuation_count'] / df['char_count']
    df['url_to_text_ratio'] = df['url_count'] / df['text_length']
    df['hashtag_to_text_ratio'] = df['hashtag_count'] / df['text_length']
    df['mention_to_text_ratio'] = df['mention_count'] / df['text_length']
    df['tweet_frequency_per_day'] = df.groupby(['author_id', 'day_of_week'])['created_at'].transform('count')
    df['retweet_ratio'] = df['retweet_count'] / (df['retweet_count'] + df['like_count'] + 1)
    return df

def get_bert_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

class XGBoostProgressCallback(xgb.callback.TrainingCallback):
    def __init__(self, display_interval=100, total_iterations=1000):
        self.display_interval = display_interval
        self.total_iterations = total_iterations

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.display_interval == 0 or (epoch + 1) == self.total_iterations:
            print(f'XGBoost progress: {epoch + 1}/{self.total_iterations} iterations')
        return False

print("データの読み込みと前処理を開始します...")
df = pd.read_csv('twidata.csv')
df = extract_features(df)
print("前処理が完了しました。")

X_text = df['text']
numeric_features = [col for col in df.columns if col not in ['text', 'label', 'author_id', 'created_at', 'in_reply_to_user_id']]
X_numeric = df[numeric_features]
y = df['label'].map({'human': 0, 'bot': 1})

X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=40000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

print("BERTモデルを使用して文章の埋め込みを生成します...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
X_train_bert = get_bert_embeddings(X_train_text, model, tokenizer)
X_test_bert = get_bert_embeddings(X_test_text, model, tokenizer)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_numeric_scaled, X_train_bert])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_numeric_scaled, X_test_bert])

variance_threshold = VarianceThreshold(threshold=0.0001)
X_train_var_selected = variance_threshold.fit_transform(X_train_combined)
X_test_var_selected = variance_threshold.transform(X_test_combined)

selector = SelectKBest(f_classif, k="all")
X_train_selected = selector.fit_transform(X_train_var_selected, y_train)
X_test_selected = selector.transform(X_test_var_selected)

smote = SMOTE(sampling_strategy=0.7, random_state=42)
enn = EditedNearestNeighbours(sampling_strategy='auto')
steps = [('over', smote), ('under', enn)]
pipeline = ImbPipeline(steps=steps)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_selected, y_train)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=3000, max_depth=70, min_samples_split=2, min_samples_leaf=1, class_weight='balanced', random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=3000, learning_rate=0.01, max_depth=15, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=3, random_state=42, callbacks=[XGBoostProgressCallback(display_interval=300, total_iterations=3000)], n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=1000, learning_rate=0.02, max_depth=12, min_samples_split=2, min_samples_leaf=1, random_state=42),
    'SVM': SVC(kernel='rbf', C=200, gamma='scale', probability=True, class_weight='balanced', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(600, 300, 150), max_iter=3000, alpha=0.00001, learning_rate='adaptive', random_state=42)
}

print("モデルの学習を開始します...")
for name, model in tqdm(models.items()):
    print(f"{name}の学習を開始します...")
    start_time = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    print(f"{name}の学習が完了しました。学習時間: {end_time - start_time:.2f}秒")

print("アンサンブル予測を行います...")
y_pred_rf = models['RandomForest'].predict_proba(X_test_selected)[:, 1]
y_pred_xgb = models['XGBoost'].predict_proba(X_test_selected)[:, 1]
y_pred_gb = models['GradientBoosting'].predict_proba(X_test_selected)[:, 1]
y_pred_svm = models['SVM'].predict_proba(X_test_selected)[:, 1]
y_pred_mlp = models['MLP'].predict_proba(X_test_selected)[:, 1]

y_pred_ensemble = 0.3 * y_pred_rf + 0.3 * y_pred_xgb + 0.2 * y_pred_gb + 0.1 * y_pred_svm + 0.1 * y_pred_mlp

def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 1.0, 0.005)
    f1_scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
    return thresholds[np.argmax(f1_scores)]

optimal_threshold = find_optimal_threshold(y_test, y_pred_ensemble)
y_pred_final = (y_pred_ensemble > optimal_threshold).astype(int)

print(f"最適な閾値: {optimal_threshold}")
print("最終的な予測結果:")
print(classification_report(y_test, y_pred_final, target_names=['human', 'bot']))


データの読み込みと前処理を開始します...
前処理が完了しました。
BERTモデルを使用して文章の埋め込みを生成します...


100%|█████████████████████████████████████████████████████████████████████████████████████████| 24000/24000 [21:44<00:00, 18.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 6000/6000 [05:35<00:00, 17.88it/s]


モデルの学習を開始します...


  0%|                                                                                                         | 0/5 [00:00<?, ?it/s]

RandomForestの学習を開始します...


 20%|███████████████████▏                                                                            | 1/5 [09:01<36:06, 541.69s/it]

RandomForestの学習が完了しました。学習時間: 541.67秒
XGBoostの学習を開始します...
XGBoost progress: 300/3000 iterations
XGBoost progress: 600/3000 iterations
XGBoost progress: 900/3000 iterations
XGBoost progress: 1200/3000 iterations
XGBoost progress: 1500/3000 iterations
XGBoost progress: 1800/3000 iterations
XGBoost progress: 2100/3000 iterations
XGBoost progress: 2400/3000 iterations
XGBoost progress: 2700/3000 iterations
XGBoost progress: 3000/3000 iterations


 40%|██████████████████████████████████████▍                                                         | 2/5 [25:25<40:06, 802.04s/it]

XGBoostの学習が完了しました。学習時間: 984.29秒
GradientBoostingの学習を開始します...


# 以下はtwidata.csvを用いた学習

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline as ImbPipeline
import re
from textblob import TextBlob
from tqdm import tqdm
import time
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb

def extract_features(df):
    df['text_length'] = df['text'].str.len()
    df['has_url'] = df['text'].str.contains('http').astype(int)
    df['hour'] = pd.to_datetime(df['created_at']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
    df['is_reply'] = df['in_reply_to_user_id'].notna().astype(int)
    df['hashtag_count'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
    df['mention_count'] = df['text'].apply(lambda x: len(re.findall(r'@\w+', x)))
    df['retweet_like_ratio'] = df['retweet_count'] / (df['like_count'] + 1)
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['caps_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['unique_words_ratio'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    df['tweet_frequency'] = df.groupby('author_id')['created_at'].transform('count')
    df['avg_word_length'] = df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in '.,;:!?'))
    df['emoji_count'] = df['text'].apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', x)))
    df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text'].apply(lambda x: x.count('?'))
    df['uppercase_word_count'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
    df['url_count'] = df['text'].apply(lambda x: len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)))
    df['digit_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['special_char_count'] = df['text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', x)))
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_char_ratio'] = df['text'].apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)
    df['tweet_hour_bin'] = pd.cut(df['hour'], bins=4, labels=[0,1,2,3])
    df['tweet_day_bin'] = pd.cut(df['day_of_week'], bins=3, labels=[0,1,2])
    df['text_complexity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
    return df

def get_bert_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

class XGBoostProgressCallback(xgb.callback.TrainingCallback):
    def __init__(self, display_interval=100, total_iterations=1000):
        self.display_interval = display_interval
        self.total_iterations = total_iterations

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.display_interval == 0 or (epoch + 1) == self.total_iterations:
            print(f'XGBoost progress: {epoch + 1}/{self.total_iterations} iterations')
        return False

print("データの読み込みと前処理を開始します...")
df = pd.read_csv('twidata.csv')
df = extract_features(df)
print("前処理が完了しました。")

X_text = df['text']
numeric_features = [col for col in df.columns if col not in ['text', 'label', 'author_id', 'created_at', 'in_reply_to_user_id']]
X_numeric = df[numeric_features]
y = df['label'].map({'human': 0, 'bot': 1})

# データ分割比率を調整
X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(X_text, X_numeric, y, test_size=0.1, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=35000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

print("BERTモデルを使用して文章の埋め込みを生成します...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
X_train_bert = get_bert_embeddings(X_train_text, model, tokenizer)
X_test_bert = get_bert_embeddings(X_test_text, model, tokenizer)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_numeric_scaled, X_train_bert])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_numeric_scaled, X_test_bert])

variance_threshold = VarianceThreshold(threshold=0.001)
X_train_var_selected = variance_threshold.fit_transform(X_train_combined)
X_test_var_selected = variance_threshold.transform(X_test_combined)

selector = SelectKBest(f_classif, k="all")
X_train_selected = selector.fit_transform(X_train_var_selected, y_train)
X_test_selected = selector.transform(X_test_var_selected)

smote = SMOTE(sampling_strategy=0.7, random_state=42)
enn = EditedNearestNeighbours(sampling_strategy='auto')
steps = [('over', smote), ('under', enn)]
pipeline = ImbPipeline(steps=steps)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_selected, y_train)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=2000, max_depth=50, min_samples_split=4, min_samples_leaf=2, class_weight='balanced', random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=2000, learning_rate=0.02, max_depth=10, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=2, random_state=42, callbacks=[XGBoostProgressCallback(display_interval=200, total_iterations=2000)], n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=8, min_samples_split=4, min_samples_leaf=2, random_state=42),
    'SVM': SVC(kernel='rbf', C=100, gamma='scale', probability=True, class_weight='balanced', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(400, 200, 100), max_iter=2000, alpha=0.0001, learning_rate='adaptive', random_state=42)
}

print("モデルの学習を開始します...")
for name, model in tqdm(models.items()):
    print(f"{name}の学習を開始します...")
    start_time = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    print(f"{name}の学習が完了しました。学習時間: {end_time - start_time:.2f}秒")

print("アンサンブル予測を行います...")
y_pred_rf = models['RandomForest'].predict_proba(X_test_selected)[:, 1]
y_pred_xgb = models['XGBoost'].predict_proba(X_test_selected)[:, 1]
y_pred_gb = models['GradientBoosting'].predict_proba(X_test_selected)[:, 1]
y_pred_svm = models['SVM'].predict_proba(X_test_selected)[:, 1]
y_pred_mlp = models['MLP'].predict_proba(X_test_selected)[:, 1]

y_pred_ensemble = 0.3 * y_pred_rf + 0.3 * y_pred_xgb + 0.2 * y_pred_gb + 0.1 * y_pred_svm + 0.1 * y_pred_mlp

def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 1.0, 0.01)
    f1_scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
    return thresholds[np.argmax(f1_scores)]

optimal_threshold = find_optimal_threshold(y_test, y_pred_ensemble)
y_pred_final = (y_pred_ensemble > optimal_threshold).astype(int)

print(f"最適な閾値: {optimal_threshold}")
print("最終的な予測結果:")
print(classification_report(y_test, y_pred_final, target_names=['human', 'bot']))


データの読み込みと前処理を開始します...
前処理が完了しました。
BERTモデルを使用して文章の埋め込みを生成します...


100%|█████████████████████████████████████████████████████████████████████████████████████████| 27000/27000 [25:33<00:00, 17.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [02:38<00:00, 18.97it/s]


モデルの学習を開始します...


  0%|                                                                                                         | 0/5 [00:00<?, ?it/s]

RandomForestの学習を開始します...


 20%|███████████████████▏                                                                            | 1/5 [07:52<31:30, 472.63s/it]

RandomForestの学習が完了しました。学習時間: 472.61秒
XGBoostの学習を開始します...
XGBoost progress: 200/2000 iterations
XGBoost progress: 400/2000 iterations
XGBoost progress: 600/2000 iterations
XGBoost progress: 800/2000 iterations
XGBoost progress: 1000/2000 iterations
XGBoost progress: 1200/2000 iterations
XGBoost progress: 1400/2000 iterations
XGBoost progress: 1600/2000 iterations
XGBoost progress: 1800/2000 iterations


 40%|██████████████████████████████████████▍                                                         | 2/5 [12:17<17:30, 350.20s/it]

XGBoost progress: 2000/2000 iterations
XGBoostの学習が完了しました。学習時間: 264.49秒
GradientBoostingの学習を開始します...


 60%|██████████████████████████████████████████████████████▌                                    | 3/5 [3:13:43<2:52:02, 5161.19s/it]

GradientBoostingの学習が完了しました。学習時間: 10886.26秒
SVMの学習を開始します...


 80%|██████████████████████████████████████████████████████████████████████████▍                  | 4/5 [3:21:25<55:06, 3306.04s/it]

SVMの学習が完了しました。学習時間: 462.10秒
MLPの学習を開始します...


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [3:23:04<00:00, 2436.82s/it]

MLPの学習が完了しました。学習時間: 98.60秒
アンサンブル予測を行います...





最適な閾値: 0.8399999999999996
最終的な予測結果:
              precision    recall  f1-score   support

       human       0.95      0.84      0.89      2795
         bot       0.17      0.44      0.24       205

    accuracy                           0.81      3000
   macro avg       0.56      0.64      0.57      3000
weighted avg       0.90      0.81      0.85      3000

