In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend

In [5]:
def feature_selection(df, feature_names, target='origin', top_n=20, random_state=42):
    """Rank features by mutual information with the target and keep the best.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    feature_names : iterable of str
        Columns of ``df`` to score.
    target : str
        Name of the label column in ``df``.
    top_n : int
        Number of highest-scoring features to return.
    random_state : int or None
        Seed forwarded to ``mutual_info_classif`` so the ranking is
        reproducible between runs. Pass ``None`` for an unseeded estimate.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``mi_score``, sorted by descending score and
        truncated to ``top_n`` rows.
    """
    # Materialise once so any iterable (list, pandas Index, ...) is accepted.
    feature_names = list(feature_names)
    mi_scores = mutual_info_classif(
        df[feature_names], df[target], random_state=random_state
    )
    mi_df = pd.DataFrame({'feature': feature_names, 'mi_score': mi_scores})
    return mi_df.sort_values('mi_score', ascending=False).head(top_n)


def train_models(X_train, y_train):
    """Fit a linear SVM, a random forest and a logistic regression.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix, passed unchanged to each estimator's ``fit``.
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        Maps the model name (``'SVM'``, ``'RandomForest'``,
        ``'LogisticRegression'``) to the fitted estimator.
    """
    # Imported locally: the notebook's top-level tqdm import lives in a later
    # cell than this definition, so relying on it raises NameError whenever
    # this function is called before that cell has run.
    from tqdm import tqdm

    models = {
        'SVM': SVC(kernel='linear', probability=True, random_state=42),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
    }

    trained_models = {}

    # Progress bar over the models; the postfix shows which one is being fit.
    with tqdm(models.items(), desc="Treinando modelos", unit="modelo") as pbar:
        for name, model in pbar:
            pbar.set_postfix_str(f"Treinando {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model
            pbar.set_postfix_str(f"{name} treinado")

    return trained_models


def _extract_importances(model):
    """Return one importance value per feature for ``model``, or ``None``.

    Tree ensembles expose ``feature_importances_`` directly. For linear models
    the absolute coefficients are used, averaged over the rows of ``coef_`` so
    multiclass models are not reduced to their first class only. For a binary
    model (a single coefficient row) this equals ``np.abs(model.coef_[0])``.
    """
    if hasattr(model, 'feature_importances_'):
        return model.feature_importances_
    if hasattr(model, 'coef_'):
        coef = model.coef_
        # Some estimators return a sparse coef_ when fitted on sparse input.
        if hasattr(coef, 'toarray'):
            coef = coef.toarray()
        # atleast_2d also covers estimators whose coef_ is one-dimensional.
        return np.abs(np.atleast_2d(coef)).mean(axis=0)
    return None


def evaluate_models(models, X_test, y_test, feature_names=None):
    """Score every fitted model on the held-out set.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted estimator exposing ``predict``.
    X_test : array-like
        Test feature matrix.
    y_test : array-like
        True labels for ``X_test``.
    feature_names : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        Maps each model name to a dict with keys ``'report'`` (the
        ``classification_report`` as a dict), ``'confusion_matrix'`` and
        ``'feature_importances'`` (an array, or ``None`` when the model
        exposes neither ``feature_importances_`` nor ``coef_``).
    """
    results = {}

    for name, model in models.items():
        y_pred = model.predict(X_test)
        results[name] = {
            'report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'feature_importances': _extract_importances(model)
        }

    return results

def plot_results(cm, importances, feature_names, class_labels=('Human', 'ChatGPT')):
    """Plot a confusion matrix and, when available, the top feature importances.

    Parameters
    ----------
    cm : array-like
        Confusion matrix of integer counts (annotated with ``fmt='d'``).
    importances : array-like or None
        One importance value per entry of ``feature_names``. When ``None`` the
        importance panel is left blank.
    feature_names : sequence of str or None
        Feature labels matching ``importances``.
    class_labels : sequence of str
        Tick labels for both heatmap axes, in the row/column order of ``cm``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, (ax_cm, ax_imp) = plt.subplots(1, 2, figsize=(15, 5))

    labels = list(class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels,
                yticklabels=labels,
                ax=ax_cm)
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted')
    ax_cm.set_ylabel('Actual')

    if importances is not None and feature_names is not None:
        # Keep only the ten most important features so the chart stays legible.
        importance_df = pd.DataFrame({
            'feature': list(feature_names),
            'importance': importances
        }).sort_values('importance', ascending=False).head(10)

        sns.barplot(data=importance_df, y='feature', x='importance',
                    palette='viridis', ax=ax_imp)
        ax_imp.set_title('Top 10 Important Features')
    else:
        # No importances for this model: hide the empty right-hand panel.
        ax_imp.axis('off')

    fig.tight_layout()
    plt.show()

In [6]:
# Load the feature table from a CSV file (path is relative to the working directory).
df = pd.read_csv("df_pronto/df_gpt_com_features.csv")

In [7]:
# Every column except 'text' and 'origin' is treated as a candidate feature.
feature_columns = [col for col in df.columns if col not in ['text', 'origin']]
# Name of the label column.
target = 'origin' 

In [8]:
# Replace every missing value with 0. This rebinds `df`, so the frame as
# loaded is no longer reachable under this name.
df = df.fillna(0)

In [9]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

def feature_selection_chunked(df, feature_names, target='origin', top_n=20,
                            n_jobs=-1, chunk_size=100, random_state=42):
    """
    Rank features by mutual information, scoring them in parallel chunks.

    Parameters:
    - df: DataFrame holding the feature columns and the target column.
    - feature_names: iterable with the names of the columns to score.
    - target: name of the label column in ``df``.
    - top_n: number of highest-scoring features to return.
    - n_jobs: forwarded to ``joblib.Parallel`` (number of parallel workers).
    - chunk_size: number of features per chunk (tune to the available RAM).
    - random_state: seed forwarded to ``mutual_info_classif`` so the scores
      are reproducible between runs; pass ``None`` for an unseeded estimate.

    Returns:
    - DataFrame with columns ``feature`` and ``mi_score``, sorted by
      descending score and truncated to ``top_n`` rows. The index of each row
      is the position of that feature in ``feature_names``.

    Raises:
    - ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Materialise once so any iterable (list, pandas Index, ...) is accepted.
    feature_names = list(feature_names)
    X = df[feature_names].values
    y = df[target].values
    n_features = len(feature_names)

    # Column ranges of each chunk. Slicing by position avoids a per-feature
    # list.index() lookup, which is quadratic in the number of features and
    # resolves repeated names to the first occurrence only.
    bounds = [(start, min(start + chunk_size, n_features))
              for start in range(0, n_features, chunk_size)]

    # Worker: takes its data as arguments instead of closing over the full
    # matrix, so each task carries only the columns it actually scores.
    def process_chunk(X_chunk, y_values, seed):
        return mutual_info_classif(
            X_chunk,
            y_values,
            random_state=seed,
            n_jobs=1  # one core per chunk; parallelism comes from joblib
        )

    print(f"Processando {len(chunks := bounds)} chunks de {chunk_size} features cada...")

    tasks = (delayed(process_chunk)(X[:, start:stop], y, random_state)
             for start, stop in bounds)
    # The bar advances as tasks are handed to the workers. Iterating the list
    # returned by Parallel would only move it after every chunk had finished.
    chunk_scores = Parallel(n_jobs=n_jobs)(
        tqdm(tasks, total=len(bounds), desc="Processando chunks")
    )

    # Parallel returns results in submission order, so flattening them keeps
    # the scores aligned with feature_names.
    results = [score for scores in chunk_scores for score in scores]

    mi_df = pd.DataFrame({
        'feature': feature_names,
        'mi_score': results
    }).sort_values('mi_score', ascending=False)

    return mi_df.head(top_n)

In [10]:
# Example usage: score every candidate feature and keep the 20 best.
# NOTE(review): the scores are computed on the full `df`, before the
# train/test split performed further down, so the held-out rows influence
# which features are selected. Consider selecting on the training rows only.
top_features = feature_selection_chunked(
    df=df,
    feature_names=feature_columns,
    target='origin',
    top_n=20,
    n_jobs=-1,  # use all cores
    chunk_size=100  # tune to the available RAM
)
top_features

Processando 19 chunks de 100 features cada...


Processando chunks: 100%|██████████| 19/19 [04:14<00:00, 13.40s/it]  


Unnamed: 0,feature,mi_score
23,structural_has_hashtag,0.307027
27,structural_hashtag_density,0.294874
11,syntactic_punct_ratio,0.293137
4,lexical_word_length_variance,0.283829
6,syntactic_pos_tag_entropy,0.227542
212,https,0.224396
1547,modi,0.198775
116,co,0.192944
21,structural_has_url,0.191751
213,https co,0.188484


In [11]:
# Persist the selected features and their scores; the DataFrame index is not written.
top_features.to_csv("df_pronto/top_features_gpt.csv", index=False)

In [12]:
# Restrict the design matrix to the selected features.
X = df[top_features['feature'].tolist()]
y = df[target]

# 70/30 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [21]:
# Fit a logistic regression on the training split; max_iter is raised to 1000.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

In [22]:
# Predict on the held-out split and compute the evaluation artefacts.
y_pred = log_reg.predict(X_test)
# Per-class metrics as a nested dict (output_dict=True).
report = classification_report(y_test, y_pred, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

In [23]:
# classification_report returns a pre-formatted string. Printing it renders
# the table; leaving it as the cell's last expression shows the raw repr with
# literal "\n" escapes instead.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.96      0.97      0.96     30000\n           1       0.97      0.96      0.96     30000\n\n    accuracy                           0.96     60000\n   macro avg       0.96      0.96      0.96     60000\nweighted avg       0.96      0.96      0.96     60000\n'

In [24]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Model under evaluation
clf = RandomForestClassifier(random_state=42)

# Metrics collected on every fold
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

# 5-fold cross-validation.
# NOTE(review): the columns of X were selected using every row of df, so each
# validation fold has already influenced the feature set being evaluated here.
results = cross_validate(clf, X, y, cv=5, scoring=scoring)

# Mean and standard deviation of each metric across the folds
for metric in scoring:
    fold_scores = results['test_' + metric]
    print(f"{metric}: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")


accuracy: 0.9758 ± 0.0174
precision: 0.9814 ± 0.0200
recall: 0.9702 ± 0.0236
f1: 0.9756 ± 0.0176


In [None]:
# probability=True is deliberately not set here: it makes every fit run an
# extra internal cross-validation to calibrate probabilities, and the scorers
# below only call predict(), which does not use those probabilities.
model = SVC(kernel='linear', random_state=42)

# Cross-validation (reuses the `scoring` dict defined in the RandomForest cell)
results = cross_validate(model, X, y, cv=5, scoring=scoring)

# Mean and standard deviation of each metric across the folds
for metric in scoring.keys():
    print(f"{metric}: {np.mean(results['test_' + metric]):.4f} ± {np.std(results['test_' + metric]):.4f}")

In [None]:
model = LogisticRegression(max_iter=1000, random_state=42)

# Cross-validation (reuses the `scoring` dict defined in the RandomForest cell)
results = cross_validate(model, X, y, cv=5, scoring=scoring)

# Mean and standard deviation of each metric across the folds
for metric in scoring:
    fold_scores = results['test_' + metric]
    print(f"{metric}: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")