# Sklearn Models

In [1]:
import pandas as pd
df = pd.read_parquet("../data/train_preprocessed_small.parquet")
display(df)

Unnamed: 0,label,preprocessed_text
0,1,house dem aide comey letter jason chaffetz twe...
1,0,flynn hillary clinton big woman campus breitba...
2,1,truth fire truth fire october tension intellig...
3,1,civilian kill single airstrike identify video ...
4,1,iranian woman jail fictional unpublished story...
...,...,...
20795,0,rapper t trump poster child white supremacy ra...
20796,0,n f l playoff schedule matchup odd new york ti...
20797,0,macys receive takeover approach hudsons bay ne...
20798,1,nato russia hold parallel exercise balkans nat...


In [2]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df["preprocessed_text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import joblib

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000)


# Define the models
models = {
    'Logistic_Regression': LogisticRegression(),
    'Naive_Bayes': MultinomialNB(),
    'Passive_Aggressive': PassiveAggressiveClassifier(),
    'Support Vector Machine': SVC(probability=True),

}

# Train, save, and evaluate the models
for model_name, model in models.items():
    print(f'Training {model_name}...')
    pipeline = make_pipeline(tfidf_vectorizer, model)
    pipeline.fit(X_train, y_train)
    # Save the trained model
    joblib.dump(pipeline, f'{model_name}_model.pkl')

    # Evaluate the model
    y_pred = pipeline.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(f'{model_name} accuracy: {accuracy:.4f}')


Training Logistic_Regression...
Logistic_Regression accuracy: 0.9488
Training Naive_Bayes...
Naive_Bayes accuracy: 0.8906
Training Passive_Aggressive...
Passive_Aggressive accuracy: 0.9517


In [1]:
import fasttext
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
import optuna
import joblib
from huggingface_hub import hf_hub_download
import os
import fasttext.util


fasttext.util.download_model('en', if_exists='ignore')  # English
vectorizer = fasttext.load_model('cc.en.300.bin')

# Define the models
models = {
    'Decision Tree': DecisionTreeClassifier(),
#     'Random Forest': RandomForestClassifier(),
#     'Gradient Boosting': GradientBoostingClassifier(),
#     'LightGBM': lgb.LGBMClassifier()
}

X_train_embedded = vectorizer.fit_transform(X_train)
X_valid_embedded = vectorizer.transform(X_valid)

# Define a pipeline that uses FastText vectorization + model
# for model_name, model in models.items():
#     print(f'Training {model_name}...')
    
#     model.fit(X_train_embedded, y_train)

#     # Save the trained model
#     joblib.dump(model, f'{model_name}_model.pkl')

#     # Evaluate the model
#     y_pred = model.predict(X_valid)
#     accuracy = accuracy_score(y_valid, y_pred)
#     print(f'{model_name} accuracy: {accuracy:.4f}')

# Hyperparameter Optimization with Optuna
def objective(trial):
    model_name = trial.suggest_categorical('model_name', ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'LightGBM'])
    
    # Hyperparameters for different models
    if model_name == 'Decision Tree':
        max_depth = trial.suggest_int('max_depth', 1, 20)
        classifier = DecisionTreeClassifier(max_depth=max_depth)
    elif model_name == 'Random Forest':
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == 'Gradient Boosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        classifier = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    elif model_name == 'LightGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        classifier = lgb.LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

    classifier.fit(X_train_embedded, y_train)
    
    y_pred = classifier.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    return accuracy

# Use Optuna for hyperparameter tuning
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)



# Print the best parameters found by Optuna
print(f"Best parameters: {study.best_params}")


  from .autonotebook import tqdm as notebook_tqdm


AttributeError: '_FastText' object has no attribute 'fit_transform'