In [1]:
# train_model.py

import pandas as pd
import re
import os
import joblib
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Constants
DATA_PATH = "train.csv"
MODEL_PATH = "best_model.joblib"
VECTORIZER_PATH = "vectorizer.joblib"
PARAMS_PATH = "best_params.txt"

# Clean text function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

# Load and preprocess data
def load_data():
    df = pd.read_csv(DATA_PATH)
    df.dropna(inplace=True)
    df["sms"] = df["sms"].apply(clean_text)
    return df["sms"], df["label"]

# Training with GridSearchCV and MLflow logging
def train_model():
    X_texts, y = load_data()
    
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(X_texts)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        "n_estimators": [50, 100, 150],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    }

    with mlflow.start_run():
        grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        # Log to MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(best_model, "model")

        # Save artifacts
        joblib.dump(best_model, MODEL_PATH)
        joblib.dump(vectorizer, VECTORIZER_PATH)

        with open(PARAMS_PATH, "w") as f:
            f.write("RandomForestClassifier with GridSearchCV\n")
            for k, v in grid_search.best_params_.items():
                f.write(f"{k}={v}\n")
            f.write(f"accuracy={acc}\n")

        print("Training complete.")
        print("Best Parameters:", grid_search.best_params_)
        print(f"Accuracy: {acc:.4f}")

if __name__ == "__main__":
    train_model()




Training complete.
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Accuracy: 0.9740


In [2]:
# train_model_pickle.py

import pandas as pd
import re
import os
import pickle
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# File paths
DATA_PATH = "train.csv"
MODEL_PATH = "best_model.pkl"
VECTORIZER_PATH = "vectorizer.pkl"
PARAMS_PATH = "best_params.txt"

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\\s]", "", text)
    return text

# Load and preprocess data
def load_data():
    df = pd.read_csv(DATA_PATH)
    df.dropna(inplace=True)
    df["sms"] = df["sms"].apply(clean_text)
    return df["sms"], df["label"]

# Train model with tuning and log using MLflow
def train_model():
    X_texts, y = load_data()
    
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(X_texts)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        "n_estimators": [50, 100, 150],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    }

    with mlflow.start_run():
        grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        # Log to MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(best_model, "model")

        # Save using pickle
        with open(MODEL_PATH, "wb") as f:
            pickle.dump(best_model, f)
        with open(VECTORIZER_PATH, "wb") as f:
            pickle.dump(vectorizer, f)

        # Save parameters
        with open(PARAMS_PATH, "w") as f:
            f.write("RandomForestClassifier with GridSearchCV\n")
            for k, v in grid_search.best_params_.items():
                f.write(f"{k}={v}\n")
            f.write(f"accuracy={acc}\n")

        print("✅ Model training complete")
        print("📌 Best Parameters:", grid_search.best_params_)
        print(f"🎯 Accuracy: {acc:.4f}")

if __name__ == "__main__":
    train_model()




✅ Model training complete
📌 Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
🎯 Accuracy: 0.9031
