In [None]:
# project.py
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field, ValidationError
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ── Configuration ──────────────────────────────────────────────────────────────

class Config(BaseModel):
    """Settings for the app-store modelling workflow.

    Values are validated by pydantic when the model is constructed; an
    invalid value raises ``ValidationError``.
    """

    # Input datasets (both required).
    play_csv: Path = Field(..., description="Path to Google Play CSV")
    ios_csv: Path = Field(..., description="Path to Apple Store CSV")
    # Seed shared by the train/test split, the CV folds and the estimators.
    random_state: int = 42
    # Fraction of rows held out for testing. scikit-learn only accepts a
    # float test_size strictly between 0 and 1, so reject anything else here
    # instead of failing later inside train_test_split.
    test_size: float = Field(0.3, gt=0, lt=1)
    # Worker count handed to scikit-learn's n_jobs (-1 means "use all cores"
    # in scikit-learn's convention).
    n_jobs: int = -1

# ── Logging Setup ─────────────────────────────────────────────────────────────

# Root logger: INFO and above, one "<time> <level> <message>" line per record.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# ── Data Loading & Validation ─────────────────────────────────────────────────

def load_data(path: Path) -> pd.DataFrame:
    """Read the CSV file at ``path`` and log how many rows it contained.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed table, exactly as ``pandas.read_csv`` produced it.
    """
    df = pd.read_csv(path)
    # Build the message first so the row count keeps its thousands separator.
    message = f"Loaded {len(df):,} rows from {path.name}"
    logging.info(message)
    return df

# ── Preprocessing Pipeline ────────────────────────────────────────────────────

def build_preprocessor(numerical_cols, categorical_cols):
    """Assemble the column-wise preprocessing step used by every model.

    Args:
        numerical_cols: Column names that are mean-imputed and then
            standardised.
        categorical_cols: Column names whose gaps are filled with the
            literal ``"missing"`` and which are then one-hot encoded.

    Returns:
        An unfitted ``ColumnTransformer``; columns not named in either list
        are dropped.
    """
    numeric_steps = [
        ("impute_mean", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
    categorical_steps = [
        ("impute_const", SimpleImputer(strategy="constant", fill_value="missing")),
        # Categories unseen during fit encode as all zeros rather than raising.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
    return ColumnTransformer(
        [
            ("num", Pipeline(numeric_steps), numerical_cols),
            ("cat", Pipeline(categorical_steps), categorical_cols),
        ],
        remainder="drop",
    )

# ── Modeling ──────────────────────────────────────────────────────────────────

def evaluate_models(X, y, preprocessor, config: Config):
    """Cross-validate the candidate classifiers and log their scores.

    Each candidate is wrapped in a pipeline behind ``preprocessor`` and
    scored with 5-fold stratified, shuffled cross-validation using the
    weighted F1 metric.

    Args:
        X: Feature table.
        y: Target labels.
        preprocessor: Unfitted transformer placed ahead of each classifier.
        config: Supplies ``random_state`` and ``n_jobs``.

    Returns:
        Dict mapping model name to its array of per-fold scores.
    """
    seed = config.random_state
    workers = config.n_jobs
    candidates = {
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=seed, n_jobs=workers),
        "GradientBoosting": GradientBoostingClassifier(random_state=seed),
    }
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    results = {}
    for name, estimator in candidates.items():
        candidate_pipe = Pipeline([("preproc", preprocessor), ("clf", estimator)])
        scores = cross_val_score(
            candidate_pipe,
            X,
            y,
            cv=folds,
            scoring="f1_weighted",
            n_jobs=workers,
        )
        logging.info(f"{name} F1-weighted CV: {scores.mean():.3f} ± {scores.std():.3f}")
        results[name] = scores
    return results

# ── Main Workflow ─────────────────────────────────────────────────────────────

def main():
    """Run the Play Store workflow end to end.

    Steps: build the configuration, load the Google Play CSV, keep free apps
    with (mostly) ASCII names, de-duplicate and drop incomplete rows, coerce
    the numeric columns, split into train/test, cross-validate the candidate
    models on the training part, then fit a random forest and log its
    confusion matrix and classification report on the test part.

    Returns:
        None. A configuration error is logged and ends the run early.
    """
    try:
        config = Config(
            play_csv=Path("googleplaystore.csv"),
            ios_csv=Path("AppleStore.csv"),
        )
    except ValidationError as e:
        logging.error("Configuration error:\n%s", e)
        return

    # Load and merge datasets if needed; here illustrate Play store only
    df = load_data(config.play_csv)

    # Keep free apps only.
    df = df[df["Type"] == "Free"]

    # Keep apps whose name has at most 3 non-ASCII characters. Encoding with
    # errors="ignore" drops every non-ASCII character, so the length
    # difference is exactly the number of such characters. (The previous
    # expression applied .abs() to one operand only, which made the
    # comparison always true and the filter a no-op.)
    name_len = df["App"].str.len()
    ascii_len = df["App"].str.encode("ascii", errors="ignore").str.len()
    df = df[(name_len - ascii_len) <= 3]

    # Drop duplicates and rows missing a feature or the target. The trailing
    # copy() gives an independent frame so the column assignments below do
    # not write into a slice of the filtered data.
    df = df.drop_duplicates(subset=["App"])
    df = df.dropna(subset=["Rating", "Reviews", "Category", "Genres"]).copy()

    # Feature & target
    df["Reviews"] = df["Reviews"].astype(int)
    # Installs arrive as text such as "1,000+"; strip the decoration first.
    df["Installs"] = df["Installs"].str.replace(r"[+,]", "", regex=True).astype(int)
    df["Price"] = df["Price"].astype(float)

    # A stratified split needs at least two rows per class; a genre that
    # occurs once would make train_test_split raise, so remove those rows.
    min_per_class = 2
    genre_counts = df["Genres"].value_counts()
    rare_genres = genre_counts[genre_counts < min_per_class].index
    if len(rare_genres) > 0:
        logging.warning(
            "Dropping %d genre(s) with fewer than %d apps (cannot be stratified)",
            len(rare_genres),
            min_per_class,
        )
        df = df[~df["Genres"].isin(rare_genres)]

    numerical_cols = ["Rating", "Reviews", "Installs", "Price"]
    categorical_cols = ["Category", "Content Rating"]

    X = df[numerical_cols + categorical_cols]
    y = df["Genres"]  # or other binary target

    preprocessor = build_preprocessor(numerical_cols, categorical_cols)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, stratify=y, random_state=config.random_state
    )

    # Evaluate models
    evaluate_models(X_train, y_train, preprocessor, config)

    # Final fit and report
    final_pipe = Pipeline([
        ("preproc", preprocessor),
        ("clf", RandomForestClassifier(n_estimators=200, random_state=config.random_state, n_jobs=config.n_jobs)),
    ])
    final_pipe.fit(X_train, y_train)
    y_pred = final_pipe.predict(X_test)
    logging.info("Final Confusion Matrix:\n%s", confusion_matrix(y_test, y_pred))
    logging.info("Final Classification Report:\n%s", classification_report(y_test, y_pred))

# Run the workflow only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import joblib

# After training your best model (for example, Random Forest), persist it.
# NOTE(review): `classifiers` is not defined in the visible cells — presumably
# a dict of fitted estimators keyed by display name; confirm it exists before
# running this cell.
best_model = classifiers['Random Forest']
# NOTE(review): joblib.dump writes a joblib/pickle file, not HDF5, despite the
# ".h5" suffix. The Keras cell further down also writes 'model.h5', so
# running both cells leaves only the last-written file on disk.
joblib.dump(best_model, 'model.h5')
print("Saved trained model to model.h5")


In [None]:
# NOTE(review): `model` is not defined in the visible cells — presumably a
# Keras model, judging by the message printed below; confirm before running.
# NOTE(review): this writes to 'model.h5', the same path the joblib cell above
# uses, so whichever cell runs last overwrites the other's file.
model.save('model.h5')
print("Saved Keras model to model.h5")
