In [7]:
import os
import logging
from flask import Flask, request, jsonify
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import html
import google.generativeai as genai
from pymongo import MongoClient

# Configuration du logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
app = Flask(__name__)

# Initialiser le rate limiter
limiter = Limiter(
    get_remote_address,
    app=app,
    default_limits=["200 per day", "50 per hour"]  # tu peux changer ça
)

# Configuration MongoDB
MONGO_URI = os.getenv('MONGO_URI')
if not MONGO_URI:
    logger.error("MongoDB Atlas URI not found")
    raise ValueError("MongoDB Atlas URI not configured")
try:
    client = MongoClient(MONGO_URI)
    db = client['moviesDB']
    collection = db['movies1']
    logger.info("MongoDB connected successfully")
except Exception as e:
    logger.error(f"Failed to connect to MongoDB: {e}")
    raise

# Configuration Gemini API
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    logger.error("Gemini API key not found")
    raise ValueError("Gemini API key not configured")
try:
    genai.configure(api_key=GEMINI_API_KEY)
    SENTIMENT_MODEL = "gemini-1.5-flash"
    EMBEDDING_MODEL = "embedding-001"
    logger.info("Gemini API configured successfully")
except Exception as e:
    logger.error(f"Failed to configure Gemini API: {e}")
    raise

# Génération des embeddings si nécessaire (exécuter une seule fois)
def generate_embeddings():
    films = collection.find({"plot_embeddings": {"$exists": False}})
    batch_size = 100
    batch = []
    for film in films:
        text = f"{film.get('title', '')} {' '.join(film.get('genres', []))}".strip()
        if text:
            batch.append((film["_id"], text))
        if len(batch) >= batch_size:
            process_batch(batch)
            batch = []
    if batch:
        process_batch(batch)

def process_batch(batch):
    texts = [item[1] for item in batch]
    ids = [item[0] for item in batch]
    embeddings = [genai.embed_content(model=EMBEDDING_MODEL, content=text)['embedding'] for text in texts]
    for id, embedding in zip(ids, embeddings):
        collection.update_one({"_id": id}, {"$set": {"plot_embeddings": embedding}})
    logger.info(f"Traitement de {len(batch)} films terminé.")

# Générer embeddings au démarrage (une seule fois)
generate_embeddings()

@app.route('/api/recommend', methods=['POST'])
@limiter.limit("5 per minute")
def recommend_movies():
    """
    Handle a POST request to /api/recommend, which should contain a JSON object with two keys:
    - `prompt`: a string describing the movie style or genre to recommend.
    - `budget`: a string describing the budget category ("budget", "mid-range", "premium").
    """
    logger.info("Received movie recommendation request")
    try:
        if not request.is_json:
            logger.warning("Non-JSON request received")
            return jsonify({'error': 'Request must be JSON'}), 400

        data = request.get_json()
        prompt = data.get('prompt', '').strip()
        budget = data.get('budget', 'budget').lower()

        if not prompt:
            logger.warning("Empty prompt received")
            return jsonify({'error': 'Prompt is required'}), 400
        if len(prompt) > 10000:
            logger.warning("Input prompt too long")
            return jsonify({'error': 'Input prompt is too long (max 10,000 characters)'}), 400

        sanitized_prompt = html.escape(prompt)
        # Ajuster les seuils de budget selon tes données
        budget_filters = {
            'budget': {'$lte': 30000000},  # Basé sur 30M (Toy Story)
            'mid-range': {'$gt': 30000000, '$lte': 50000000},  # Entre 30M et 50M
            'premium': {'$gt': 50000000}  # Au-dessus de 50M
        }
        price_filter = budget_filters.get(budget, {'$lte': 30000000})

        # Vérifier les documents avant la recherche
        sample_doc = collection.find_one({"budget": price_filter})
        if not sample_doc:
            logger.warning(f"No documents found matching budget filter: {price_filter}. Falling back to all documents.")
            price_filter = {}

        try:
            logger.info(f"Generating embedding for prompt: {sanitized_prompt[:50]}...")
            embedding = genai.embed_content(model=EMBEDDING_MODEL, content=sanitized_prompt)['embedding']
            logger.info(f"Embedding generated, length: {len(embedding)}")
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return jsonify({'error': 'Failed to generate embedding'}), 500

        pipeline = [
            {
                '$vectorSearch': {
                    'index': 'vector_index',
                    'path': 'plot_embeddings',
                    'queryVector': embedding,
                    'numCandidates': 100,
                    'limit': 5
                }
            },
            {'$match': {'budget': price_filter}},
            {
                '$project': {
                    'title': 1,
                    'genres': 1,
                    'budget': 1,
                    '_id': 0,
                    'score': {'$meta': 'vectorSearchScore'}
                }
            }
        ]

        logger.info(f"Executing MongoDB pipeline: {pipeline}")
        results = list(collection.aggregate(pipeline))
        logger.info(f"Found {len(results)} raw results from MongoDB")

        # Mapper budget à price (0, 50, 200) selon les nouveaux seuils
        def map_budget_to_price(budget_value):
            if budget_value <= 30000000:
                return 0
            elif 30000000 < budget_value <= 50000000:
                return 50
            else:
                return 200

        recommendations = [
            {
                'name': doc['title'],
                'price': map_budget_to_price(doc.get('budget', 0)),
                'genres': doc.get('genres', []) if isinstance(doc.get('genres'), list) else doc.get('genres', '').split('|'),
                'score': min(doc.get('score', 0) * 100, 100)
            } for doc in results
        ]

        if not results:
            logger.warning(f"No recommendations found for prompt: {sanitized_prompt[:50]}..., budget: {budget}")
            return jsonify({'recommendations': [], 'warning': 'No movies matched the prompt or budget'}), 200

        logger.info(f"Returning {len(recommendations)} recommendations for prompt: {sanitized_prompt[:50]}...")
        return jsonify({'recommendations': recommendations}), 200

    except Exception as e:
        logger.error(f"Error in movie recommendation: {e}")
        return jsonify({'error': 'Internal server error'}), 500

if __name__ == "__main__":
    app.run(debug=True)

INFO:__main__:MongoDB connected successfully
INFO:__main__:Gemini API configured successfully


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


SystemExit: 1

In [5]:
import pandas as pd

ratings_df = pd.read_csv(r"C:\Users\timot\Documents\GitHub\datacraft\datasets\ratings.csv")
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Charger le dataset
df = pd.read_csv("datasets/Telco-Customer-Churn.csv")

# Nettoyage des données numériques
df = df[df['TotalCharges'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
df['TotalCharges'] = df['TotalCharges'].astype(float)

# Sélection des colonnes utiles
selected_cols = [
    "MonthlyCharges", "tenure", "TotalCharges", "Contract",
    "InternetService", "OnlineSecurity", "TechSupport",
    "StreamingTV", "PaperlessBilling", "PaymentMethod",
    "SeniorCitizen", "Partner", "Dependents"
]
data = df[selected_cols + ["Churn"]].copy()

# Encodage binaire
binary_cols = ["OnlineSecurity", "TechSupport", "StreamingTV", "PaperlessBilling", "Partner", "Dependents"]
for col in binary_cols:
    data[col] = data[col].map({"Yes": 1, "No": 0})

# Encodage One-Hot pour variables catégorielles multi-modalités
data = pd.get_dummies(data, columns=["Contract", "InternetService", "PaymentMethod"], drop_first=True)

# Cible
data["Churn"] = data["Churn"].map({"No": 0, "Yes": 1})

# Split features/target
X = data.drop("Churn", axis=1)
y = data["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train)
model = search.best_estimator_

# Évaluation
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Sauvegarde du modèle
model_path = "models_trains/random_forest_model.pkl"
joblib.dump(model, model_path)

print(round(accuracy * 100, 2))


77.4


In [53]:
data["Churn"].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [None]:
selected_cols = [
    "MonthlyCharges", "tenure", "TotalCharges", "Contract",
    "InternetService", "OnlineSecurity", "TechSupport",
    "StreamingTV", "PaperlessBilling", "PaymentMethod",
    "SeniorCitizen", "Partner", "Dependents"
]

# Garder et encoder
data = df[selected_cols + ["Churn"]].copy()

# Encodage binaire
binary_cols = ["OnlineSecurity", "TechSupport", "StreamingTV", "PaperlessBilling", "Partner", "Dependents"]
for col in binary_cols:
    data[col] = data[col].map({"Yes": 1, "No": 0})

# Contract, InternetService, PaymentMethod → One-Hot Encoding
data = pd.get_dummies(data, columns=["Contract", "InternetService", "PaymentMethod"], drop_first=True)

# Target
data["Churn"] = data["Churn"].map({"No": 0, "Yes": 1})


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Charger les données
ratings_df = pd.read_csv('datasets/ratings.csv')
movies_df = pd.read_csv('movies.csv')

# Filtrer ratings_df pour ne garder que les movieId dans la plage de 1 à 999
ratings_df = ratings_df[ratings_df['movieId'].between(1, 999)]

# Préparer la matrice utilisateur-produit
n_users = ratings_df['userId'].max()
n_items = 999  # Max movieId after filtering

# Créer une matrice sparse
user_item_matrix = csr_matrix((ratings_df['rating'],
                              (ratings_df['userId'] - 1, ratings_df['movieId'] - 1)),
                              shape=(n_users, n_items))

# Diviser les données pour validation
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)
train_matrix = csr_matrix((train_data['rating'],
                          (train_data['userId'] - 1, train_data['movieId'] - 1)),
                          shape=(n_users, n_items))
test_matrix = csr_matrix((test_data['rating'],
                         (test_data['userId'] - 1, test_data['movieId'] - 1)),
                         shape=(n_users, n_items))

# Entraîner le modèle SVD
n_components = 20  # Reduced from 50 to lower memory usage
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(train_matrix)

# Transformer et prédire
test_transformed = svd.transform(test_matrix)
test_pred = svd.inverse_transform(test_transformed)

# Évaluer l'exactitude sans conversion complète en dense
# Use non-zero elements directly
row_indices, col_indices = test_matrix.nonzero()
test_ratings = test_matrix[row_indices, col_indices].A1  # .A1 flattens to 1D array
test_pred_masked = test_pred[row_indices, col_indices]
rmse = np.sqrt(mean_squared_error(test_ratings, test_pred_masked))
print(f"RMSE sur les données de test : {rmse}")

# Sauvegarder le modèle et la matrice
joblib.dump(svd, '../models_trains/recommendation_model.pkl')
joblib.dump(user_item_matrix, '../models_trains/user_item_matrix.pkl')
print("Modèle et matrice sauvegardés avec succès.")

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/ratings.csv'

# Price Optimization

In [6]:

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import joblib

In [7]:
df = pd.read_csv(r'C:\Users\timot\PycharmProjects\datacraft_expert\datasets\retail_price.csv')

In [8]:
df['month_year'] = pd.to_datetime(df['month_year'], format='%m-%d-%Y')
df = df.dropna()  # Handle any NaN values
df['revenue'] = df['unit_price'] * df['qty']

In [4]:
# Features for training
features = ['unit_price', 'qty', 'product_weight_g', 'product_score', 'comp_1', 'lag_price']
X = df[features]
y = df['revenue']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [11]:
# Initialize and train the Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model (optional for persistence)
joblib.dump(model, 'price_optimizer.joblib')


['price_optimizer.joblib']

In [1]:
import secrets
print(secrets.token_hex(16))  # Generates a 32-character hexadecimal string

e49dc9b13db364b24478d18a6e463800


In [90]:
import pymysql
import random
from datetime import datetime, timedelta
from faker import Faker

# Initialize Faker for realistic data
fake = Faker()

# Database connection
try:
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="22205731",  # Replace with your actual MySQL root password
        database="clinic",
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    cursor = conn.cursor()
except pymysql.Error as err:
    print(f"Connection failed: {err}")
    print("Ensure the MySQL server is running, the database 'clinic' exists, and the password is correct.")
    exit(1)


In [91]:

# Helper functions
def random_date(start_year, end_year):
    start = datetime(start_year, 1, 1)
    end = datetime(end_year, 12, 31)
    delta = end - start
    return (start + timedelta(days=random.randint(0, delta.days))).strftime('%Y-%m-%d')

def random_time():
    return f"{random.randint(8, 17):02d}:{random.choice([0, 15, 30, 45]):02d}:00"

# Data pools
roles = ['Dentist', 'Nurse', 'Employee']
specialties = ['Orthodontics', 'Endodontics', 'Periodontics', 'Prosthodontics', 'Oral Surgery']
genders = ['M', 'F', 'Other']
treatment_types = ['Medical', 'Operational']
treatment_descs = ['Root Canal', 'Cleaning', 'Filling', 'Extraction', 'Crown']
medicine_names = ['Ibuprofen', 'Amoxicillin', 'Paracetamol', 'Clindamycin']
statuses = ['Scheduled', 'Completed', 'Cancelled']

# Insert Staff (50 records)
staff_data = []
for i in range(50):
    role = random.choices(roles, weights=[0.8, 0.1, 0.1])[0]  # 80% Dentists to ensure ~40
    staff_data.append((
        fake.first_name(),
        fake.last_name(),
        role,
        fake.phone_number()[:12],  # Limit phone length
        f"user{i}@clinic.com",
        random_date(2015, 2024)
    ))
try:
    cursor.executemany("""
        INSERT INTO Staff (FirstName, LastName, Role, Phone, Email, HireDate)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, staff_data)
    conn.commit()
    print("Inserted 50 Staff records")
except pymysql.Error as err:
    print(f"Error inserting Staff: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Dentists (40 records)
cursor.execute("SELECT StaffID FROM Staff WHERE Role = 'Dentist' LIMIT 40")
dentist_ids = [row['StaffID'] for row in cursor.fetchall()]
if len(dentist_ids) < 40:
    print(f"Warning: Only {len(dentist_ids)} Dentists found, expected 40")
dentists = [(id_, random.choice(specialties)) for id_ in dentist_ids]
try:
    cursor.executemany("INSERT INTO Dentist (DentistID, Specialty) VALUES (%s, %s)", dentists)
    conn.commit()
    print(f"Inserted {len(dentists)} Dentist records")
except pymysql.Error as err:
    print(f"Error inserting Dentist: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Patients (7,500 records)
patient_data = []
for i in range(7500):
    patient_data.append((
        fake.first_name(),
        fake.last_name(),
        random_date(1960, 2010),
        random.choice(genders),
        fake.phone_number()[:12],
        f"patient{i}@email.com",
        fake.street_address()
    ))
try:
    cursor.executemany("""
        INSERT INTO Patient (FirstName, LastName, DOB, Gender, Phone, Email, Address)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """, patient_data)
    conn.commit()
    print("Inserted 7,500 Patient records")
except pymysql.Error as err:
    print(f"Error inserting Patient: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Appointments (30,000 records)
cursor.execute("SELECT PatientID FROM Patient")
patient_ids = [row['PatientID'] for row in cursor.fetchall()]
appointment_data = []
for _ in range(30000):
    appointment_data.append((
        random.choice(patient_ids),
        random.choice(dentist_ids),
        random_date(2020, 2025),
        random_time(),
        random.choice(statuses)
    ))
try:
    cursor.executemany("""
        INSERT INTO Appointment (PatientID, DentistID, AppointmentDate, AppointmentTime, Status)
        VALUES (%s, %s, %s, %s, %s)
    """, appointment_data)
    conn.commit()
    print("Inserted 30,000 Appointment records")
except pymysql.Error as err:
    print(f"Error inserting Appointment: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Treatments (25,000 records)
cursor.execute("SELECT AppointmentID FROM Appointment")
appointment_ids = [row['AppointmentID'] for row in cursor.fetchall()]
treatment_data = []
for _ in range(25000):
    treatment_data.append((
        random.choice(appointment_ids),
        random.choice(treatment_types),
        random.choice(treatment_descs),
        round(random.uniform(100.0, 1000.0), 2)
    ))
try:
    cursor.executemany("""
        INSERT INTO Treatment (AppointmentID, TreatmentType, Description, Cost)
        VALUES (%s, %s, %s, %s)
    """, treatment_data)
    conn.commit()
    print("Inserted 25,000 Treatment records")
except pymysql.Error as err:
    print(f"Error inserting Treatment: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Medicines (75 records)
medicine_data = []
for i in range(75):
    medicine_data.append((
        f"{random.choice(medicine_names)}_{i}",
        "Dental use",
        random.randint(10, 200)
    ))
try:
    cursor.executemany("INSERT INTO Medicine (Name, Description, StockQuantity) VALUES (%s, %s, %s)", medicine_data)
    conn.commit()
    print("Inserted 75 Medicine records")
except pymysql.Error as err:
    print(f"Error inserting Medicine: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Prescriptions (20,000 records)
cursor.execute("SELECT TreatmentID FROM Treatment")
treatment_ids = [row['TreatmentID'] for row in cursor.fetchall()]
cursor.execute("SELECT MedicineID FROM Medicine")
medicine_ids = [row['MedicineID'] for row in cursor.fetchall()]
prescription_data = []
for _ in range(20000):
    prescription_data.append((
        random.choice(treatment_ids),
        random.choice(medicine_ids),
        f"{random.randint(100, 500)}mg",
        f"{random.randint(3, 14)} days"
    ))
try:
    cursor.executemany("""
        INSERT INTO Prescription (TreatmentID, MedicineID, Dosage, Duration)
        VALUES (%s, %s, %s, %s)
    """, prescription_data)
    conn.commit()
    print("Inserted 20,000 Prescription records")
except pymysql.Error as err:
    print(f"Error inserting Prescription: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Insert Payments (22,000 records)
payment_data = []
for treatment_id in random.sample(treatment_ids, 22000):  # Use random.sample to select 22,000 unique treatments
    cursor.execute("""
        SELECT a.PatientID FROM Appointment a
        JOIN Treatment t ON a.AppointmentID = t.AppointmentID
        WHERE t.TreatmentID = %s LIMIT 1
    """, (treatment_id,))
    res = cursor.fetchone()
    if res:
        payment_data.append((
            treatment_id,
            res['PatientID'],
            round(random.uniform(100.0, 1000.0), 2),
            random_date(2020, 2025)
        ))
try:
    cursor.executemany("""
        INSERT INTO Payment (TreatmentID, PatientID, Amount, PaymentDate)
        VALUES (%s, %s, %s, %s)
    """, payment_data)
    conn.commit()
    print("Inserted 22,000 Payment records")
except pymysql.Error as err:
    print(f"Error inserting Payment: {err}")
    conn.rollback()
    conn.close()
    exit(1)

# Close connection
conn.close()
print("Database population completed successfully!")



Inserted 50 Staff records
Inserted 40 Dentist records
Inserted 7,500 Patient records
Inserted 30,000 Appointment records
Inserted 25,000 Treatment records
Inserted 75 Medicine records
Inserted 20,000 Prescription records
Inserted 22,000 Payment records
Database population completed successfully!


In [None]:
config.json , model.safetensors , tokenizer.json ,vocab.txt