In [None]:
#!pip install --upgrade scikit-learn scikeras[tensorflow]
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
#!pip install scikeras[tensorflow]
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input,InputLayer
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.optimizers import Adam
# -------------------------------
# 1. Load and Preprocess Dataset
# -------------------------------
social_df = pd.read_csv("social_media_vs_productivity.csv")

# -------------------------------
# 1. Split Data FIRST to prevent leakage
# -------------------------------
# The raw score travels through the split next to the features so that both
# partitions keep their own scores for the later steps.
y_score = social_df['actual_productivity_score']
X = social_df.drop(columns=['actual_productivity_score'])  # features without the score

# 80/20 hold-out with a fixed seed.
X_train_raw, X_test_raw, y_score_train, y_score_test = train_test_split(
    X,
    y_score,
    random_state=42,
    test_size=0.2,
)

# Put the score back as the last column of each partition for easier
# processing. `assign` returns a new frame, so the *_raw frames are not modified.
train_df = X_train_raw.assign(actual_productivity_score=y_score_train)
test_df = X_test_raw.assign(actual_productivity_score=y_score_test)

# -------------------------------
# 2. Handle missing values (based on training data)
# -------------------------------
missing_cols = [
    'daily_social_media_time', 'perceived_productivity_score', 'actual_productivity_score',
    'stress_level', 'sleep_hours', 'screen_time_before_sleep', 'job_satisfaction_score'
]
# NOTE(review): 'actual_productivity_score' is the column the class labels are
# later derived from, so imputing it invents labels for those rows — confirm
# this is intended rather than dropping rows with a missing score.

# Choose ONE fill value per column from the raw (not yet imputed) training
# data: the mean when the column is roughly symmetric (|skew| < 0.5), the
# median otherwise. Deciding this once matters because imputation itself
# changes a column's skewness; re-deriving the rule from the already-filled
# training column could pick a different statistic for the test set than the
# one that was used for the training set.
fill_values = {}
for col in missing_cols:
    observed = train_df[col]
    roughly_symmetric = abs(observed.skew()) < 0.5
    fill_values[col] = observed.mean() if roughly_symmetric else observed.median()

# Fill train and test with the very same training-derived values.
train_df = train_df.fillna(value=fill_values)
test_df = test_df.fillna(value=fill_values)

# -------------------------------
# 3. Remove outliers from training set only
# -------------------------------

numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_train = train_df[numeric_cols]

# Per-column quartiles of the training data, computed in a single call.
quartiles = numeric_train.quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

# Tukey fences: anything more than 1.5 * IQR outside the quartiles is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is kept only when none of its numeric values falls outside the fences.
# Strict comparisons are used on purpose: a missing value compares False on
# both sides and therefore never marks a row as an outlier.
below_fence = numeric_train.lt(lower_bound)
above_fence = numeric_train.gt(upper_bound)
non_outliers_mask = ~(below_fence | above_fence).any(axis=1)
train_df_clean = train_df.loc[non_outliers_mask].reset_index(drop=True)

# -------------------------------
# 4. Categorize productivity
# -------------------------------
category_labels = ['Low', 'Medium', 'High']

# Tercile edges are learned from the cleaned training scores only; `bins`
# holds the four edges returned by qcut.
train_df_clean['productivity_category'], bins = pd.qcut(
    train_df_clean['actual_productivity_score'], q=3,
    labels=category_labels, retbins=True
)

# Apply the same inner edges to the test set, but open the two outer bins.
# The training edges stop at the min/max of the outlier-filtered training
# scores, so a test score outside that range would otherwise get no category
# and be dropped below, shrinking and biasing the evaluation set. Such scores
# belong to 'Low' / 'High' respectively. `bins` itself is left unchanged.
test_bins = np.array(bins, dtype=float)
test_bins[0], test_bins[-1] = -np.inf, np.inf
test_df['productivity_category'] = pd.cut(
    test_df['actual_productivity_score'],
    bins=test_bins,
    labels=category_labels,
    include_lowest=True
)
# With open-ended outer bins, only a row whose score is itself missing can
# still be without a category here.
test_df_clean = test_df.dropna(subset=['productivity_category']).reset_index(drop=True)

# -------------------------------
# 5. Define features and target
# -------------------------------
# Neither the raw score nor the category derived from it may be used as a feature.
label_columns = ['actual_productivity_score', 'productivity_category']

X_train, y_train = (
    train_df_clean.drop(columns=label_columns),
    train_df_clean['productivity_category'],
)
X_test, y_test = (
    test_df_clean.drop(columns=label_columns),
    test_df_clean['productivity_category'],
)


# -------------------------------
# 7. Preprocessing pipeline
# -------------------------------
# Column groups are picked by dtype from the training features.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object', 'bool']).columns)

# Numeric columns are standardized; object/bool columns are one-hot encoded,
# with categories not seen during fitting ignored at transform time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])




# -------------------------------
# 8. Fit-transform train / transform test
# -------------------------------
# Encode the category labels as integers; the mapping is learned on the
# training labels and reused for the test labels.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Fit the preprocessing on the training features only, then reuse the fitted
# transformers for the test features.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Handle on the fitted numeric scaler (available only after fitting).
scaler = preprocessor.named_transformers_['num']




# -------------------------------
# 9. Feature selection with SelectKBest
# -------------------------------

# Keep the 5 columns with the highest ANOVA F-score against the encoded
# training labels; the test matrix is reduced with the same fitted selector.
selector = SelectKBest(score_func=f_classif, k=5)
X_train_selected = selector.fit_transform(X_train_processed, y_train_enc)
X_test_selected = selector.transform(X_test_processed)

# Column names of the processed matrix: numeric columns first, then the
# one-hot columns generated by the 'cat' transformer.
num_features_names = numeric_features
one_hot_encoder = preprocessor.named_transformers_['cat']
cat_features_names = one_hot_encoder.get_feature_names_out(categorical_features).tolist()
all_feature_names = num_features_names + cat_features_names

# Selected features
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = []
for column_index in selected_feature_indices:
    selected_feature_names.append(all_feature_names[column_index])



# -------------------------------
# 6. Build and Train Model
# -------------------------------
# Network dimensions, read by create_model: one input per selected feature
# column and one output per distinct training label.
num_classes = np.unique(y_train).size
input_dim = X_train_selected.shape[-1]


def create_model(num_hidden_layers=2, units=32, learning_rate=0.001):
    """Build and compile a fully connected softmax classifier.

    The input width and the number of output units are read from the
    module-level names ``input_dim`` and ``num_classes``.

    Args:
        num_hidden_layers: Number of ReLU ``Dense`` layers between the input
            and the output layer.
        units: Number of units in each hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model that uses sparse categorical
        cross-entropy as its loss and reports accuracy.
    """
    # Layers are created in network order: input, hidden stack, output.
    layers = [Input(shape=(input_dim,))]
    layers.extend(Dense(units, activation='relu') for _ in range(num_hidden_layers))
    layers.append(Dense(num_classes, activation='softmax'))

    model = Sequential(layers)
    model.compile(
        loss=SparseCategoricalCrossentropy(),
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

# scikit-learn compatible wrapper around create_model. The epochs/batch_size
# given here are starting values; both are also part of the search grid below.
wrapper_settings = {'epochs': 30, 'batch_size': 10, 'verbose': 0}
keras_model = KerasClassifier(model=create_model, **wrapper_settings)

# Keys prefixed with "model__" name create_model arguments; the remaining
# keys name KerasClassifier arguments.
param_grid = dict(
    model__num_hidden_layers=[3, 4, 5],
    model__units=[32, 64],
    model__learning_rate=[0.01, 0.001],
    epochs=[20, 30],
    batch_size=[10, 20],
)

# Exhaustive search over the grid with 3-fold cross-validation.
grid = GridSearchCV(estimator=keras_model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train_selected, y_train)

print(f"Best score: {grid_result.best_score_} with params: {grid_result.best_params_}")