# Social Network Ads – Full ML Workflow

This notebook reproduces the complete workflow:
1. Load data
2. Preprocess (impute + scale)
3. Train & tune 5 models with GridSearchCV
4. Evaluate (Accuracy, Precision, Recall, F1, ROC-AUC)
5. Plots & scenario predictions
6. Hypothesis testing & conclusions

**Tip:** If anything fails, run the Python script instead: `python run_project.py`.


In [None]:
# Social Network Ads: load the data, tune five classifiers, evaluate them on a
# held-out split, plot the data, score a few hand-written scenarios and report
# the point-biserial correlation of each feature with the target.
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

DATA_PATH = 'Social_Network_Ads.csv'
FEATURES = ['Age', 'EstimatedSalary']

# ---------------------------------------------------------------- 1. Load data
df = pd.read_csv(DATA_PATH)
X = df[FEATURES]
y = df['Purchased'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# ------------------------------------------------------------- 2. Preprocessing
preproc = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Pre-transformed copies are kept for ad-hoc inspection only. The model search
# below does NOT use them: it re-fits the preprocessing inside every CV fold.
X_train_p = preproc.fit_transform(X_train)
X_test_p = preproc.transform(X_test)

# ------------------------------------------------------------ 3. Train and tune
# name -> (estimator, hyper-parameter grid for that estimator)
models = {
    'LogisticRegression': (
        LogisticRegression(max_iter=1000, random_state=42),
        {'C': [0.01, 0.1, 1, 10]},
    ),
    'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9]}),
    'SVM': (
        SVC(probability=True, random_state=42),
        {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
    ),
    'DecisionTree': (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 3, 5, 7]},
    ),
    'RandomForest': (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [None, 5, 7]},
    ),
}

results = []
best_models = {}
for name, (clf, grid) in models.items():
    # Search over the full pipeline so that the imputer and scaler are fitted
    # on each fold's training part only (no leakage of validation statistics
    # into the CV score). GridSearchCV clones the pipeline, so the module-level
    # `preproc` fitted above is left untouched.
    search_pipe = Pipeline([('preproc', preproc), ('clf', clf)])
    search_grid = {f'clf__{param}': values for param, values in grid.items()}
    gs = GridSearchCV(search_pipe, search_grid, cv=5, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)

    # With the default refit=True this is a pipeline fitted end-to-end on the
    # whole training split using the best parameters.
    pipe = gs.best_estimator_
    best_models[name] = pipe

    # ------------------------------------------------------------ 4. Evaluate
    y_pred = pipe.predict(X_test)
    if hasattr(pipe.named_steps['clf'], 'predict_proba'):
        y_prob = pipe.predict_proba(X_test)[:, 1]
    else:
        y_prob = None

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        # Report parameters under their plain names (without the step prefix).
        'BestParams': {key.split('__', 1)[1]: val for key, val in gs.best_params_.items()},
    })

# A bare expression is only rendered when it is the last statement of a cell,
# so the table is printed explicitly.
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)
print(results_df.to_string(index=False))

# ------------------------------------------------------------------- 5. Plots
# Basic scatter
plt.figure(figsize=(6, 4))
mask = df['Purchased'] == 1
plt.scatter(df.loc[~mask, 'Age'], df.loc[~mask, 'EstimatedSalary'], alpha=0.7, label='Not Purchased')
plt.scatter(df.loc[mask, 'Age'], df.loc[mask, 'EstimatedSalary'], marker='x', label='Purchased')
plt.xlabel('Age')
plt.ylabel('EstimatedSalary')
plt.legend()
plt.title('Age vs EstimatedSalary')
plt.grid(True)
plt.show()

# Pick best
# NOTE(review): the winner is chosen by accuracy on the test split, so its
# reported test metrics are optimistically biased; consider selecting on
# gs.best_score_ (CV accuracy) instead.
best_name = max(results, key=lambda r: r['Accuracy'])['Model']
best_pipe = best_models[best_name]

# Scenario predictions
median_salary = X_train['EstimatedSalary'].median()


def predict_case(age, salary):
    """Score one (age, salary) scenario with the best pipeline.

    Args:
        age: Age of the hypothetical user.
        salary: Estimated salary, or None to substitute the training-set
            median salary.

    Returns:
        A tuple ``(predicted_class, purchase_probability, salary_used)``.
        ``purchase_probability`` is None when the classifier has no
        ``predict_proba``.
    """
    sal = median_salary if salary is None else salary
    # Use a DataFrame with the training column names so scikit-learn can match
    # features by name instead of warning about a nameless array.
    row = pd.DataFrame([[age, sal]], columns=FEATURES)
    pred = best_pipe.predict(row)[0]
    if hasattr(best_pipe.named_steps['clf'], 'predict_proba'):
        prob = float(best_pipe.predict_proba(row)[0][1])
    else:
        prob = None
    return int(pred), prob, sal


cases = [
    (30, 87000), (40, None), (40, 100000), (50, None),
    (18, None), (22, 600000), (35, 2500000), (60, 100000000),
]
pred_rows = []
for a, s in cases:
    p, pr, im = predict_case(a, s)
    pred_rows.append({
        'Age': a,
        'OrigSalary': s,
        'ImputedSalary': im,
        'PredictedPurchased': p,
        'PurchaseProb': pr,
    })
pred_df = pd.DataFrame(pred_rows)
print(pred_df.to_string(index=False))

# ------------------------------------------------- 6. Hypothesis correlations
# The result unpacks as (correlation, p-value); only the correlation is shown.
corr_age, _ = pointbiserialr(df['Purchased'], df['Age'])
corr_sal, _ = pointbiserialr(df['Purchased'], df['EstimatedSalary'])
print(f'Correlation Age~Purchased: {corr_age:.4f} | Salary~Purchased: {corr_sal:.4f}')