In [7]:
import pandas as pd
import numpy as np

In [8]:
url = 'https://drive.google.com/uc?id=1aMFZtWCdD468dj80-VpZhIE32sm6J0oO'
df = pd.read_csv(url)

In [13]:
df.drop(columns=['User ID'],inplace=True)
df.head()


Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:3],df.iloc[:,-1],test_size=0.2)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [16]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(drop='first',sparse_output=False),['Gender']),
    ('tnf2',MinMaxScaler(),['Age', 'EstimatedSalary'])
], remainder='passthrough')

In [17]:
X_train_transformer = transformer.fit_transform(X_train)
X_test_transformer = transformer.fit_transform(X_test)

In [18]:
procceesed_df = pd.DataFrame(X_train_transformer,columns=X_train.columns)
procceesed_df.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,0.0,0.928571,0.133333
1,0.0,0.833333,0.140741
2,1.0,0.285714,0.533333
3,1.0,0.166667,0.481481
4,0.0,0.452381,0.481481


In [19]:
procceesed_df.describe()

Unnamed: 0,Gender,Age,EstimatedSalary
count,320.0,320.0,320.0
mean,0.48125,0.471205,0.403102
std,0.500431,0.253737,0.254876
min,0.0,0.0,0.0
25%,0.0,0.285714,0.207407
50%,0.0,0.452381,0.403704
75%,1.0,0.666667,0.535185
max,1.0,1.0,1.0


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train_transformer, y_train)

# Make predictions
y_pred = model.predict(X_test_transformer)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.94


In [21]:
from sklearn.metrics import classification_report, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[50  3]
 [ 2 25]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95        53
           1       0.89      0.93      0.91        27

    accuracy                           0.94        80
   macro avg       0.93      0.93      0.93        80
weighted avg       0.94      0.94      0.94        80



In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=5)
grid_search.fit(X_train_transformer, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 10, 'n_estimators': 100}


 Save Transformer for Deployment

In [None]:
import joblib

joblib.dump(transformer, "transformer.pkl")


In [None]:
import joblib
joblib.dump(model, "purchase_prediction_model.pkl")
