# Random Forest Supervised Learning

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_breast_cancer

In [99]:
# Load the breast-cancer dataset directly as a (features, target) pair.
X, y = load_breast_cancer(return_X_y=True)

In [66]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [67]:
# 200 trees with no depth limit, each split considering sqrt(n_features) candidates.
# oob_score=True asks the forest to also compute an out-of-bag accuracy estimate
# during training, which is read back later via rf.oob_score_.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    max_features="sqrt",
    oob_score=True,
    random_state=42,
)
rf.fit(X_train, y_train)

In [68]:
# Predict class labels for the held-out rows.
y_pred = rf.predict(X_test)

In [69]:
# Report test-set accuracy, the forest's out-of-bag score, and per-class
# precision/recall/F1 for the held-out predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))  # label typo fixed (was "Acuracy")
print("OOB Score: ", rf.oob_score_)
print(classification_report(y_test, y_pred))

Acuracy: 0.9649122807017544
OOB Score:  0.9604395604395605
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



## Random Forest with Pipeline


In [70]:
import pandas as pd

In [71]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler, Normalizer,OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier

# Read the plan-purchase data and preview the first five rows (head() defaults to 5).
df = pd.read_csv("plan_purchase.csv")
print(df.head())

   Age  MonthlyIncome  PlanType  UsageScore Purchase
0   56          81476  Standard          90      Yes
1   46          64811  Standard          92      Yes
2   32          56208     Basic          71      Yes
3   25          40150   Premium          82      Yes
4   38          63286  Standard          34       No


In [72]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Age              0
MonthlyIncome    0
PlanType         0
UsageScore       0
Purchase         0
dtype: int64

In [73]:
# Features: every column except the target.
X = df.drop(columns="Purchase")

# Target: encode the "No"/"Yes" labels as 0/1.
label_to_int = {"No": 0, "Yes": 1}
y = df["Purchase"].map(label_to_int)

# Series.map turns any value missing from the mapping into NaN without
# complaining. Fail here with a clear message instead of letting NaN targets
# reach the model.
unmapped_labels = df.loc[y.isna(), "Purchase"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Purchase' (expected 'No'/'Yes'): {unmapped_labels.tolist()}"
    )


In [74]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
categorical_features = X.select_dtypes(include="object").columns
numeric_features = X.select_dtypes(exclude="object").columns

print("Categorical Features:", categorical_features.tolist())
print("Numerical Features:", numeric_features.tolist())

Categorical Features: ['PlanType']
Numerical Features: ['Age', 'MonthlyIncome', 'UsageScore']


In [75]:
# Numeric columns: fill missing values with the column median. No scaling step.
numerical_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
    ]
)

In [76]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform from raising
# on categories that were not present when the encoder was fitted.
categorical_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [77]:
# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features),
    ]
)

In [78]:
# End-to-end estimator: preprocessing followed by a seeded 100-tree random forest.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", forest),
    ]
)

In [79]:
# Quick structural overview of the dataset: column names, dtypes/non-null
# counts, summary statistics, and the (rows, columns) shape.
print('Columns: ', df.columns.tolist())
print('\nInfo :')
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() -- that adds a stray "None" line to the output.
df.info()
print('\nDescribe:')
display(df.describe())
df.shape

Columns:  ['Age', 'MonthlyIncome', 'PlanType', 'UsageScore', 'Purchase']

Info :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            500 non-null    int64 
 1   MonthlyIncome  500 non-null    int64 
 2   PlanType       500 non-null    object
 3   UsageScore     500 non-null    int64 
 4   Purchase       500 non-null    object
dtypes: int64(3), object(2)
memory usage: 19.7+ KB
None

Describe:


Unnamed: 0,Age,MonthlyIncome,UsageScore
count,500.0,500.0,500.0
mean,39.326,52753.62,60.082
std,12.200386,20181.171598,19.938967
min,18.0,20055.0,0.0
25%,29.0,35309.5,46.0
50%,41.0,52286.0,61.0
75%,50.0,70364.25,75.0
max,59.0,89896.0,100.0


(500, 5)

In [80]:
from sklearn.model_selection import train_test_split
# 70/30 split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Show the class proportions of each part to confirm the stratification.
for heading, labels in (
    ("Train class distribution:", y_train),
    ("\nTest class distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))

Train class distribution:
Purchase
0    0.562857
1    0.437143
Name: proportion, dtype: float64

Test class distribution:
Purchase
0    0.566667
1    0.433333
Name: proportion, dtype: float64


In [81]:
# Fit the preprocessing steps and the forest together on the training split.
pipeline.fit(X_train, y_train)

In [82]:
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('Report:\n', test_report)

Accuracy: 0.9866666666666667
Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        85
           1       1.00      0.97      0.98        65

    accuracy                           0.99       150
   macro avg       0.99      0.98      0.99       150
weighted avg       0.99      0.99      0.99       150



In [95]:
# One hypothetical customer, using the same feature columns the pipeline was fitted on.
new_customer = pd.DataFrame(
    [
        {
            "Age": 30,
            "MonthlyIncome": 55000,
            "PlanType": "Premium",
            "UsageScore": 65,
        }
    ]
)

prediction = pipeline.predict(new_customer)
probability = pipeline.predict_proba(new_customer)

# The target was encoded as No -> 0, Yes -> 1, so label 1 means a purchase.
if prediction[0] == 1:
    result = "Yes"
else:
    result = "No"

# First (only) row, second column of the probability matrix.
purchase_probability = probability[0][1]

print("Purchase Prediction Result")
print("-" * 30)
print(f"Predicted Purchase           : {result}")
print(f"Probability of Purchase           : {purchase_probability:.2%}")

Purchase Prediction Result
------------------------------
Predicted Purchase           : Yes
Probability of Purchase           : 100.00%


In [96]:
import joblib
# Persist the whole fitted pipeline (preprocessing + model) so that raw
# feature rows can be scored after reloading.
joblib.dump(pipeline, 'telecom_pipeline.pkl')

# Reload the artifact and verify the round trip: the reloaded pipeline must
# give the same prediction as the in-memory one for the sample customer.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load trusted files.
loaded = joblib.load('telecom_pipeline.pkl')
predictions = loaded.predict(new_customer)  # was misspelled `redictions`
assert (predictions == pipeline.predict(new_customer)).all(), (
    "Reloaded pipeline disagrees with the in-memory pipeline"
)

In [97]:
# Save only the fitted classifier step. This artifact does NOT include the
# "preprocessing" step, so it can only score rows that have already been
# transformed the same way.
fitted_model = pipeline.named_steps["model"]
joblib.dump(fitted_model, "model.pkl")

['model.pkl']