# RANDOM FOREST

## 1.Load the dataset (iris.csv).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default plot theme to the figures drawn in this notebook.
sns.set()

In [None]:
from sklearn.datasets import load_iris

# Load scikit-learn's bundled iris dataset; its .data, .feature_names and
# .target attributes are used in the cells below.
data = load_iris()

In [None]:
# Build the iris feature table, one column per feature name.
df1 = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)

In [None]:
# Append the dataset's target array as a 'target' column next to the features.
df1['target'] = data.target

In [None]:
# Preview the first rows of the iris table.
df1.head()

## 2.Load the dataset (Churnprediction.csv).

In [None]:
# Read the telecom churn CSV straight from its GitHub raw URL.
churn_csv_url = 'https://raw.githubusercontent.com/Pradnya1208/Telecom-Customer-Churn-prediction/main/data.csv'
df = pd.read_csv(churn_csv_url, encoding='UTF-8')

In [None]:
# Preview the first rows of the raw churn data.
df.head()

## 3.Drop columns that are not required for classification of Churn Risk.

In [None]:
# customerID is only a row identifier, so remove it from the frame in place.
df.drop(columns='customerID', inplace=True)

In [None]:
# Confirm that the customerID column is gone.
df.head()

## 4. Perform data preprocessing if required.

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Count null/missing values in each column.
df.isnull().sum()

In [None]:
# Count fully duplicated rows (the author observed 22 when this was run).
df.duplicated().sum()

In [None]:
# Remove the duplicated rows in place; by default the first occurrence of each
# duplicated row is kept.

df.drop_duplicates(inplace=True)

In [None]:
# Re-check after the drop: this should now report 0 duplicated rows.
df.duplicated().sum()

In [None]:
# Convert TotalCharges from object to a numeric dtype. With errors='coerce',
# entries that cannot be parsed as numbers become NaN instead of raising.

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

After the conversion, `TotalCharges` contains missing values: entries that could not be parsed as numbers were coerced to NaN.

In [None]:
# Count the missing TotalCharges values present after the conversion.
df['TotalCharges'].isna().sum()

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Separate the label column (Churn) from the feature columns.
y = df['Churn']
df = df.drop(columns=['Churn'])

In [None]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
is_object_col = df.dtypes == 'O'
num_cols = df.columns[~is_object_col]
cat_cols = df.columns[is_object_col]

num_cols, cat_cols

In [None]:
# Draw one distribution plot, with a KDE overlay, for each numerical feature.
for numeric_col in num_cols:
    sns.displot(df[numeric_col], kde=True)

Most of the features are right-skewed.

In [None]:
# Print the frequency of every level of each categorical feature.
for categorical_col in cat_cols:
    level_counts = df[categorical_col].value_counts()
    print(level_counts)

In [None]:
# Correlate only the numeric columns. The frame still contains object-typed
# categorical columns, and DataFrame.corr() raises a ValueError on those in
# pandas >= 2.0 (older versions silently dropped them, so the plotted matrix
# is unchanged there).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Preprocessing pipelines for the ML models.

# Numerical pipeline: standardise each numerical feature.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical pipeline: ordinal-encode the categories, then standardise the
# resulting codes.
cat_pipeline = Pipeline(
    steps=[
        ('ordinal', OrdinalEncoder()),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

## 5.Split dataset into test and train (20:80).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

## 6.Build any three classification models for identifying Churn Risk.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# The three base classifiers that are later combined in the voting ensemble.
# Logistic regression with a fixed seed.
clf1 = LogisticRegression(random_state=42)
# Support vector classifier with an RBF kernel.
clf2 = SVC(kernel='rbf', random_state=42)
# k-nearest neighbours using the 5 closest training points.
clf3 = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Preview the feature frame (the Churn label has already been split off into y).
df.head()

## 7.Build Voting ensemble classifier on the training dataset.

In [None]:
# Combine the three base classifiers with hard (majority-label) voting.
base_estimators = [
    ('lr', clf1),
    ('svc', clf2),
    ('knn', clf3),
]
eclf1 = VotingClassifier(estimators=base_estimators, voting='hard')

## 8.Build Bagging ensemble classifier on the training dataset.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bagging ensemble: a random forest of 30 trees. The seed is fixed so the
# forest (and the metrics reported for it) is reproducible across runs,
# consistent with the random_state=42 used for the split and the other models.
rfc = RandomForestClassifier(n_estimators=30, random_state=42)

## 9.Build Boosting ensemble classifier on the training dataset.

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier

# Boosting ensemble: an XGBoost classifier with the library's default settings.
xgb = XGBClassifier()

## 10. Fit the models designed in steps 6–9 on the training dataset.

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to both splits so no test-set statistics leak into training.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)

test_matrix = preprocessor.transform(X_test)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

### Voting

In [None]:
# Train the voting ensemble on the preprocessed training split.
eclf1 = eclf1.fit(X_train, y_train)

### Bagging

In [None]:
# Train the random forest on the preprocessed training split.
rfc.fit(X_train, y_train)

### Boosting

In [None]:
# Train the XGBoost classifier on the preprocessed training split.
xgb = xgb.fit(X_train, y_train)

## 11. Evaluate the models designed in steps 6–9 with appropriate classification metrics.

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

### Voting

In [None]:
# Confusion matrix of the voting ensemble on the test split, drawn as an
# annotated heatmap.
voting_predictions = eclf1.predict(X_test)
voting_confusion = confusion_matrix(y_test, voting_predictions)
sns.heatmap(voting_confusion, annot=True, fmt='g')

In [None]:
# Accuracy of the voting ensemble on the test split, as a percentage.

accuracy_score(y_test, eclf1.predict(X_test)) * 100

### Bagging 

In [None]:
# Confusion matrix of the random forest on the test split, drawn as an
# annotated heatmap.
forest_predictions = rfc.predict(X_test)
forest_confusion = confusion_matrix(y_test, forest_predictions)
sns.heatmap(forest_confusion, fmt='g', annot=True)

In [None]:
# Accuracy of the random forest on the test split, as a percentage.
accuracy_score(y_test, rfc.predict(X_test)) * 100

### Boosting

In [None]:
# Confusion matrix of the XGBoost classifier on the test split, drawn as an
# annotated heatmap.
boosting_predictions = xgb.predict(X_test)
boosting_confusion = confusion_matrix(y_test, boosting_predictions)
sns.heatmap(boosting_confusion, fmt='g', annot=True)

In [None]:
# Accuracy of the XGBoost classifier on the test split, as a percentage.
accuracy_score(y_test, xgb.predict(X_test)) * 100