# Modelling

## Importing the libraries

In [362]:
import numpy as np
import pandas as pd

## Importing the dataset

In [363]:
# Load the feature-engineered dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv("data_after_feature_engineering.csv")

In [364]:
# Encode the boolean `has_gas` flag as integers (True -> 1, False -> 0).
df['has_gas'] = df['has_gas'].replace({True: 1, False: 0})

In [365]:
# Features: every column except the row identifier and the target.
X = df.drop(["id", "churn"], axis=1)
# Target: the churn label.
y = df.loc[:, "churn"]

In [366]:
# List the feature columns that contain at least one missing value.
[column for column, has_nan in X.isna().any().items() if has_nan]

[]

In [367]:
# Convert features and target to NumPy arrays.
# Both are rebuilt from `df` rather than from the previous `X` / `y`, so this
# cell can be re-run safely: the self-referential form (`X = X.iloc[...]`)
# raises AttributeError on a second execution because `X` is already an ndarray.
# The first remaining feature column is dropped by position.
# NOTE(review): presumably a leftover index column from the CSV export — confirm.
X = df.drop(columns=["id", "churn"]).iloc[:, 1:].values
y = df["churn"].values

## Encoding categorical variables

In [368]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the columns at positions 0 and 18 of `X`; every other column
# is passed through unchanged and placed after the encoded columns.
# `sparse_threshold=0` forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix, which `np.array(...)` would wrap
# in a 0-d object array instead of a 2-D feature matrix.
# NOTE(review): the encoder is fitted on all rows, before the train/test split.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 18])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## Splitting the dataset into the Training set and Test set

In [369]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Feature scaling

In [370]:
from sklearn.preprocessing import StandardScaler
# Standardise the columns from index 14 onward; columns 0-13 are left untouched.
# NOTE(review): 14 is presumably the number of one-hot columns produced by the
# encoder — confirm against the encoded matrix.
# The scaler learns its statistics from the training rows only and is then
# applied to both splits.
sc = StandardScaler().fit(X_train[:, 14:])
X_train[:, 14:] = sc.transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

## Training the Random Forest Classifier on the Training set

In [371]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees using the entropy split criterion, with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=0)

## Predicting the Test set results

In [372]:
# Predict churn labels for the held-out test rows with the fitted forest.
y_pred = classifier.predict(X_test)

## Making the confusion matrix

In [373]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows of the matrix are the true classes, columns the predicted classes.
# Read accuracy alongside the matrix: in the recorded run the positive class is
# a small minority (358 of 3646 test rows) and only 28 of those were predicted
# correctly, so a high accuracy mostly reflects the majority class.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[3283    5]
 [ 330   28]]


0.908118486012068

## Applying K-fold cross validation

In [374]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the classifier on the training split; report the
# mean and spread of the per-fold accuracies as percentages.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 90.78 %
Standard Deviation: 0.35 %


## Applying Grid Search to find the best model and parameters

In [375]:
from sklearn.model_selection import GridSearchCV
# One sub-grid per split criterion, each sweeping the same forest sizes.
parameters = [{'n_estimators': [10, 50, 100, 1000], 'criterion': [criterion]}
              for criterion in ('gini', 'entropy')]
# Exhaustive search scored by accuracy with 10-fold CV, using all available cores.
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(100 * best_accuracy))
print("Best Parameters:", best_parameters)

Best Accuracy: 90.78 %
Best Parameters: {'criterion': 'entropy', 'n_estimators': 100}
