In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the prepared churn dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="prepared_data.csv")

In [3]:
# Preview the first rows to sanity-check the loaded columns and value formats.
data.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
0,0,Yes,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,Not Churned
1,0,No,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,1889.5,Not Churned
2,0,No,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,108.15,Churn
3,0,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),1840.75,Not Churned
4,0,No,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,151.65,Churn


In [4]:
# Separate the model inputs from the target. 'TotalCharges' is excluded from
# the inputs, so every remaining column is integer-encoded below.
features = data.drop(columns=['Churn', 'TotalCharges'])
label = data['Churn']

columns = features.columns

# Keep the fitted encoder for every column: `encoders[col].classes_` maps the
# integer codes back to the original categories, and the same encoder can be
# reused to transform new rows consistently.
encoders = {}
for column in columns:
    le = LabelEncoder()
    features[column] = le.fit_transform(features[column])
    encoders[column] = le


In [5]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=47,
)

In [6]:
# Baseline: an unconstrained decision tree fitted on the training split.
# random_state is pinned because the tree permutes candidate features at
# every split, so an unseeded fit can produce a different tree (and a
# different score) on each run. 47 matches the seed used for the split.
clf = DecisionTreeClassifier(random_state=47)
clf.fit(x_train, y_train)

# Score the baseline on the held-out rows.
predictions = clf.predict(x_test)

accuracy = accuracy_score(y_test, predictions)

In [7]:
# Display the baseline tree's accuracy on the held-out test split.
accuracy

0.7256574271499645

In [8]:
# Accuracy on the rows the tree was trained on; compare with the test score
# to gauge how much the baseline overfits.
training_prediction = clf.predict(x_train)

accuracy_score(y_true=y_train, y_pred=training_prediction)

0.8791111111111111

In [9]:
# Print the distribution of encoded values for each training feature.
for _, train_series in x_train.items():
    print(train_series.value_counts())

SeniorCitizen
0    4714
1     911
Name: count, dtype: int64
Partner
0    2890
1    2735
Name: count, dtype: int64
Dependents
0    3948
1    1677
Name: count, dtype: int64
InternetService
1    2496
0    1911
2    1218
Name: count, dtype: int64
OnlineSecurity
0    2800
2    1607
1    1218
Name: count, dtype: int64
OnlineBackup
0    2473
2    1934
1    1218
Name: count, dtype: int64
DeviceProtection
0    2472
2    1935
1    1218
Name: count, dtype: int64
TechSupport
0    2788
2    1619
1    1218
Name: count, dtype: int64
StreamingTV
0    2245
2    2162
1    1218
Name: count, dtype: int64
StreamingMovies
0    2206
2    2201
1    1218
Name: count, dtype: int64
Contract
0    3095
2    1357
1    1173
Name: count, dtype: int64
PaperlessBilling
1    3340
0    2285
Name: count, dtype: int64
PaymentMethod
2    1881
3    1299
0    1238
1    1207
Name: count, dtype: int64


In [10]:
# Print the distribution of encoded values for each test feature.
for _, test_series in x_test.items():
    print(test_series.value_counts())

SeniorCitizen
0    1176
1     231
Name: count, dtype: int64
Partner
0    749
1    658
Name: count, dtype: int64
Dependents
0    985
1    422
Name: count, dtype: int64
InternetService
1    600
0    505
2    302
Name: count, dtype: int64
OnlineSecurity
0    697
2    408
1    302
Name: count, dtype: int64
OnlineBackup
0    614
2    491
1    302
Name: count, dtype: int64
DeviceProtection
0    622
2    483
1    302
Name: count, dtype: int64
TechSupport
0    684
2    421
1    302
Name: count, dtype: int64
StreamingTV
0    564
2    541
1    302
Name: count, dtype: int64
StreamingMovies
0    575
2    530
1    302
Name: count, dtype: int64
Contract
0    780
2    328
1    299
Name: count, dtype: int64
PaperlessBilling
1    828
0    579
Name: count, dtype: int64
PaymentMethod
2    484
1    314
3    305
0    304
Name: count, dtype: int64


In [11]:
# Hyper-parameter grid searched for the decision tree.
# Only values DecisionTreeClassifier accepts are listed:
#   * min_samples_split must be an int >= 2 (or a float fraction); the former
#     entry 1 made every candidate that used it fail to fit.
#   * max_features='auto' is no longer accepted by scikit-learn; for
#     classification trees it was an alias of 'sqrt', which is kept.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 8, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
}

In [12]:
# Exhaustive search over param_grid, scored by 10-fold cross-validated
# accuracy on the training split and run on all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

In [13]:
# Keep the estimator refit with the winning parameters, then report the
# winning parameters and their mean cross-validated score.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [14]:
# Accuracy of the tuned tree on the held-out test split.
y_pred = best_model.predict(x_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

In [15]:
# Accuracy of the tuned tree on its own training split, for comparison with
# the test score.
train_pred = best_model.predict(x_train)

accuracy_score(y_true=y_train, y_pred=train_pred)