In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from collections import defaultdict

# Intro

**This project has 4 steps:**
<ol>
    <li>Over view and Splitting is where I split the data into train, valid and test </li>
    <li>Models is where I build the models </li>
    <li>Check on Test is where I check the best model on the test set </li>
    <li>Summay all of my inputs </li>
</ol>

# Over view and Splitting

In [2]:
df1 = pd.read_csv('https://code.s3.yandex.net/datasets/users_behavior.csv')
#df1 = pd.read_csv('users_behavior.csv')

In [3]:
df1.sample(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
3066,44.0,250.07,37.0,0.0,1
2142,68.0,403.58,28.0,19210.63,0
1819,125.0,897.35,0.0,38910.01,1
1115,64.0,495.2,21.0,20048.69,0
636,92.0,628.47,72.0,16839.52,0
169,76.0,633.01,22.0,17689.05,0
2321,76.0,528.62,61.0,20397.68,0
1776,52.0,388.37,46.0,20741.66,0
1813,25.0,180.63,20.0,6854.94,0
1612,42.0,298.9,45.0,20665.09,0


In [4]:
df1.shape

(3214, 5)

Splitting to Train and Test

In [5]:
df, df_test = train_test_split(df1, test_size=0.20, random_state=42)

In [6]:
X_test = df_test[df_test.columns.difference(['is_ultra'])]
y_test = df_test['is_ultra']

Splitting to Train and Validation

In [7]:
df_train, df_valid = train_test_split(df, test_size=0.20)

In [8]:
X = df[df.columns.difference(['is_ultra'])]
y = df['is_ultra']

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.20, random_state=42)

# Models

Logistic

I will try a few combos of the logistic model:
1) regular model

2) l1 penalty

3) lbfgs solver

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
model = LogisticRegression()
model.fit(X_train, y_train)
predict = model.predict(X_valid)
regular_logistic_acc = model.score(X_valid, y_valid)
regular_logistic_acc

0.7495145631067961

In [15]:
model = LogisticRegression(penalty='l1')
model.fit(X_train, y_train)
predict = model.predict(X_valid)
logistic_l1 = model.score(X_valid, y_valid)
logistic_l1

0.7475728155339806

In [16]:
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)
predict = model.predict(X_valid)
logistic_diff_solver = model.score(X_valid, y_valid)
logistic_diff_solver

0.7165048543689321

Decision Tree

In [17]:
def build_dt_model(depth, x_train, y_train):
    model = DecisionTreeClassifier(max_depth=depth)
    model.fit(x_train,y_train)
    return model

In [18]:
dt_results = defaultdict(list)
for depth in range(1,5):
    model = build_dt_model(depth, X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_valid = model.score(X_valid, y_valid)
    dt_results['depth'].append(depth)
    dt_results['acc_train'].append(acc_train)
    dt_results['acc_valid'].append(acc_valid)

In [19]:
pd.DataFrame(dt_results).sort_values(by='acc_valid', ascending = False)

Unnamed: 0,depth,acc_train,acc_valid
2,3,0.794747,0.798058
3,4,0.800097,0.796117
1,2,0.778696,0.776699
0,1,0.748541,0.737864


Random Forest

In [20]:
def build_rf_model(estim, depth, x_train, y_train):
    model = RandomForestClassifier(n_estimators=estim, max_depth=depth)
    model.fit(x_train, y_train)
    return model

In [21]:
rf_results = defaultdict(list)
for depth in range(1,5):
    for estim in range(1,11):
        model = build_rf_model(estim, depth, X_train, y_train)
        acc_train = model.score(X_train,y_train)
        acc_valid = model.score(X_valid,y_valid)
        rf_results['depth'].append(depth)
        rf_results['estim'].append(estim)
        rf_results['acc_train'].append(acc_train)
        rf_results['acc_valid'].append(acc_valid)

In [22]:
pd.DataFrame(rf_results).sort_values(by='acc_valid', ascending = False).head()

Unnamed: 0,depth,estim,acc_train,acc_valid
34,4,5,0.802529,0.805825
28,3,9,0.794747,0.805825
37,4,8,0.802043,0.805825
22,3,3,0.796206,0.803883
31,4,2,0.800584,0.8


DT Regression

In [23]:
def build_dtr_model(depth, x_train, y_train):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(x_train,y_train)
    return model

In [24]:
dtr_results = defaultdict(list)
for depth in range(1,5):
    model = build_dtr_model(depth, X_train, y_train)
    acc_train = model.score(X_train,y_train)
    acc_valid = model.score(X_valid,y_valid)
    dtr_results['depth'].append(depth)
    dtr_results['acc_train'].append(acc_valid)
    dtr_results['acc_valid'].append(acc_valid)

In [25]:
pd.DataFrame(dtr_results).sort_values(by='acc_valid', ascending = False).head()

Unnamed: 0,depth,acc_train,acc_valid
2,3,0.25192,0.25192
3,4,0.233417,0.233417
1,2,0.201961,0.201961
0,1,0.086602,0.086602


RF Regression

In [26]:
def build_rfr_model(estim, depth, x_train, y_train):
    model = RandomForestRegressor(n_estimators=estim, max_depth=depth)
    model.fit(x_train, y_train)
    return model

In [27]:
rfr_results = defaultdict(list)
for depth in range(1,5):
    for estim in range(1,11):
        model = build_rfr_model(estim, depth, X_train, y_train)
        acc_train = model.score(X_train,y_train)
        acc_valid = model.score(X_valid,y_valid)
        rfr_results['depth'].append(depth)
        rfr_results['estim'].append(estim)
        rfr_results['acc_train'].append(acc_train)
        rfr_results['acc_valid'].append(acc_valid)

In [28]:
pd.DataFrame(rfr_results).sort_values(by='acc_valid', ascending = False).head()

Unnamed: 0,depth,estim,acc_train,acc_valid
31,4,2,0.297961,0.299636
35,4,6,0.323537,0.292875
36,4,7,0.332578,0.290728
37,4,8,0.323003,0.284133
39,4,10,0.334429,0.279771


# Check on Test

Since the highest ACC was from Random Forest, will use this model on the test set.

In [29]:
model = build_rf_model(6,4, X_train, y_train)

In [30]:
model.score(X_test, y_test)

0.7869362363919129

# Summary

In this project I check the models:
* [ ] Logistic with a few changes
* [ ] Decision Tree and Regression tree
* [ ] Random Forest and Regression Forest

Since we are in a classification problem the tree regression are useless, I just checked to see the difference.<br>
Highest ACC with a small margin was random forest model

While checking the model on the test set I got roughly the same ACC.

To Conclude:
Best model was Random forest with 6 trees and depth of 4.