In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the user-behaviour table from the dataset location used by this notebook.
dataset_path = '/datasets/users_behavior.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [4]:
# Quick structural overview of the loaded frame: dimensions, first rows, dtypes.
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

(3214, 5)
   calls  minutes  messages   mb_used  is_ultra
0   40.0   311.90      83.0  19915.42         0
1   85.0   516.75      56.0  22696.96         0
2   77.0   467.66      86.0  21060.45         0
3  106.0   745.53      81.0   8437.39         1
4   66.0   418.74       1.0  14502.75         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB
None


In [5]:
# Label to predict is the `is_ultra` column; every other column is a predictor.
target = df["is_ultra"]
features = df.drop(columns=["is_ultra"])


In [6]:
# Two-stage split: hold out 40 % of the rows first, then cut that hold-out in
# half to obtain the validation and test partitions (60 / 20 / 20 overall).
features_train, features_temp, target_train, target_temp = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=12345,
)
features_valid, features_test, target_valid, target_test = train_test_split(
    features_temp,
    target_temp,
    test_size=0.5,
    random_state=12345,
)

# Report how many rows ended up in each partition.
for label, part in (
    ("Tamanho treino:", features_train),
    ("Tamanho validação:", features_valid),
    ("Tamanho teste:", features_test),
):
    print(label, len(part))

Tamanho treino: 1928
Tamanho validação: 643
Tamanho teste: 643


In [7]:

# Validation-set grid search over forest size (n_estimators) and tree depth.
# The grid is flattened up front; the order matches a nested loop with
# n_estimators as the outer variable and max_depth as the inner one.
param_grid = [
    (est, depth)
    for est in [50, 100, 150]
    for depth in range(5, 21, 5)
]

best_model, best_result = None, 0

for est, depth in param_grid:
    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestClassifier(
        n_estimators=est,
        max_depth=depth,
        random_state=12345,
    ).fit(features_train, target_train)

    predictions = model.predict(features_valid)
    result = accuracy_score(target_valid, predictions)
    print(f"n_estimators={est}, max_depth={depth}, Accuracy={result:.3f}")

    # Strict comparison: when scores tie, the earliest configuration is kept.
    if result > best_result:
        best_model, best_result = model, result

print("\nMelhor acurácia na validação:", best_result)

# Score the selected forest on the held-out test partition.
final_predictions = best_model.predict(features_test)
final_result = accuracy_score(target_test, final_predictions)
print("Acurácia no teste:", final_result)


n_estimators=50, max_depth=5, Accuracy=0.793
n_estimators=50, max_depth=10, Accuracy=0.793
n_estimators=50, max_depth=15, Accuracy=0.784
n_estimators=50, max_depth=20, Accuracy=0.785
n_estimators=100, max_depth=5, Accuracy=0.795
n_estimators=100, max_depth=10, Accuracy=0.795
n_estimators=100, max_depth=15, Accuracy=0.788
n_estimators=100, max_depth=20, Accuracy=0.788
n_estimators=150, max_depth=5, Accuracy=0.793
n_estimators=150, max_depth=10, Accuracy=0.795
n_estimators=150, max_depth=15, Accuracy=0.787
n_estimators=150, max_depth=20, Accuracy=0.787

Melhor acurácia na validação: 0.7947122861586314
Acurácia no teste: 0.7900466562986003


In [8]:
# Depth sweep (1..20) for a single decision tree, scored on the validation set.
best_dt_model, best_dt_result = None, 0

for depth in range(1, 21):
    # fit() returns the estimator itself, so construction and training chain.
    dt_model = DecisionTreeClassifier(
        max_depth=depth,
        random_state=12345,
    ).fit(features_train, target_train)

    predictions_dt = dt_model.predict(features_valid)
    result_dt = accuracy_score(target_valid, predictions_dt)
    print(f"[DecisionTree] max_depth={depth}, Accuracy={result_dt:.3f}")

    # Strict comparison: when scores tie, the shallowest tree is kept.
    if result_dt > best_dt_result:
        best_dt_model, best_dt_result = dt_model, result_dt

print("\nMelhor acurácia na validação (Decision Tree):", best_dt_result)

# Score the selected tree on the held-out test partition.
final_predictions_dt = best_dt_model.predict(features_test)
final_result_dt = accuracy_score(target_test, final_predictions_dt)
print("Acurácia no teste (Decision Tree):", final_result_dt)

[DecisionTree] max_depth=1, Accuracy=0.754
[DecisionTree] max_depth=2, Accuracy=0.782
[DecisionTree] max_depth=3, Accuracy=0.785
[DecisionTree] max_depth=4, Accuracy=0.779
[DecisionTree] max_depth=5, Accuracy=0.779
[DecisionTree] max_depth=6, Accuracy=0.784
[DecisionTree] max_depth=7, Accuracy=0.782
[DecisionTree] max_depth=8, Accuracy=0.779
[DecisionTree] max_depth=9, Accuracy=0.782
[DecisionTree] max_depth=10, Accuracy=0.774
[DecisionTree] max_depth=11, Accuracy=0.762
[DecisionTree] max_depth=12, Accuracy=0.762
[DecisionTree] max_depth=13, Accuracy=0.756
[DecisionTree] max_depth=14, Accuracy=0.759
[DecisionTree] max_depth=15, Accuracy=0.747
[DecisionTree] max_depth=16, Accuracy=0.734
[DecisionTree] max_depth=17, Accuracy=0.736
[DecisionTree] max_depth=18, Accuracy=0.731
[DecisionTree] max_depth=19, Accuracy=0.728
[DecisionTree] max_depth=20, Accuracy=0.722

Melhor acurácia na validação (Decision Tree): 0.7853810264385692
Acurácia no teste (Decision Tree): 0.7791601866251944


Tarefa adicional: tirar a prova real do modelo. Esses dados são mais complexos do que aqueles com os quais você está acostumado a trabalhar, então não será uma tarefa fácil. Vamos dar uma olhada mais de perto adiante.

In [11]:
# 5-fold cross-validated accuracy of the selected estimator, computed over the
# complete feature table and target column.
scores = cross_val_score(
    estimator=best_model,
    X=features,
    y=target,
    scoring='accuracy',
    cv=5,
)
print("Acurácias por fold:", scores)
print("Média cross-validation:", scores.mean())

Acurácias por fold: [0.8118196  0.78849145 0.80404355 0.78382582 0.80841121]
Média cross-validation: 0.7993183238615719


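Neste projeto, investigamos modelos de classificação para prever o plano correto de cada usuário (coluna `is_ultra`) a partir do número de chamadas, minutos, mensagens e tráfego de dados. Principais conclusões: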

- A Random Forest obteve o melhor desempenho no conjunto de teste (acurácia ≈ 0.790).

- A Decision Tree apresentou acurácia menor que a Random Forest no conjunto de teste (≈ 0.779).

- Portanto, a Random Forest é o modelo mais adequado para este caso, superando o limite de 0.75 de acurácia exigido pelo projeto.