# **Modelo 1**

- Regressão Logística
- Hiperparâmetros selecionados
- Utilização do Min Max Scaler e do Standard Scaler para mudanças nas escalas dos dados (o Power Transformer é importado, mas não é aplicado neste pipeline)
- Pipeline completo

---

### **1. Importações**

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer

import joblib

In [2]:
# Load the training file and the test file from ../data.
base = pd.read_csv("../data/train.csv")
teste = pd.read_csv("../data/test.csv")

# Print the (rows, columns) shape of each table, one per line.
print(base.shape, teste.shape, sep="\n")

(2000, 21)
(1000, 21)


### **2. Normalizando/padronizando as colunas**

```python
# Columns sent to StandardScaler and to MinMaxScaler, respectively.
standard_cols = ['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram']
min_max_cols = ['fc', 'int_memory', 'pc', 'sc_h', 'talk_time']

# Every other column of xtrain is left unscaled. A comprehension replaces the
# previous loop, whose first two branches were empty `pass` statements.
sem_preprocessamento = [
    col for col in xtrain.columns
    if col not in standard_cols and col not in min_max_cols
]
```

### **3. Criando o modelo de Regressão Logística e o pipeline completo**

In [3]:
# Separate the feature columns from the target column `price_range`.
x = base.drop(columns='price_range')
y = base['price_range']

# Hold out 10% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [4]:
# Show the per-class row counts of the training target, then of the hold-out target.
print(ytrain.value_counts(), ytest.value_counts(), sep="\n")

price_range
1    450
0    450
2    450
3    450
Name: count, dtype: int64
price_range
1    50
2    50
3    50
0    50
Name: count, dtype: int64


In [5]:
# Preview the first five rows of the training features.
xtrain.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1052,547,0,1.9,1,1,0,37,0.4,154,5,4,371,541,2705,17,3,10,1,1,0
1025,1081,1,2.5,1,13,0,49,1.0,99,3,17,284,519,325,9,1,6,0,0,1
719,1068,0,2.0,1,1,1,37,0.9,184,7,13,186,1998,1803,5,4,7,1,0,0
58,1757,0,0.5,0,8,0,49,0.5,180,6,14,265,713,2056,7,5,4,0,0,0
528,1671,0,0.6,1,7,1,61,0.1,129,2,11,263,848,2336,10,4,7,1,0,0


In [6]:
# Stratified 5-fold cross-validation with shuffling; the seed is fixed so the
# folds are the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# L2-penalised logistic regression solved with lbfgs.
# NOTE(review): max_iter=10000 is presumably there to avoid convergence
# warnings on the partly unscaled features -- confirm.
reg_log = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1,
    max_iter=10000,
    random_state=0,
)

# Columns sent to StandardScaler and to MinMaxScaler, respectively.
standard_cols = ['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram']
min_max_cols = ['fc', 'int_memory', 'pc', 'sc_h', 'talk_time']

# Every remaining column of xtrain is passed through without scaling.
sem_preprocessamento = [
    col for col in xtrain.columns
    if col not in standard_cols and col not in min_max_cols
]

preprocessamento = ColumnTransformer(
    transformers=[
        ('std_scaler', StandardScaler(), standard_cols),
        ('minmax_scaler', MinMaxScaler(), min_max_cols),
        ('none', 'passthrough', sem_preprocessamento),  # pass these columns through unchanged
    ]
)

# Scaling and the classifier live in one pipeline so that, inside
# cross_val_score, the scalers are fitted on each fold's training part only.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessamento),
    ('logreg', reg_log),
])

# One accuracy value per fold.
scores = cross_val_score(pipeline, xtrain, ytrain, cv=skf, scoring='accuracy')

print("Pontuações por fold:", scores)
print("Acurácia média:", round(scores.mean(), 4))


Pontuações por fold: [0.96388889 0.96111111 0.96388889 0.96944444 0.95555556]
Acurácia média: 0.9628


In [7]:
# Rebuild the pipeline and fit it on the whole training split.
# reg_log is the estimator configured in the cross-validation cell.
standard_cols = ['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram']
min_max_cols = ['fc', 'int_memory', 'pc', 'sc_h', 'talk_time']

# Recompute the passthrough list here as well: the two lists above are
# redefined in this cell, so reusing the list from an earlier cell could go
# stale if either of them is edited.
sem_preprocessamento = [
    col for col in xtrain.columns
    if col not in standard_cols and col not in min_max_cols
]

preprocessamento = ColumnTransformer(
    transformers=[
        ('std_scaler', StandardScaler(), standard_cols),
        ('minmax_scaler', MinMaxScaler(), min_max_cols),
        ('none', 'passthrough', sem_preprocessamento),  # pass these columns through unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessamento),
    ('logreg', reg_log),
])

pipeline.fit(xtrain, ytrain)

### **4. Realizando a previsão do xtest e avaliando as métricas**

In [8]:
# Predict the hold-out split with the fitted pipeline.
previsoes = pipeline.predict(xtest)

# Confusion matrix: rows are the true classes, columns the predicted ones.
metrics.confusion_matrix(y_true=ytest, y_pred=previsoes)

array([[49,  1,  0,  0],
       [ 2, 46,  2,  0],
       [ 0,  1, 49,  0],
       [ 0,  0,  1, 49]])

In [9]:
# Hold-out accuracy, plus macro-averaged precision and recall
# (the unweighted mean of the per-class values).
print("Acurácia:", metrics.accuracy_score(y_true=ytest, y_pred=previsoes))
print("Precisão:", metrics.precision_score(y_true=ytest, y_pred=previsoes, average='macro'))
print("Recall:", metrics.recall_score(y_true=ytest, y_pred=previsoes, average='macro'))

Acurácia: 0.965
Precisão: 0.965356334841629
Recall: 0.965


- Motivo de utilizar o `average='macro'`:

    - Calcula a métrica para cada classe separadamente e tira a média aritmética (não ponderada) das métricas.
    - Como funciona: Trata todas as classes igualmente, independentemente do tamanho.
    - Útil para: Avaliar modelos em cenários onde o desempenho em todas as classes é igualmente importante.

### **5. Visualizando as probabilidades previstas pelo modelo para cada classe**

In [21]:
# Class probabilities for each phone in the test file.
# "id" is an identifier, not a feature, so it is dropped explicitly (as the
# final prediction step on test.csv does) instead of relying on the
# ColumnTransformer discarding columns it was not told about.
probabilidades = pipeline.predict_proba(teste.drop("id", axis=1))

# Label each column with the class it refers to, in the order used by predict_proba.
probabilidades_df = pd.DataFrame(probabilidades, columns=pipeline.classes_)

# Three-decimal float formatting applies only inside this context.
with pd.option_context('display.float_format', '{:.3f}'.format):
    display(probabilidades_df.head(20))

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.399,0.601
1,0.0,0.0,0.056,0.944
2,0.0,0.004,0.62,0.376
3,0.0,0.0,0.0,1.0
4,0.01,0.873,0.117,0.0
5,0.0,0.0,0.052,0.948
6,0.0,0.0,0.0,1.0
7,0.02,0.953,0.027,0.0
8,0.0,0.0,0.103,0.897
9,0.992,0.008,0.0,0.0


### **6. Visualizando os parâmetros do pipeline**

In [23]:
# List every parameter of the pipeline, including those of its nested steps.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('std_scaler', StandardScaler(),
                                    ['battery_power', 'mobile_wt', 'px_height',
                                     'px_width', 'ram']),
                                   ('minmax_scaler', MinMaxScaler(),
                                    ['fc', 'int_memory', 'pc', 'sc_h',
                                     'talk_time']),
                                   ('none', 'passthrough',
                                    ['blue', 'clock_speed', 'dual_sim', 'four_g',
                                     'm_dep', 'n_cores', 'sc_w', 'three_g',
                                     'touch_screen', 'wifi'])])),
  ('logreg', LogisticRegression(C=1, max_iter=10000, random_state=0))],
 'transform_input': None,
 'verbose': False,
 'preprocessing': ColumnTransformer(transformers=[('std_scaler', StandardScaler(),
                                  ['battery_power', 'mobile_wt', 'px_heig

### **7. Aplicando na base de teste (test.csv)**

In [10]:
# Preview the first three rows of the test file.
teste.head(n=3)

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1


In [11]:
# Predict the price range for every row of the test file; the "id" column is
# removed first.
previsao_base_teste = pipeline.predict(teste.drop(columns="id"))
previsao_base_teste

array([3, 3, 2, 3, 1, 3, 3, 1, 3, 0, 3, 3, 0, 0, 2, 0, 2, 1, 3, 2, 1, 3,
       1, 1, 3, 0, 2, 0, 3, 0, 2, 0, 3, 0, 1, 1, 3, 1, 2, 1, 1, 2, 0, 0,
       0, 1, 0, 3, 1, 2, 1, 0, 3, 0, 3, 0, 3, 1, 1, 3, 3, 3, 0, 1, 1, 1,
       2, 3, 1, 2, 1, 2, 2, 3, 3, 0, 2, 0, 2, 3, 0, 3, 3, 0, 3, 0, 3, 1,
       3, 0, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 0, 0, 3, 0, 2, 0, 1, 2, 3,
       3, 3, 1, 3, 3, 3, 3, 2, 3, 0, 0, 3, 2, 1, 2, 0, 3, 2, 2, 2, 0, 2,
       2, 1, 3, 1, 1, 0, 3, 2, 1, 2, 1, 3, 2, 3, 3, 3, 2, 3, 2, 3, 1, 0,
       3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 1, 0, 3, 0, 0, 0, 2, 1, 0, 1,
       0, 0, 1, 2, 1, 0, 0, 1, 1, 2, 2, 1, 0, 0, 0, 1, 0, 3, 1, 0, 2, 2,
       3, 3, 1, 2, 3, 2, 3, 2, 2, 1, 0, 0, 1, 2, 0, 2, 3, 3, 0, 2, 0, 3,
       2, 3, 3, 1, 0, 1, 0, 3, 0, 1, 0, 2, 2, 1, 3, 1, 3, 0, 3, 1, 2, 0,
       0, 2, 1, 3, 3, 3, 1, 1, 3, 0, 0, 2, 3, 3, 1, 3, 1, 1, 3, 2, 1, 2,
       3, 3, 3, 1, 0, 0, 2, 3, 1, 1, 3, 2, 0, 3, 0, 0, 3, 1, 0, 3, 2, 3,
       3, 2, 1, 3, 3, 2, 3, 1, 2, 1, 2, 0, 2, 3, 1,

In [12]:
# Pair each prediction with the id of the row it was computed from.
# The ids come from the test file itself rather than a hard-coded
# range(1, 1001), so the table stays correct if the file has a different
# number of rows or ids that are not simply 1..N.
df_previsoes = pd.DataFrame({
    'id': teste['id'].to_numpy(),
    'price_range': previsao_base_teste,
})

df_previsoes

Unnamed: 0,id,price_range
0,1,3
1,2,3
2,3,2
3,4,3
4,5,1
...,...,...
995,996,2
996,997,1
997,998,0
998,999,2


In [13]:
# Save the predictions so the results can be compared later.
df_previsoes.to_csv(
    path_or_buf="../data/previsoes_modelo_regressao_logistica.csv",
    index=False,
)

In [14]:
# Write the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=pipeline, filename='../models/pipeline_modelo_1.joblib')

['../models/pipeline_modelo_1.joblib']