In [None]:
#importando as bibliotecas
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#lendo o arquivo de ações
#Lendo csv

df = pd.read_csv("dados.csv", delimiter=';')
df

Unnamed: 0,customer_id,customer_acquisition_channel,year,week,net_revenue,gross_revenue,boxes
0,206461,Paid Marketing,2014,W09,71,71,2
1,462640,Paid Marketing,2015,W25,28,56,1
2,666461,Referral,2015,W50,40,40,1
3,183202,Referral,2013,W42,18,37,1
4,410993,Referral,2014,W29,0,37,1
...,...,...,...,...,...,...,...
715870,741230,Referral,2015,W51,0,40,1
715871,170503,Paid Marketing,2015,W25,9,37,1
715872,169910,Referral,2015,W04,37,37,1
715873,542301,Paid Marketing,2015,W11,56,56,1


In [None]:
#verificar o tipo do arquivo
df.dtypes

customer_id                      int64
customer_acquisition_channel    object
year                             int64
week                            object
net_revenue                      int64
gross_revenue                    int64
boxes                            int64
dtype: object

In [None]:
df.tail()

Unnamed: 0,customer_id,customer_acquisition_channel,year,week,net_revenue,gross_revenue,boxes
715870,741230,Referral,2015,W51,0,40,1
715871,170503,Paid Marketing,2015,W25,9,37,1
715872,169910,Referral,2015,W04,37,37,1
715873,542301,Paid Marketing,2015,W11,56,56,1
715874,725361,Paid Marketing,2015,W50,54,54,1


In [None]:
#retirando os dados nulos
df.dropna(inplace=True)
df

Unnamed: 0,customer_id,customer_acquisition_channel,year,week,net_revenue,gross_revenue,boxes
0,206461,Paid Marketing,2014,W09,71,71,2
1,462640,Paid Marketing,2015,W25,28,56,1
2,666461,Referral,2015,W50,40,40,1
3,183202,Referral,2013,W42,18,37,1
4,410993,Referral,2014,W29,0,37,1
...,...,...,...,...,...,...,...
715870,741230,Referral,2015,W51,0,40,1
715871,170503,Paid Marketing,2015,W25,9,37,1
715872,169910,Referral,2015,W04,37,37,1
715873,542301,Paid Marketing,2015,W11,56,56,1


In [5]:
#Criando variáveis numéricas
x_num = ['year', 'gross_revenue', 'boxes']
x_num

['year', 'gross_revenue', 'boxes']

In [6]:
#Criando variáveis categóricas
x_cat = ['customer_acquisition_channel', 'week']
x_cat

['customer_acquisition_channel', 'week']

In [None]:
# Processamento das variáveis categóricas week

#Substituindo a variável week para sequencial
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x['customer_acquisition_channel'] = le.fit_transform(df['customer_acquisition_channel']) 
x.head(10)

In [None]:
#verificando quantidade de linhas
qtd_linhas = len(df)

qtd_linhas_treino= round(.70 * qtd_linhas)
qtd_linhas_teste= qtd_linhas - qtd_linhas_treino  
qtd_linhas_validacao = qtd_linhas -1

info = (
    f"linhas treino= 0:{qtd_linhas_treino}"
    f" linhas teste= {qtd_linhas_treino}:{qtd_linhas_treino + qtd_linhas_teste -1}"
    f" linhas validação= {qtd_linhas_validacao}"
)

info


'linhas treino= 0:501112 linhas teste= 501112:715874 linhas validação= 715874'

In [None]:
#separando as features e labels
features = df.drop(['customer_id', 'customer_acquisition_channel', 'week'], 1)
labels = df['net_revenue']

In [None]:
#Escolhendo as melhores features com Kbest

features_list = ('year',	'gross_revenue',	'boxes')

k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(features, labels)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print ('')
print ("Melhores features:")
print (k_best_features_final)


Melhores features:
{'boxes': inf, 'gross_revenue': 706.2553627976323}


  f = msb / msw


In [None]:
#separando as features escolhidas
features = df.loc[:,['gross_revenue']]

In [None]:
features

Unnamed: 0,gross_revenue
0,71
1,56
2,40
3,37
4,37
...,...
715870,40
715871,37
715872,37
715873,56


In [None]:
#Separa os dados de treino teste e validação
X_train = features[:qtd_linhas_treino]
X_test = features[qtd_linhas_treino:qtd_linhas_treino + qtd_linhas_teste -1]

y_train = labels[:qtd_linhas_treino]
y_test = labels[qtd_linhas_treino:qtd_linhas_treino + qtd_linhas_teste -1]

print( len(X_train), len(y_train))

print( len(X_test), len(y_test))

501112 501112
214762 214762


In [None]:
# Normalizando os dados de entrada(features)

# Gerando o novo padrão
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)  # Normalizando os dados de entrada(treinamento)
X_test_scale  = scaler.transform(X_test)       # Normalizando os dados de entrada(teste)



In [None]:
#treinamento usando regressão linear
lr = linear_model.LinearRegression()
lr.fit(X_train_scale, y_train)
pred= lr.predict(X_test_scale)
cd =r2_score(y_test, pred)

f'Coeficiente de determinação:{cd * 100:.2f}'

'Coeficiente de determinação:48.25'

In [None]:
#rede neural
rn = MLPRegressor(max_iter=2000)

rn.fit(X_train_scale, y_train)
pred= rn.predict(X_test_scale)

cd = rn.score(X_test_scale, y_test)


f'Coeficiente de determinação:{cd * 100:.2f}'

'Coeficiente de determinação:48.61'

In [None]:
valor_novo = features.tail(1)
valor_novo

Unnamed: 0,gross_revenue
715874,54


In [None]:
#executando a previsão


previsao=scaler.transform(valor_novo)


pred=lr.predict(previsao)

pred

array([48.67285091])

In [None]:
week_full=df['week']
week=week_full.tail(1)

res_full=df['net_revenue']
res=res_full.tail(1)

df=pd.DataFrame({'week':week, 'real':res, 'previsao':pred})


df.set_index('week', inplace=True)

print(df)