In [24]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit

In [None]:
# Load the ClubQ8 fuel refuelling transactions (semicolon-separated CSV).
fuel = pd.read_csv('master_data/Rifornimenti_Carburante_ClubQ8.csv', sep=';')

# Parse the operation date. pd.to_datetime replaces the unit-less
# astype('datetime64') cast, which pandas 2.x rejects with a TypeError.
# NOTE(review): the date format is inferred by pandas; if the file stores
# day-first dates, pass dayfirst=True or an explicit format= -- confirm against the CSV.
fuel['DATA_OPERAZIONE'] = pd.to_datetime(fuel['DATA_OPERAZIONE'])

# Order rows by customer and, within a customer, chronologically (stable sort).
# Sorting on the customer alone left the within-customer order arbitrary, which
# matters for any later running total computed in row order.
fuel = fuel.sort_values(by=['COD_PAN_DA_POS', 'DATA_OPERAZIONE'], kind='mergesort')

# Collapse the timestamp to a monthly period; later cells aggregate per month.
fuel['DATA_OPERAZIONE'] = fuel['DATA_OPERAZIONE'].dt.to_period('M')
fuel.head(10)

In [None]:
#fuel.set_index(['DATA_OPERAZIONE','COD_PAN_DA_POS'], inplace = True)
#fuel.index

In [None]:
# Running total of fuel points per customer, accumulated in the current row
# order of `fuel` within each COD_PAN_DA_POS group.
# The `axis=0` keyword was dropped: it is deprecated for groupby cumsum and 0
# was the default, so the result is unchanged.
fuel['Accumulated Points'] = fuel.groupby('COD_PAN_DA_POS')['PUNTI_CARBURANTE'].cumsum()
fuel.head(30)

In [None]:
# Fuel points summed per (customer, operation date) pair. The result is a
# Series named PUNTI_CARBURANTE indexed by (COD_PAN_DA_POS, DATA_OPERAZIONE).
mfuel = (
    fuel
    .groupby(by=['COD_PAN_DA_POS', 'DATA_OPERAZIONE'])
    ['PUNTI_CARBURANTE']
    .sum()
)
mfuel.head(30)

In [None]:
# Load the ClubQ8 rewards file (semicolon-separated, comma as decimal mark).
premi = pd.read_csv('master_data/Premi_ClubQ8.csv', sep=';', decimal=',')
premi.sort_values(by=['COD_PAN_DA_POS'], inplace=True)

# Parse the operation date and collapse it to a monthly period.
# pd.to_datetime replaces the unit-less astype('datetime64') cast, which
# pandas 2.x rejects with a TypeError.
# NOTE(review): the date format is inferred by pandas; pass dayfirst=True or an
# explicit format= if the file stores day-first dates -- confirm against the CSV.
premi['DATA_OPERAZIONE'] = pd.to_datetime(premi['DATA_OPERAZIONE']).dt.to_period('M')

# Not the last expression of its cell, so this preview is not rendered; kept
# for interactive use when the cell is run in pieces.
premi.head(10)

# Attach the per-customer fuel point totals to the reward rows.
# NOTE(review): the join key is the customer id only, so each reward row is
# paired with every fuel entry of that customer rather than with the entry of
# the same month -- confirm this is the intended relationship.
acc_points = premi.merge(mfuel, how='inner', on='COD_PAN_DA_POS')

# Keep only the three columns used below, selected by name instead of by
# position (previously iloc[:, [0, 1, 8]]). The explicit copy makes df_points
# an independent frame so the column assignments below do not write to a slice
# of acc_points (SettingWithCopyWarning).
df_points = acc_points[['COD_PAN_DA_POS', 'DATA_OPERAZIONE', 'PUNTI_CARBURANTE']].copy()

# Total fuel points per (customer, date) pair, broadcast back to every row.
# (The former `COD_PAN_DA_POS + DATA_OPERAZIONE` assignment was removed: its
# result was overwritten by this line and adding an id to a Period can raise.)
#see : https://stackoverflow.com/questions/34099684/how-to-use-groupby-transform-across-multiple-columns
df_points['Points'] = (
    df_points
    .groupby(['COD_PAN_DA_POS', 'DATA_OPERAZIONE'])['PUNTI_CARBURANTE']
    .transform('sum')
)

# `columns=` keyword: the positional axis argument of drop() is rejected by pandas 2.x.
df_points = df_points.drop(columns='PUNTI_CARBURANTE')

df_points = df_points.rename(columns={'COD_PAN_DA_POS': 'CustNumber', 'DATA_OPERAZIONE': 'Date'})

# Squared term, used as a feature by the regression cell.
df_points['Points2'] = df_points['Points'] ** 2

# One row per (customer, date), ordered by customer and then by date.
df_points = df_points.drop_duplicates(subset=['Date', 'CustNumber'], keep='first')
df_points = df_points.sort_values(by=['CustNumber', 'Date'], ascending=[True, True])

# Change in Points from one row to the next *within the same customer*; the
# first row of each customer has no predecessor and is set to 0.
# (A plain shift() over the whole column compared the first row of a customer
# with the last row of the previous customer.)
df_points['PointsSpent'] = df_points.groupby('CustNumber')['Points'].diff().fillna(0)

# Check all the rows with a duplicated customer number, i.e. every row of a
# customer after that customer's first one.
# It shows that there is an evolution of the number of points and thus good for the probit/logit
regular_cust = df_points[df_points['CustNumber'].duplicated()]
print(len(regular_cust))
regular_cust.head(100)

In [None]:
# Creating the target variable by taking the assumption that an increase of points from one month to the next one
# (from t-1 to t) means increasing fuel purchase.
# The label is 0 when PointsSpent is negative and 1 otherwise, so a zero
# change is also labelled 1.
df_points['IncreaseFuel'] = np.where(df_points['PointsSpent'] < 0, 0, 1)

# Logistic regression using scikit-learn.
# 'PointsSpent' is deliberately NOT a feature: the label above is a direct
# function of it, so including it leaks the target into the inputs and makes
# every downstream metric meaningless.
features_cols = ['Points', 'Points2']
X = df_points[features_cols]

y = df_points['IncreaseFuel']

print(X.shape, y.shape)

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
logit = LogisticRegression()
logit.fit(X_train, y_train)
y_pred = logit.predict(X_test)

# Checking the validity of the model: rows are actual classes, columns are
# predicted classes.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
# Render the logit confusion matrix as an annotated heatmap, save it to disk,
# then print accuracy / precision / recall for the held-out predictions.
class_names = [0, 1]  # the two target classes
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)

# Heatmap drawn on the axes created above; cell values are shown with the
# general ('g') number format.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Logit - Confusion matrix', fontsize=20)
ax.set_ylabel('Actual IncreaseFuel')
ax.set_xlabel('Predicted IncreaseFuel')
fig.savefig('figures/cnf_matrix_logit.png')
plt.show()

# Metrics, reported as percentages.
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
print('Accuracy : {}%'.format(accuracy*100))
print('Precision: {}%'.format(precision*100))
print('Recall: {}%'.format(recall*100))

In [None]:
# Receiver Operating Characteristic (ROC) curve of the fitted logit model on
# the held-out test set, saved to figures/roc_logit.png.

# Column 1 of predict_proba is the probability of the second entry of
# logit.classes_ (presumably label 1 -- holds when both classes were seen in training).
y_pred_proba = logit.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Explicit figure/axes so the curve never lands on a figure left over from an
# earlier cell, and labelled axes so the saved image stands on its own.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Data set, auc="+str(auc))
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve of logistic regression', fontsize=20)
ax.legend()
fig.savefig('figures/roc_logit.png')
plt.show()

#Source : https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

In [None]:
# Scatter of IncreaseFuel against PointsSpent with a fitted logistic curve
# (no confidence band), saved to figures/regplot_logit.png.
ax = sns.regplot(
    data=df_points,
    x='PointsSpent',
    y='IncreaseFuel',
    logistic=True,
    ci=None,
)
ax.set_title('Logistic Regression Curve', fontsize=20)
ax.figure.savefig('figures/regplot_logit.png')
plt.show()