In [1]:
from io import StringIO
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

## Train

In [3]:
# Load the training CSV from the working directory into X.
train_csv_path = "X_train_final_duration.csv"
X = pd.read_csv(train_csv_path)

In [4]:
# Remove the three "<name>.1" columns from X.
X.drop(columns=['meals_saved.1', 'total_supply.1', 'item_view.1'], inplace=True)

# Names of the lag features for lags 1..15; within each lag the order is
# meals_saved, total_supply, item_view.
columns = [
    f'{feature}_lag_{lag}'
    for lag in range(1, 16)
    for feature in ('meals_saved', 'total_supply', 'item_view')
]

In [5]:
# Remove every column named in `columns` from X, in place.
X.drop(columns=columns, inplace=True)

In [6]:
# Draw a fixed-seed random subset of 300,000 rows of X to work with.
X_train = X.sample(random_state=1, n=300_000)

In [7]:
# Load the stringency spreadsheet from the working directory.
stringency_path = "stringency.xlsx"
stringency = pd.read_excel(stringency_path)

In [8]:
# For every row, count how many consecutive rows with a non-zero target end at
# that row (0 on rows whose target is zero).
# NOTE(review): the streak follows the current row order of X_train, which is
# the order produced by X.sample(...) — confirm this ordering is intended.
a = X_train['target'] != 0
running_total = a.cumsum()
# Running total as it stood at the most recent zero-target row (0 before any).
total_at_last_zero = running_total.where(~a).ffill().fillna(0).astype(int)
X_train['churn_duration'] = running_total - total_at_last_zero

In [9]:
# Keep rows whose streak is at most 15, then discard the helper column and any
# row that still has a missing value.
within_limit = X_train['churn_duration'] <= 15
X_train = (
    X_train.loc[within_limit]
    .drop(columns=['churn_duration'])
    .dropna()
)

# Parse the strings in stringency['stringency'] into a 'date' key column and
# drop the raw column.
stringency = (
    stringency
    .assign(date=pd.to_datetime(stringency['stringency'], format='%d%b%Y'))
    .drop(columns=['stringency'])
)

# Attach the stringency columns by date; rows without a match get 0.
X_train = (
    X_train
    .assign(date=pd.to_datetime(X_train['date']))
    .merge(stringency, how='left', on='date')
    .fillna(0)
)

# Separate the label from the features and hold out 25% of the rows.
y = X_train['target']
X_train = X_train.drop(columns=['target', 'date', 'store_id'])
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

In [10]:
# Fit the standardiser on the training split only and scale that split.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_train)

In [11]:
# Random forest with per-bootstrap class re-weighting, trained on the scaled
# training split.
rdf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=7,
    class_weight='balanced_subsample',
    random_state=0,
)
rdf_clf.fit(X_scaled, y_train)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=7,
                       n_estimators=500, random_state=0)

In [13]:
# Score the hold-out split with the scaler fitted on the training split.
X_test_scaled = scaler.transform(X_test)
y_pred = rdf_clf.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
for caption, metric in (
    ('Recall :', recall_score),
    ('Precision :', precision_score),
    ('F1 score :', f1_score),
):
    print(caption, metric(y_test, y_pred))

[[14720  6600]
 [   86  3594]]
Recall : 0.9766304347826087
Precision : 0.35256032960565037
F1 score : 0.5180913939743405


## Test

In [81]:
# Load the test feature table and show its (rows, columns).
test_features_path = "test_features.csv"
test_features = pd.read_csv(test_features_path)
test_features.shape

(334378, 34)

In [82]:
# NOTE: this rebinds X_test, which until now held the 25% hold-out split
# returned by train_test_split.
X_test = pd.read_csv(filepath_or_buffer="final_test.csv")

In [83]:
# Remove the three "<name>.1" columns from X_test, in place.
X_test.drop(columns=['meals_saved.1', 'total_supply.1', 'item_view.1'], inplace=True)

In [84]:
# Reload the raw sheet: the copy loaded for training had its 'stringency'
# column dropped, and that column is parsed again further down.
stringency = pd.read_excel(io="stringency.xlsx")

In [85]:
# Show (rows, columns) of X_test after the three ".1" columns were dropped.
X_test.shape

(494515, 118)

In [86]:
# Fill gaps in every lag feature (lags 1..15) with the same-row value of its
# base column.
#
# The filled column is assigned back to X_test instead of calling
# fillna(..., inplace=True) on X_test[col]: that chained in-place call only
# modifies a temporary Series under pandas Copy-on-Write, leaving X_test
# unfilled, and raises a chained-assignment warning on recent pandas versions.
s_item = X_test.item_view
s_total = X_test.total_supply
s_meals = X_test.meals_saved

for base_name, base_values in (
    ('item_view', s_item),
    ('total_supply', s_total),
    ('meals_saved', s_meals),
):
    for i in range(1, 16):
        lag_col = base_name + '_lag_' + str(i)
        X_test[lag_col] = X_test[lag_col].fillna(base_values)



In [87]:
# List the columns of X_test that still contain missing values, with counts.
null_counts = X_test.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

In [88]:
# Parse the strings in stringency['stringency'] into a 'date' key column and
# drop the raw column.
stringency = (
    stringency
    .assign(date=pd.to_datetime(stringency['stringency'], format='%d%b%Y'))
    .drop(columns=['stringency'])
)

# Attach the stringency columns by date; rows without a match get 0.
X_test = (
    X_test
    .assign(date=pd.to_datetime(X_test['date']))
    .merge(stringency, how='left', on='date')
    .fillna(0)
)

# Keep the row identifiers aside before removing them from the feature matrix.
to_keep = X_test[['date', 'store_id']]
X_test = X_test.drop(columns=['date', 'store_id'])

In [89]:
# Scale the submission features with the scaler fitted on the training split.
# The columns are selected (and ordered) to match X_train: the training frame
# had its *_lag_* columns dropped before the scaler was fitted, while X_test
# still carries them, so passing X_test as-is would hand the scaler a different
# set of features than it was fitted on. A training column missing from X_test
# surfaces here as a KeyError naming that column.
X_test_scaled = scaler.transform(X_test[X_train.columns])

In [90]:
# Per-class probabilities and hard labels from the fitted forest.
y_test_prob = rdf_clf.predict_proba(X_test_scaled)
y_test_pred = rdf_clf.predict(X_test_scaled)

In [91]:
# Put identifiers, predicted label and the two probability columns side by
# side; pd.concat(axis=1) lines the pieces up on their row index.
predicted_labels = pd.DataFrame(y_test_pred, columns=['target'])
class_probabilities = pd.DataFrame(y_test_prob, columns=['bhdie', 'score'])
y_test_final = pd.concat([to_keep, predicted_labels, class_probabilities], axis=1)


In [92]:
# Display the assembled prediction table.
# NOTE(review): the saved output below shows a `label` column, while the cell
# that builds this frame names that column `target` — the output looks stale;
# confirm on a fresh re-run.
y_test_final

Unnamed: 0,date,store_id,label,bhdie,score
0,2020-04-29,75480.0,1,0.009027,0.990973
1,2020-04-30,75480.0,0,0.930725,0.069275
2,2020-05-01,75480.0,1,0.025599,0.974401
3,2020-04-28,75409.0,1,0.013179,0.986821
4,2020-04-29,75409.0,1,0.103443,0.896557
...,...,...,...,...,...
494510,2020-04-27,87.0,1,0.005755,0.994245
494511,2020-04-28,87.0,1,0.005525,0.994475
494512,2020-04-29,87.0,1,0.005571,0.994429
494513,2020-04-30,87.0,1,0.005363,0.994637


In [93]:
# Bring both date columns to datetime so the join keys compare equal, then pull
# the prediction and score for every row of test_features.
join_keys = ['date', 'store_id']
test_features['date'] = pd.to_datetime(test_features['date'])
y_test_final['date'] = pd.to_datetime(y_test_final['date'])
res = (
    test_features
    .merge(y_test_final, how='left', on=join_keys)
    [['index', 'target', 'score']]
)

In [94]:
# Round the submitted score to 5 decimal places.
res['score'] = res['score'].round(5)

In [143]:
def _frame_to_csv_text(df):
    """Return ``df`` as CSV text without the index, using LF line endings."""
    buffer = StringIO()
    try:
        df.to_csv(buffer, index=False, lineterminator="\n")
    except TypeError:
        # pandas < 1.5 only accepts the older ``line_terminator`` spelling.
        # Start from a fresh buffer so nothing can be written twice.
        buffer = StringIO()
        df.to_csv(buffer, index=False, line_terminator="\n")
    return buffer.getvalue()


def submit_random(url, password, version, df=None):
    """POST a predictions table to the submission endpoint.

    Parameters
    ----------
    url : str
        Full URL of the endpoint to post to.
    password : str
        Sent as the "password" field of the payload.
    version : str
        Sent as the "version" field of the payload.
    df : pandas.DataFrame, optional
        Table to submit as CSV. Defaults to the notebook-level ``res`` frame.

    Returns
    -------
    requests.Response
        The object returned by ``requests.post``; its status is not checked here.
    """
    if df is None:
        df = res

    data = {
      "name": "Coni",
      "format": "df",
      "team": "Equipe3",
      "project": "tgtg",
      # Previously hard-coded to "16"; the caller's value is now what gets sent.
      "version": str(version),
      "content": _frame_to_csv_text(df),
      # Previously a literal duplicated here; the caller's value is now used.
      "password": password,
    }

    # NOTE(review): verify=False turns off TLS certificate verification for this
    # request. Kept as-is because the endpoint is addressed by bare IP; confirm
    # whether a CA bundle can be supplied instead.
    response = requests.post(url, json=data, verify=False)
    return response

url = "https://51.159.6.59:8798/"
# NOTE(review): this credential is stored in the notebook in clear text. Load it
# from an environment variable or a prompt instead, and rotate it.
password = "Ensae06CPdist"

# "16" is the version reported in the saved server response for this submission.
response = submit_random(url + "submit/", password, "16")
pprint.pprint(response.json())



{'date': '2021-04-10T13:31:54.875128',
 'format': 'df',
 'metadata': '{"client": ["37.166.54.119", 48510]}',
 'name': 'Coni',
 'project': 'tgtg',
 'team': 'Equipe3',
 'version': '16'}
