In [571]:
# imporatation des librairies 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [572]:
# Chargement des données d'entrainement
df = pd.read_csv("Telecom-Data-1.csv", sep=';')

In [573]:
# Affichage des lignes
df.head() 

Unnamed: 0,ID,Genre,Senior,Enfants,Anciennete,Multi-lignes,Service Internet,Autres Services,Partenaire,Contrat,Facturation electronique,Mode de paiement,charges mensuelles,Charges totales,Desabonnement
0,1,Femme,0,Non,1,Non,DSL,Non,Oui,Mensuel,Oui,Cheque electronique,2985,2985,Non
1,2,Homme,0,Non,34,Non,DSL,Oui,Non,Annuel,Non,Cheque papier,5695,18895,Non
2,3,Homme,0,Non,2,Non,DSL,Oui,Non,Mensuel,Oui,Cheque papier,5385,10815,Oui
3,4,Homme,0,Non,45,Non,DSL,Non,Non,Annuel,Non,Virement bancaire,423,184075,Non
4,5,Femme,0,Non,2,Non,Fibre optique,Oui,Non,Mensuel,Oui,Cheque electronique,707,15165,Oui


In [574]:
# Typages des colonnes 
df.dtypes

ID                           int64
Genre                       object
Senior                       int64
Enfants                     object
Anciennete                   int64
Multi-lignes                object
Service Internet            object
Autres Services             object
Partenaire                  object
Contrat                     object
Facturation electronique    object
Mode de paiement            object
charges mensuelles          object
Charges totales             object
Desabonnement               object
dtype: object

In [575]:
# Remplacemment des , par . pour un forçage numérique 
df['charges mensuelles'] = df['charges mensuelles'].str.replace(",", ".")
df['Charges totales'] = df['charges mensuelles'].str.replace(",", ".")

In [576]:
# Vérification 
df.head()

Unnamed: 0,ID,Genre,Senior,Enfants,Anciennete,Multi-lignes,Service Internet,Autres Services,Partenaire,Contrat,Facturation electronique,Mode de paiement,charges mensuelles,Charges totales,Desabonnement
0,1,Femme,0,Non,1,Non,DSL,Non,Oui,Mensuel,Oui,Cheque electronique,29.85,29.85,Non
1,2,Homme,0,Non,34,Non,DSL,Oui,Non,Annuel,Non,Cheque papier,56.95,56.95,Non
2,3,Homme,0,Non,2,Non,DSL,Oui,Non,Mensuel,Oui,Cheque papier,53.85,53.85,Oui
3,4,Homme,0,Non,45,Non,DSL,Non,Non,Annuel,Non,Virement bancaire,42.3,42.3,Non
4,5,Femme,0,Non,2,Non,Fibre optique,Oui,Non,Mensuel,Oui,Cheque electronique,70.7,70.7,Oui


In [577]:
# Forçage en float 
df['charges mensuelles']= df['charges mensuelles'].astype(float) 
df['Charges totales']= df['Charges totales'].astype(float)

In [578]:
df.dtypes

ID                            int64
Genre                        object
Senior                        int64
Enfants                      object
Anciennete                    int64
Multi-lignes                 object
Service Internet             object
Autres Services              object
Partenaire                   object
Contrat                      object
Facturation electronique     object
Mode de paiement             object
charges mensuelles          float64
Charges totales             float64
Desabonnement                object
dtype: object

In [579]:
# Fonction valeurs manquantes pour l'apprentissage 
def valeur_manquante(data):
    nb_rows=data.shape[0]
    ret=pd.DataFrame(data.isna().sum(),columns=['nb_manquant'])
    ret['pourcentage']=(ret['nb_manquant']/nb_rows)*100
    return ret

In [580]:
valeur_manquante(df)

Unnamed: 0,nb_manquant,pourcentage
ID,0,0.0
Genre,0,0.0
Senior,0,0.0
Enfants,0,0.0
Anciennete,0,0.0
Multi-lignes,0,0.0
Service Internet,0,0.0
Autres Services,0,0.0
Partenaire,0,0.0
Contrat,0,0.0


In [581]:
# Supressioon lignes ou il ya des valeurs manquentes 
df = df.dropna()

In [582]:
valeur_manquante(df)

Unnamed: 0,nb_manquant,pourcentage
ID,0,0.0
Genre,0,0.0
Senior,0,0.0
Enfants,0,0.0
Anciennete,0,0.0
Multi-lignes,0,0.0
Service Internet,0,0.0
Autres Services,0,0.0
Partenaire,0,0.0
Contrat,0,0.0


In [583]:
### create two dataframe on containt categorial variables  and other containt  numeric variable 
categorical_data = df[df.select_dtypes(['object']).columns]
numerical_data   = df[df.select_dtypes(['int64','float64']).columns]

In [584]:
def categorical_transform(data):
    encoder = LabelEncoder()

    # Tranform
    for categorical in list(categorical_data.columns) :
        data.loc[:,categorical] = encoder.fit_transform(data.loc[:,categorical])
        
    return data 

In [585]:
def num_transform(data):
    encoder = StandardScaler()
    data = np.array(data).reshape(-1, 1)
    # Tranform
    for num in list(numerical_data.columns) :
        data.loc[:,num] = encoder.fit_transform(data.loc[:,num])
        
    return data

In [586]:
# Fonction qui encode les valeurs catégorielles et numériques 
def encode_columns(data):
    encoder_cat = LabelEncoder()
    encoder_num = StandardScaler()
    try:
        data = data.drop('ID', axis=1)
    except:
        pass
    for column in data.columns:
        if data[column].dtype == 'object':
            data[column] = encoder_cat.fit_transform(data[column])
        else:
            data[column] = encoder_num.fit_transform(data[[column]])
            
    return data

In [587]:
data_encoder= encode_columns(df)
data_encoder

Unnamed: 0,Genre,Senior,Enfants,Anciennete,Multi-lignes,Service Internet,Autres Services,Partenaire,Contrat,Facturation electronique,Mode de paiement,charges mensuelles,Charges totales,Desabonnement
0,0,-0.440336,0,-1.278111,0,0,0,1,2,1,1,-1.158195,-1.158195,0
1,1,-0.440336,0,0.061420,0,0,1,0,0,0,2,-0.260482,-0.260482,0
2,1,-0.440336,0,-1.237519,0,0,1,0,2,1,2,-0.363172,-0.363172,1
3,1,-0.440336,0,0.507930,0,0,0,0,0,0,3,-0.745777,-0.745777,0
4,0,-0.440336,0,-1.237519,0,1,1,0,2,1,1,0.194999,0.194999,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,-0.440336,0,-0.547457,0,1,1,0,2,1,3,0.460007,0.460007,0
5982,0,-0.440336,0,1.603910,0,2,1,0,1,1,3,-1.446390,-1.446390,0
5983,1,-0.440336,1,-0.344498,1,0,1,1,0,1,2,0.662075,0.662075,0
5984,0,-0.440336,1,-0.872192,0,0,0,1,2,1,1,-1.166476,-1.166476,0


In [588]:
X =  data_encoder.drop(columns=['Desabonnement'])
y =  data_encoder[['Desabonnement']]



In [589]:
# Splitting the dataset into the Training and Test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, 
                                                    random_state = 42)

In [590]:
# Instantiate the classifier
clf = LogisticRegression()
  
# Fit to the training data
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [591]:
# Predict the labels for the test set
y_pred = clf.predict(X_test)

In [592]:
# Compute accuracy
from sklearn.metrics import accuracy_score
  
accuracy_score(y_test, y_pred)

0.8070175438596491

In [593]:
# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [594]:
#Defining the modelling function
def modeling(alg, alg_name, params={}):
    model = alg(**params) #Instantiating the algorithm class and unpacking parameters if any
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #Performance evaluation
    def print_scores(alg, y_true, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_true, y_pred)
        print("accuracy: ",acc_score)
        pre_score = precision_score(y_true, y_pred)
        print("precision: ",pre_score)
        rec_score = recall_score(y_true, y_pred)
        print("recall: ",rec_score)
        f_score = f1_score(y_true, y_pred, average='weighted')
        print("f1_score: ",f_score)

    print_scores(alg, y_test, y_pred)
    return model

# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')

Logistic Regression
accuracy:  0.8070175438596491
precision:  0.6694915254237288
recall:  0.5080385852090032
f1_score:  0.7977056454831702


  y = column_or_1d(y, warn=True)


In [595]:
# Feature selection to improve model building
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
log = LogisticRegression()
rfecv = RFECV(estimator=log, cv=StratifiedKFold(10, random_state=50, shuffle=True), scoring="accuracy")

rfecv.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

KeyboardInterrupt: 

In [None]:
print("The optimal number of features: {}".format(rfecv.n_features_))

The optimal number of features: 10


In [None]:
X_rfe = X.iloc[:, rfecv.support_]
y_rfe = X.iloc[:, rfecv.support_]
#Overview of the optimal features in comparison with the intial dataframe
print("'X' dimension: {}".format(X.shape))
print("'X' column list:", X.columns.tolist())
print("X_rfe dimension: {}".format(X_rfe.shape))
print("X_rfe column list:", X_rfe.columns.tolist())

'X' dimension: (5985, 13)
'X' column list: ['Genre', 'Senior', 'Enfants', 'Anciennete', 'Multi-lignes', 'Service Internet', 'Autres Services', 'Partenaire', 'Contrat', 'Facturation electronique', 'Mode de paiement', 'charges mensuelles', 'Charges totales']
X_rfe dimension: (5985, 10)
X_rfe column list: ['Senior', 'Enfants', 'Anciennete', 'Multi-lignes', 'Service Internet', 'Autres Services', 'Contrat', 'Facturation electronique', 'charges mensuelles', 'Charges totales']


In [None]:

df_predict = pd.read_csv("Telecom-Evaluation.csv" ,sep=";")
data_encoder= data_encoder.drop('Desabonnement', axis=1)
df_predict = df_predict[list(data_encoder.columns)]

X_predict = encode_columns(df_predict)
print(X_predict.columns)
print(data_encoder.columns)
prediction = clf.predict(X_predict)












Index(['Genre', 'Senior', 'Enfants', 'Anciennete', 'Multi-lignes',
       'Service Internet', 'Autres Services', 'Partenaire', 'Contrat',
       'Facturation electronique', 'Mode de paiement', 'charges mensuelles',
       'Charges totales'],
      dtype='object')
Index(['Genre', 'Senior', 'Enfants', 'Anciennete', 'Multi-lignes',
       'Service Internet', 'Autres Services', 'Partenaire', 'Contrat',
       'Facturation electronique', 'Mode de paiement', 'charges mensuelles',
       'Charges totales'],
      dtype='object')


In [None]:
print(X_predict)

      Genre    Senior  Enfants  Anciennete  Multi-lignes  Service Internet  \
0         0 -0.436251        0   -0.902385             0                 0   
1         0 -0.436251        0   -0.155397             1                 1   
2         1 -0.436251        1    0.716090             0                 0   
3         1 -0.436251        1    0.633091             1                 1   
4         0 -0.436251        0   -1.234380             0                 1   
...     ...       ...      ...         ...           ...               ...   
1052      0 -0.436251        0   -0.943885             0                 0   
1053      1  2.292256        0    0.965086             1                 0   
1054      0 -0.436251        0   -0.819387             0                 0   
1055      0 -0.436251        1    1.670575             1                 1   
1056      1 -0.436251        0    1.421579             0                 1   

      Autres Services  Partenaire  Contrat  Facturation electro

In [None]:
prediction

array([1, 1, 1, ..., 1, 1, 1])

In [None]:

prediction_df = pd.DataFrame(prediction, columns=["Desabonnement_prediction"])
print(prediction_df)
prediction_df.to_csv("prediction.csv", index=False)

      Desabonnement_prediction
0                            1
1                            1
2                            1
3                            1
4                            1
...                        ...
1052                         1
1053                         1
1054                         1
1055                         1
1056                         1

[1057 rows x 1 columns]


In [None]:
## Improve best model by hyperparameter tuning
# define model
model = LogisticRegression()

# define evaluation
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
from scipy.stats import loguniform
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 1000)

# define search
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

# execute search
result = search.fit(X_rfe, y)
# summarize result
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)
params = result.best_params_

#Improving the Logistic Regression model
log_model = modeling(LogisticRegression, 'Logistic Regression Classification', params=params)