<a href="https://colab.research.google.com/github/Patrick5455/Customer-Churn-Prediction/blob/master/modelling/modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import some libraries
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams 
import seaborn as sns
import os
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 8,8
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline 

> Data Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE

> > Project packages

In [None]:
# from project_package.data_package import data
# from project_package.model_package import model, model_metrics
# from project_package.plot_package import bi_plot, uni_plot
# 

In [None]:
# ?KNNImputer

> Load Datasets

In [None]:
# Locate the data directory once instead of hard-coding the Colab path:
# Colab uploads live in /content, while a local Jupyter checkout keeps the
# same CSVs in ../datasets. If neither location holds the training file we
# fall back to /content so the resulting error names the original path.
DATA_DIR = next(
    (folder for folder in ('/content', '../datasets')
     if os.path.exists(os.path.join(folder, 'AIMS_Train.csv'))),
    '/content',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'AIMS_Train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'AIMS_Test.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'AIMS_SampleSubmission.csv'))
variables = pd.read_csv(os.path.join(DATA_DIR, 'VariableDefinitions.csv'))

In [None]:
#import data - jupyter localhost

# train = pd.read_csv('../datasets/AIMS_Train.csv')
# test=  pd.read_csv('../datasets/AIMS_Test.csv')
# submission = pd.read_csv('../datasets/AIMS_SampleSubmission.csv')
# variables = pd.read_csv('../datasets/VariableDefinitions.csv')

In [None]:
variables

In [None]:
# Discard three columns, then keep a reproducible 30% random sample of the
# training rows (fixed seed) and preview the result.
train_dropped_cols = ['REGION', 'MRG', 'TOP_PACK']
train = train.drop(columns=train_dropped_cols).sample(frac=0.3, random_state=123)
train.head()

In [None]:
train.tail()

In [None]:
train.shape

In [None]:
#view the data types in the train data
train.info()

In [None]:
# Mirror the training-set preparation: discard the same three columns and
# keep a reproducible 30% random sample of the rows.
# NOTE(review): sampling the test frame leaves fewer rows than were loaded;
# confirm this is intended before predictions are matched to the submission file.
test_dropped_cols = ['REGION', 'MRG', 'TOP_PACK']
test = test.drop(columns=test_dropped_cols).sample(frac=0.3, random_state=123)
test.head()

In [None]:
test.tail()

In [None]:
test.shape

In [None]:
test.info()

In [None]:
submission.head()

In [None]:
submission.shape

#### Data Wrangling

> Rename columns to match English names

In [None]:
train.columns

> Calculate the number of levels in categorical variables

In [None]:
def check_levels(data, exclude_cols=None):
    """Print and plot the level frequencies of each categorical column.

    For every object-dtype column of ``data`` that is not listed in
    ``exclude_cols``, the raw ``value_counts()`` are printed between two
    separator lines and a bar chart of the *normalized* frequencies is drawn
    on its own 10x5 figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are inspected.
    exclude_cols : container of str, optional
        Column names to skip. ``None`` (the default) skips nothing.

    Returns
    -------
    None
    """
    # None replaces the former mutable ``[]`` default; any container the
    # caller passes is still used as-is for the membership test.
    excluded = () if exclude_cols is None else exclude_cols
    cat_cols = data.select_dtypes(include='object').columns.tolist()

    for col in cat_cols:
        if col in excluded:
            continue

        print('*-*'*10,'\n')
        print(data[col].value_counts())
        print('*-*'*10,'\n')

        # Explicit figure/axes so each column gets its own labelled chart.
        fig, ax = plt.subplots(figsize=(10, 5))
        data[col].value_counts(normalize=True).plot(kind='bar', ax=ax)
        ax.set_title(f"Value Counts of {col}", fontsize=15)
        # The bars are normalized frequencies, so the axis is a proportion,
        # not a count.
        ax.set_ylabel('proportion')
        ax.set_xlabel(col)

In [None]:
check_levels(train, exclude_cols=['user_id'])

Compute statistics for numerical variables

In [None]:
train.describe() 

> Check for null values

In [None]:
def check_null(data, plot=True):
    """Print the number of null values per column and optionally chart them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    plot : bool, default True
        When true, also draw a bar chart of the per-column null counts on a
        new 10x5 figure.
    """
    null_counts = data.isnull().sum()
    print(null_counts)

    if not plot:
        return

    plt.figure(figsize=(10,5))
    plt.title("Null Values Count", fontsize=15)
    null_counts.plot(kind='bar')

In [None]:
check_null(train)

In [None]:
check_null(test)

> There are a lot of null values in the dataset

> Check for outliers

In [None]:
def check_outliers(data, show_plot=False, save_img=os.getcwd()+'/outliers.png'):
    """Find IQR outliers in the numeric columns of ``data``.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; non-numeric columns are ignored.
    show_plot : bool, default False
        When true, draw a seaborn pairplot of the columns that contain
        outliers, print the per-column outlier flags and save the figure.
    save_img : str
        Path the PNG is written to when ``show_plot`` is true.

    Returns
    -------
    The seaborn pairplot object when ``show_plot`` is true; otherwise a
    DataFrame holding only the rows with at least one outlier, restricted to
    the columns that contain outliers (empty when there are none).
    """
    # Quantiles are taken over the numeric columns only: computing them on
    # the full frame raises in recent pandas when object columns are present.
    num_data = data.select_dtypes(include='number')
    Q1 = num_data.quantile(0.25)
    Q3 = num_data.quantile(0.75)
    IQR = Q3 - Q1
    is_outlier = (num_data < (Q1 - 1.5 * IQR)) | (num_data > (Q3 + 1.5 * IQR))

    # Column name -> whether that column holds at least one outlier.
    result = dict(is_outlier.any())
    outliers = [col for col, flagged in result.items() if flagged]

    if show_plot:
        outlier_pair_plot = sns.pairplot(data[outliers])
        print(f'{result},\n\n Visualization of outlier columns')
        plt.savefig(fname=save_img, format='png')
        return outlier_pair_plot

    # Select rows with a 1-D boolean mask. Indexing the frame with the 2-D
    # mask keeps every row (non-matching cells just become NaN), so its index
    # cannot be used to pick out the outlier rows.
    outlier_rows = is_outlier.any(axis=1)
    return data.loc[outlier_rows, outliers]

outliers in train set

In [None]:
train_outliers = check_outliers(train, show_plot=False)

In [None]:
train_outliers

> most columns in the train set contain outlier values

outliers in test set

In [None]:
test_outliers = check_outliers(test, show_plot=False) 

In [None]:
test_outliers

> check for duplicate values

In [None]:
train.duplicated().any()

In [None]:
test.duplicated().any()

> check for class imbalance

In [None]:
# Tabulate the CHURN target once, print the counts and chart them.
churn_counts = train['CHURN'].value_counts()
print("Class Imbalance\n")
print(churn_counts, "\n")
churn_counts.plot(kind='bar');

> no null values 

##### Data Cleaning

- Treat outliers with Median Imputation

- Treat null values in numerical features with KNN, and categorical features with mode

- Use Pipeline

> Median Imputation for Outlier values

In [None]:
def treat_numeric_outlier(data, lower_q=0.05, upper_q=0.95, whisker=1.5):
    """Replace outliers in every numeric column with that column's median.

    For each numeric column the fences are::

        low  = q(lower_q) - whisker * (q(upper_q) - q(lower_q))
        high = q(upper_q) + whisker * (q(upper_q) - q(lower_q))

    Values strictly outside ``[low, high]`` are overwritten with the column
    median. NaN values compare false against both fences and are left as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    lower_q, upper_q : float
        Quantiles that define the spread (defaults: 5th and 95th percentile).
    whisker : float
        Multiple of the spread added beyond each quantile.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with outliers replaced.
    """
    for col in data.columns.tolist():
        if not is_numeric_dtype(data[col]):
            continue

        median = data[col].quantile(0.50)
        q_low = data[col].quantile(lower_q)
        q_high = data[col].quantile(upper_q)
        spread = q_high - q_low

        # Keep the fences as floats: truncating them to int distorts
        # small-scale features and fails outright on an all-NaN column.
        high = q_high + whisker * spread
        low = q_low - whisker * spread

        # Treat both tails; values above `high` or below `low` become the median.
        is_outlier = (data[col] > high) | (data[col] < low)
        data[col] = np.where(is_outlier, median, data[col])
    return data

In [None]:
train = treat_numeric_outlier(train)

In [None]:
check_outliers(train)

no more outlier values

In [None]:
train

> Create a pipeline to clean and preprocess numeric and categorical values

In [None]:
# Names of the object-dtype columns, with user_id left out.
object_cols = train.select_dtypes(include='object')
cat_features = object_cols.drop(columns='user_id').columns.tolist()
cat_features

In [None]:
# Names of the int64/float64 columns, with the CHURN target left out.
numeric_cols = train.select_dtypes(include=['int64', 'float64'])
num_features = numeric_cols.drop(columns='CHURN').columns.tolist()
num_features

In [None]:
# ?LabelEncoder

In [None]:
# Categorical branch (mode imputation + one-hot encoding) is currently disabled:
# cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
#                               ('encode', OneHotEncoder())])

# Numeric branch: robust scaling first, then KNN imputation of missing values.
numeric_steps = [
    ('scaler', RobustScaler()),
    ('imputer', KNNImputer(copy=False)),
]
num_transform = Pipeline(steps=numeric_steps)

# Only the numeric features are transformed; every other column is passed
# through unchanged by remainder='passthrough'.
column_branches = [
    # ('cat_transform', cat_transform, cat_features),
    ('num_transform', num_transform, num_features),
]
data_transform = ColumnTransformer(
    transformers=column_branches,
    remainder='passthrough',
    verbose=True,
)

preprocess = Pipeline(steps=[('transform', data_transform)], verbose=True)

In [None]:
train

In [None]:
preprocess.fit(train)

#### Modelling

- Build Pipeline

In [None]:
# ('smote', SMOTE(random_state=123)

In [None]:
# def fill_na(data):
#     for col in data.columns.tolist():
#         if data[col].isna().any():
#             data[col].fillna()
            

In [None]:
def treat_outlier(data, contamination='auto', random_state=None):
    """Drop the rows that an Isolation Forest flags as outliers.

    The forest is fitted on the non-object columns of ``data``; rows it
    labels ``-1`` are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    contamination : 'auto' or float, default 'auto'
        Forwarded to ``IsolationForest``.
    random_state : int or None, default None
        Forwarded to ``IsolationForest``; pass an int for repeatable results.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that were not flagged as outliers.
    """
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    yhat = iso.fit_predict(data.select_dtypes(exclude='object'))
    print(yhat)
    # Keep every row that is not labelled -1 (outlier) and hand the filtered
    # frame back to the caller; rebinding a local would discard it.
    mask = yhat != -1
    return data[mask]

In [None]:
# treat_outlier(train) 