In [1]:
import pandas as pd
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


In [2]:
import numpy as np

In [3]:
# dataset = pd.read_csv('msc_data_cleaned.csv')
dataset = pd.read_csv('msc_data_cleaned_new1.csv')

In [4]:
def clean_age(x):
    if '51-60' in x or '60+' in x:
        return '51-60'
    if '41-45' in x or '46-50' in x:
        return '41-50'
    if '31-35' in x or '36-40' in x:
        return '31-40'
    if '21-25' in x or '26-30' in x:
        return '21-30'
    return 'Under 20'

dataset['age_cat'] = dataset['age_cat'].apply(clean_age)

In [5]:
dataset = dataset.dropna(subset=['device_type'])

In [6]:
def clean_deviceType(x):
    if 'BASIC' in x:
        return 'BASIC'
    if 'SMART' in x:
        return 'SMART'
    if 'FEATURE' in x:
        return 'FEATURE'

dataset['device_type'] = dataset['device_type'].apply(clean_deviceType)

In [7]:
sin= 2078.0
eng = 2002.0
# tam = 2079.0
def clean_language(x):
    if sin in [x]:
        return 'Sinhala'
    if eng in [x]:
        return 'English'
    return 'Tamil'

dataset['language_id'] = dataset['language_id'].apply(clean_language)

In [8]:
def clean_connectionAge(x):
    if 'more than 5 years' in x:
        return 'more than 5 years'
    if '3-5 years' in x:
        return '3-5 years'
    if '2-3 years' in x or '1-2 years' in x:
        return '1-3 years'
    return 'less than 1'

dataset['con_age'] = dataset['con_age'].apply(clean_connectionAge)

In [9]:
from sklearn.preprocessing import LabelEncoder
# labelEncoder = LabelEncoder()
dataTransform = dataset.copy()

In [10]:
le_language = LabelEncoder()
dataTransform['language_id'] = le_language.fit_transform(dataset['language_id'])
le_age = LabelEncoder()
dataTransform['age_cat'] = le_age.fit_transform(dataset['age_cat'])
le_conAge = LabelEncoder()
dataTransform['con_age'] = le_conAge.fit_transform(dataset['con_age'])
le_device = LabelEncoder()
dataTransform['device_type'] = le_device.fit_transform(dataset['device_type'])
le_package = LabelEncoder()
dataTransform['package'] = le_package.fit_transform(dataset['package'])
dataTransform["package"].unique()

array([2, 1, 0])

In [11]:
dataTransform.shape

(21781, 10)

In [12]:
# dataTransform.shape

In [13]:
dtf = dataTransform['package']
# Min-Max Normalization
df = dataTransform.drop('package', axis=1)
df_norm = (df-df.min())/(df.max()-df.min())

df_norm.head()

Unnamed: 0,gender,age_cat,con_age,language_id,total_voice_usage_min,total_data_usage_mb,device_type,total_revenue,vas_revenue
0,0.0,0.75,0.666667,0.0,0.00846,0.098454,1.0,0.019094,0.003322
2,0.0,0.25,0.0,0.5,0.015958,0.059166,1.0,0.019646,0.002089
3,1.0,0.5,0.666667,0.0,0.014263,2e-06,1.0,0.003819,0.00225
4,1.0,0.5,0.0,0.0,0.001694,0.165021,1.0,0.026097,0.00278
6,0.0,0.75,0.666667,0.0,0.091588,0.0,0.5,0.017776,0.001853


In [14]:
def outliers(df,feature):
    Q1 = df[feature].quantile(0.05)
    Q3 = df[feature].quantile(0.95)
    IQR = Q3 - Q1
    
    lowerBound = Q1 - 1.5*IQR
    uperBound  = Q3 + 1.5*IQR
    
    ls = df.index[(df[feature]<lowerBound) | (df[feature]>uperBound)]
    return ls

In [15]:
index_list = []
for f in ['total_voice_usage_min','total_data_usage_mb','total_revenue']:
    index_list.extend(outliers(df_norm,f))

In [16]:
def remove(df,ls):
    ls = sorted(set(ls))
    df =df.drop(ls)
    return df

In [17]:
cleand_df = remove(df_norm,index_list)

In [18]:
cleand_df.shape

(21584, 9)

In [19]:
result = pd.concat([cleand_df, dtf], axis=1, join='inner')
display(result)

Unnamed: 0,gender,age_cat,con_age,language_id,total_voice_usage_min,total_data_usage_mb,device_type,total_revenue,vas_revenue,package
0,0.0,0.75,0.666667,0.0,0.008460,9.845378e-02,1.0,0.019094,0.003322,2
2,0.0,0.25,0.000000,0.5,0.015958,5.916630e-02,1.0,0.019646,0.002089,1
3,1.0,0.50,0.666667,0.0,0.014263,1.697807e-06,1.0,0.003819,0.002250,2
4,1.0,0.50,0.000000,0.0,0.001694,1.650209e-01,1.0,0.026097,0.002780,1
6,0.0,0.75,0.666667,0.0,0.091588,0.000000e+00,0.5,0.017776,0.001853,1
...,...,...,...,...,...,...,...,...,...,...
26274,0.0,0.75,1.000000,0.0,0.035557,2.091892e-02,1.0,0.014661,0.002627,0
26276,1.0,0.75,1.000000,0.5,0.024807,0.000000e+00,0.5,0.005106,0.002307,1
26277,1.0,0.75,1.000000,0.5,0.068350,0.000000e+00,0.0,0.012717,0.001670,1
26278,0.0,0.75,0.333333,0.0,0.052699,3.995747e-03,1.0,0.009455,0.002639,1


In [20]:
y = np.array(result.iloc[:,9:])
X = np.array(result.iloc[:,0:9])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [21]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
X, y=smote.fit_resample(X_train, y_train)

In [22]:
KNN = KNeighborsClassifier()    # initialising KNN
NB = GaussianNB()               # initialising NB Classifier
CART = DecisionTreeClassifier() # initialising CART

In [23]:
model_KNN = KNN.fit(X_train, y_train)   
pred_knn = model_KNN.predict(X_test)   

model_NaiveBayes = NB.fit(X_train, y_train)
pred_nb = model_NaiveBayes.predict(X_test)

model_CART = CART.fit(X_train, y_train)
pred_cart = model_CART.predict(X_test)

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


In [24]:
accuracy_knn = accuracy_score(y_test, pred_knn)  # evaluating accuracy score
print('accuracy score of KNeighbors Classifier is:', accuracy_knn * 100)

accuracy_nb = accuracy_score(y_test, pred_nb)
print('Accuracy of Naive Bayes Classifier:', accuracy_nb * 100)

accuracy_cart = accuracy_score(y_test, pred_cart)
print('Accuracy of CART Classifier:', accuracy_cart * 100)

accuracy score of KNeighbors Classifier is: 72.68375540457073
Accuracy of Naive Bayes Classifier: 74.6911673872761
Accuracy of CART Classifier: 73.76466954910438


In [25]:
lr = LogisticRegression()  
clf_stack = StackingClassifier(classifiers =[KNN, NB, CART], meta_classifier = lr, use_probas = True, 
                               use_features_in_secondary = True)

In [26]:
model_stack = clf_stack.fit(X_train, y_train)   # training of stacked model
pred_stack = model_stack.predict(X_test)       # predictions on test data using stacked model

accuracy_stack = accuracy_score(y_test, pred_stack)  # evaluating accuracy
print('accuracy score of Stacked model:', accuracy_stack * 100)

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


accuracy score of Stacked model: 74.33600988264361


In [27]:
import pickle
# pickle.dump(rf, open('randomForest.pkl', 'wb'))
data = {"model": model_stack, "le_age": le_age, "le_device":le_device, "le_language":le_language, "le_conAge":le_conAge, "le_package":le_package}
with open('stack.pkl', 'wb') as file:
    pickle.dump(data, file)