In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

dataset = pd.read_csv('msc_data_cleaned_new1.csv')

In [2]:
def clean_age(x):
    if '51-60' in x or '60+' in x:
        return '51-60'
    if '41-45' in x or '46-50' in x:
        return '41-50'
    if '31-35' in x or '36-40' in x:
        return '31-40'
    if '21-25' in x or '26-30' in x:
        return '21-30'
    return 'Under 20'

dataset['age_cat'] = dataset['age_cat'].apply(clean_age)

In [3]:
dataset = dataset.dropna(subset=['device_type'])

In [4]:
def clean_deviceType(x):
    if 'BASIC' in x:
        return 'BASIC'
    if 'SMART' in x:
        return 'SMART'
    if 'FEATURE' in x:
        return 'FEATURE'

dataset['device_type'] = dataset['device_type'].apply(clean_deviceType)

In [5]:
sin= 2078.0
eng = 2002.0
# tam = 2079.0
def clean_language(x):
    if sin in [x]:
        return 'Sinhala'
    if eng in [x]:
        return 'English'
    return 'Tamil'

dataset['language_id'] = dataset['language_id'].apply(clean_language)

In [6]:
def clean_connectionAge(x):
    if 'more than 5 years' in x:
        return 'more than 5 years'
    if '3-5 years' in x:
        return '3-5 years'
    if '2-3 years' in x or '1-2 years' in x:
        return '1-3 years'
    return 'less than 1'

dataset['con_age'] = dataset['con_age'].apply(clean_connectionAge)

In [7]:
from sklearn.preprocessing import LabelEncoder
# labelEncoder = LabelEncoder()
dataTransform = dataset.copy()

In [8]:
le_language = LabelEncoder()
dataTransform['language_id'] = le_language.fit_transform(dataset['language_id'])
le_age = LabelEncoder()
dataTransform['age_cat'] = le_age.fit_transform(dataset['age_cat'])
le_conAge = LabelEncoder()
dataTransform['con_age'] = le_conAge.fit_transform(dataset['con_age'])
le_device = LabelEncoder()
dataTransform['device_type'] = le_device.fit_transform(dataset['device_type'])
le_package = LabelEncoder()
dataTransform['package'] = le_package.fit_transform(dataset['package'])
dataTransform["package"].unique()

array([2, 1, 0])

In [9]:
dataTransform.shape

(21781, 10)

In [10]:
dtf = dataTransform['package']
# Min-Max Normalization
df = dataTransform.drop('package', axis=1)
df_norm = (df-df.min())/(df.max()-df.min())
df_norm.head()

Unnamed: 0,gender,age_cat,con_age,language_id,total_voice_usage_min,total_data_usage_mb,device_type,total_revenue,vas_revenue
0,0.0,0.75,0.666667,0.0,0.00846,0.098454,1.0,0.019094,0.003322
2,0.0,0.25,0.0,0.5,0.015958,0.059166,1.0,0.019646,0.002089
3,1.0,0.5,0.666667,0.0,0.014263,2e-06,1.0,0.003819,0.00225
4,1.0,0.5,0.0,0.0,0.001694,0.165021,1.0,0.026097,0.00278
6,0.0,0.75,0.666667,0.0,0.091588,0.0,0.5,0.017776,0.001853


In [11]:
def outliers(df,feature):
    Q1 = df[feature].quantile(0.05)
    Q3 = df[feature].quantile(0.95)
    IQR = Q3 - Q1
    
    lowerBound = Q1 - 1.5*IQR
    uperBound  = Q3 + 1.5*IQR
    
    ls = df.index[(df[feature]<lowerBound) | (df[feature]>uperBound)]
    return ls

In [12]:
index_list = []
for f in ['total_voice_usage_min','total_data_usage_mb','total_revenue']:
    index_list.extend(outliers(df_norm,f))

In [13]:
def remove(df,ls):
    ls = sorted(set(ls))
    df =df.drop(ls)
    return df

In [14]:
cleand_df = remove(df_norm,index_list)

In [15]:
cleand_df.shape

(21584, 9)

In [16]:
result = pd.concat([cleand_df, dtf], axis=1, join='inner')
display(result)

Unnamed: 0,gender,age_cat,con_age,language_id,total_voice_usage_min,total_data_usage_mb,device_type,total_revenue,vas_revenue,package
0,0.0,0.75,0.666667,0.0,0.008460,9.845378e-02,1.0,0.019094,0.003322,2
2,0.0,0.25,0.000000,0.5,0.015958,5.916630e-02,1.0,0.019646,0.002089,1
3,1.0,0.50,0.666667,0.0,0.014263,1.697807e-06,1.0,0.003819,0.002250,2
4,1.0,0.50,0.000000,0.0,0.001694,1.650209e-01,1.0,0.026097,0.002780,1
6,0.0,0.75,0.666667,0.0,0.091588,0.000000e+00,0.5,0.017776,0.001853,1
...,...,...,...,...,...,...,...,...,...,...
26274,0.0,0.75,1.000000,0.0,0.035557,2.091892e-02,1.0,0.014661,0.002627,0
26276,1.0,0.75,1.000000,0.5,0.024807,0.000000e+00,0.5,0.005106,0.002307,1
26277,1.0,0.75,1.000000,0.5,0.068350,0.000000e+00,0.0,0.012717,0.001670,1
26278,0.0,0.75,0.333333,0.0,0.052699,3.995747e-03,1.0,0.009455,0.002639,1


In [17]:
y = np.array(result.iloc[:,9])
X = np.array(result.iloc[:,0:9])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [18]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
X, y=smote.fit_resample(X_train, y_train)

In [19]:
clf = LogisticRegression()

logisticRegression = clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.762816553428042


In [21]:
import pickle
# pickle.dump(rf, open('randomForest.pkl', 'wb'))
data = {"model": clf, "le_age": le_age, "le_device":le_device, "le_language":le_language, "le_conAge":le_conAge, "le_package":le_package}
with open('logisticRegression.pkl', 'wb') as file:
    pickle.dump(data, file)