# Exploratory Data Analysis

In [78]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

Reading the data from CSV

In [2]:
# NOTE(review): absolute, machine-specific location -- the notebook only runs where this
# exact file exists; consider a path relative to a configurable data directory.
DATA_PATH = "C:/Users/shash/Desktop/COursera/Dispatch.csv"
# Row 0 of the CSV supplies the column names.
data = pd.read_csv(DATA_PATH, header=0)

In [3]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,fttpproducttype,wbn_prod_type,ontrequiredind,wbn_ont_required,onttype,premisetype,stbcount,droptype,serviceordertype,classofservicename,...,ismigrate,isnt,isfitof,networkmigrationind,order_type,detailed_order_type,arrivalwindow,diff_cx_onprem,change_state,jobeffortcoeff
0,T,T,Y,Y,SFU,MDU,0,N,C,RES,...,Y,Y,N,N,NEW,MIGRATION,0900-1100,246,C,4.1
1,T,T,Y,Y,SFU,MDU,0,N,C,RES,...,Y,Y,N,N,NEW,MIGRATION,1000-1200,195,C,3.25
2,D,D,Y,Y,SFU,MDU,0,N,N,RES,...,N,N,N,N,NEW,NEW,1300-1500,232,X,3.87
3,D,D,N,Y,SFU,Unknown,0,B,N,RES,...,N,N,N,N,NEW,NEW,1400-1600,95,X,1.58
4,D,D,Y,N,SFU,SFU,0,B,N,RES,...,N,N,N,N,NEW,NEW,1500-1700,96,X,1.6


In [4]:
# (rows, columns) of the frame as loaded.
data.shape

(180647, 37)

In [5]:
# Column names available after loading.
data.columns

Index(['fttpproducttype', 'wbn_prod_type', 'ontrequiredind',
       'wbn_ont_required', 'onttype', 'premisetype', 'stbcount', 'droptype',
       'serviceordertype', 'classofservicename', 'dataordertype',
       'videoordertype', 'wirecentergroupid', 'gponind', 'adlflag',
       'hfwsindicator', 'dispatchreason', 'winbackind', 'winbackocn', 'bdvind',
       'bdvnooflines', 'traveltime', 'rewiringrequired', 'swapontind',
       'drcregion', 'ont_status', 'migrateorderind', 'ismigrate', 'isnt',
       'isfitof', 'networkmigrationind', 'order_type', 'detailed_order_type',
       'arrivalwindow', 'diff_cx_onprem', 'change_state', 'jobeffortcoeff'],
      dtype='object')

#### Replacing placeholder values (e.g. 'Unknown', 'NONE', 'X', 'None') with NaN, then filling them with the column mode

In [6]:
# Several columns use a sentinel string to mean "missing". Map each sentinel to NaN so it can
# be imputed afterwards. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
MISSING_SENTINELS = {
    'premisetype': 'Unknown',
    'wbn_prod_type': 'NONE',
    'stbcount': 'X',
    'dispatchreason': 'None',
    'winbackocn': 'None',
}
for column, sentinel in MISSING_SENTINELS.items():
    data[column] = data[column].replace(sentinel, np.nan)

In [7]:
# Impute the NaNs in each affected column with that column's most frequent value.
for column in ['premisetype', 'wbn_prod_type', 'stbcount', 'dispatchreason', 'winbackocn']:
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)

In [8]:
# Shape check after the fill step.
data.shape

(180647, 37)

In [9]:
# Parse stbcount into a numeric column.
data["stbcount"] = data["stbcount"].pipe(pd.to_numeric)
# Store wirecentergroupid as strings so it is handled as a category label, not a quantity.
data["wirecentergroupid"] = data["wirecentergroupid"].astype(str)

### Preparing data for modelling: Numerical encoding for categorical features

In [10]:
# Integer-encode every categorical column so the scikit-learn estimators can consume it.
#
# Previously a single LabelEncoder was refit for each column, so every fit overwrote the
# previous mapping and the codes could not be decoded afterwards. A separately fitted
# encoder is now kept per column in `label_encoders`
# (decode with `label_encoders[col].inverse_transform(codes)`).
#
# NOTE(review): LabelEncoder numbers classes in sorted order, so the integers impose an
# ordering on nominal features -- consider one-hot encoding for linear or distance-based models.
CATEGORICAL_COLUMNS = [
    'fttpproducttype', 'wbn_prod_type', 'ontrequiredind', 'wbn_ont_required',
    'onttype', 'premisetype', 'droptype', 'serviceordertype',
    'classofservicename', 'dataordertype', 'videoordertype', 'wirecentergroupid',
    'gponind', 'adlflag', 'hfwsindicator', 'dispatchreason',
    'winbackind', 'winbackocn', 'bdvind', 'rewiringrequired',
    'swapontind', 'drcregion', 'ont_status', 'migrateorderind',
    'ismigrate', 'isnt', 'isfitof', 'networkmigrationind',
    'order_type', 'detailed_order_type', 'arrivalwindow', 'change_state',
]

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: `le` used to end up fitted on the last column processed.
le = label_encoders[CATEGORICAL_COLUMNS[-1]]

### Creating buckets for the target variable. We are converting the problem into classification.

In [13]:
# Bucket the continuous effort coefficient by rounding each value up to the next integer;
# that integer is the class label used in the classification section.
data['new_label'] = data['jobeffortcoeff'].map(math.ceil)
# Number of rows falling in each bucket.
data['new_label'].value_counts()

3    58594
4    48673
5    29953
2    23062
6    14873
7     5159
8      333
Name: new_label, dtype: int64

## With and without dropping the variables that are not necessary

In [20]:
# Regression inputs using every available feature.
# Left out: the target, its bucketed copy, and diff_cx_onprem.
# NOTE(review): presumably diff_cx_onprem is excluded because the target is derived from it
# (the sample rows show jobeffortcoeff close to diff_cx_onprem / 60) -- confirm.
non_feature_columns = ['jobeffortcoeff', 'diff_cx_onprem', 'new_label']
feature_columns = data.columns.difference(non_feature_columns)
x = data[feature_columns]
y = data['jobeffortcoeff']
x.shape

(180647, 35)

In [21]:
# Summary statistics of the encoded frame.
data.describe()

Unnamed: 0,fttpproducttype,wbn_prod_type,ontrequiredind,wbn_ont_required,onttype,premisetype,stbcount,droptype,serviceordertype,classofservicename,...,isnt,isfitof,networkmigrationind,order_type,detailed_order_type,arrivalwindow,diff_cx_onprem,change_state,jobeffortcoeff,new_label
count,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,...,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0,180647.0
mean,2.530715,2.528279,0.660504,0.616373,0.990346,2.526795,1.045708,0.727297,0.830349,0.999253,...,0.04015,0.008077,0.006333,0.844514,2.022469,5.446955,201.729926,0.752196,3.362163,3.843839
std,1.799902,1.797871,0.47354,0.48627,0.098007,0.847683,1.338863,0.851672,0.572683,0.027327,...,0.196312,0.089506,0.079327,0.362368,0.581899,4.498967,72.823714,0.431738,1.213728,1.261859
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,91.0,0.0,1.52,2.0
25%,1.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,2.0,0.0,144.0,1.0,2.4,3.0
50%,4.0,4.0,1.0,1.0,1.0,3.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,2.0,6.0,190.0,1.0,3.17,4.0
75%,4.0,4.0,1.0,1.0,1.0,3.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,1.0,2.0,10.0,249.0,1.0,4.15,5.0
max,6.0,6.0,1.0,1.0,2.0,3.0,15.0,2.0,2.0,1.0,...,1.0,1.0,1.0,1.0,3.0,16.0,449.0,1.0,7.48,8.0


In [22]:
# Frequency of each stbcount value.
data.stbcount.value_counts()

0     90794
1     34190
2     28374
3     16283
4      7571
5      2368
6       683
7       259
8        77
9        26
10       15
11        5
15        1
12        1
Name: stbcount, dtype: int64

In [23]:
# Shape check; the frame now also contains the derived new_label column.
data.shape

(180647, 38)

In [27]:
# Feature matrix and target must have the same number of rows.
x.shape, y.shape

((180647, 35), (180647,))

#### Splitting the data into test and train 

In [28]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [29]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((121033, 35), (59614, 35), (121033,), (59614,))

In [34]:
# Baseline: ordinary least squares on the continuous target, all features.
linearRegressor = LinearRegression().fit(X_train, y_train)
yPrediction = linearRegressor.predict(X_test)
# For a regressor, .score() is the coefficient of determination (R^2) on the given data,
# so the figure printed below is R^2 expressed as a percentage, not a classification accuracy.
accuracy = linearRegressor.score(X_test, y_test)
print(accuracy*100,'%')


34.5812826259 %


In [35]:
# Regression inputs with a reduced feature set: besides the target columns and
# diff_cx_onprem, nine further features are left out.
excluded_columns = [
    'jobeffortcoeff', 'diff_cx_onprem', 'new_label',
    'hfwsindicator', 'bdvind', 'swapontind', 'onttype', 'premisetype',
    'migrateorderind', 'ismigrate', 'isnt', 'wbn_prod_type',
]
x = data[data.columns.difference(excluded_columns)]
y = data['jobeffortcoeff']
x.shape

(180647, 26)

In [36]:
# Same 67/33 split and seed as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [37]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((121033, 26), (59614, 26), (121033,), (59614,))

In [38]:
# Ordinary least squares again, this time fitted on whatever split is currently in scope.
linearRegressor = LinearRegression()
linearRegressor = linearRegressor.fit(X_train, y_train)
yPrediction = linearRegressor.predict(X_test)
# .score() on a regressor returns R^2, so the printed percentage is R^2 * 100 rather than
# a classification accuracy.
accuracy = linearRegressor.score(X_test, y_test)
print(accuracy*100,'%')


33.735178778 %


# Classification problem

### Model GBDT

In [61]:
# Classification inputs using every available feature; the target is the bucketed label.
# Left out: both forms of the target and diff_cx_onprem.
non_feature_columns = ['jobeffortcoeff', 'diff_cx_onprem', 'new_label']
feature_columns = data.columns.difference(non_feature_columns)
x = data[feature_columns]
y = data['new_label']
x.shape

(180647, 35)

In [62]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the label buckets are very uneven in size.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [63]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((121033, 35), (59614, 35), (121033,), (59614,))

In [42]:
# Gradient-boosted trees with library defaults; only the seed is pinned.
clf = GradientBoostingClassifier(random_state=0)

In [44]:
# Train on the training split; the fitted estimator's repr is the cell output.
clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [45]:
# Pair each importance score with its feature name, then rank from most to least important.
importance_by_feature = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X_train.columns,
)
feature_importances = importance_by_feature.sort_values(by='importance', ascending=False)

In [46]:
# Show the ranked importance table.
feature_importances

Unnamed: 0,importance
wirecentergroupid,0.14953
drcregion,0.119879
stbcount,0.116163
arrivalwindow,0.070285
wbn_prod_type,0.069169
wbn_ont_required,0.056806
videoordertype,0.056638
dispatchreason,0.036577
winbackocn,0.034237
droptype,0.033511


In [47]:
# Predicted label for every row of the held-out split.
y_pred = clf.predict(X_test)

In [48]:
# Class labels the fitted model knows about.
clf.classes_

array([2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [49]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.41379541718388296

In [50]:
# Number of held-out rows per label.
y_test.value_counts()

3    19537
4    16025
5     9799
2     7598
6     4842
7     1704
8      109
Name: new_label, dtype: int64

In [51]:
# Per-class precision / recall / F1 for the gradient-boosting predictions.
# The label set comes from the fitted model instead of a hand-typed list, so the row names
# always line up with the classes the classifier was trained on, even if the data changes.
class_labels = clf.classes_
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[str(label) for label in class_labels],
))

             precision    recall  f1-score   support

          2       0.47      0.15      0.23      7598
          3       0.46      0.68      0.55     19537
          4       0.37      0.43      0.40     16025
          5       0.33      0.26      0.29      9799
          6       0.36      0.16      0.22      4842
          7       0.34      0.01      0.01      1704
          8       0.00      0.00      0.00       109

avg / total       0.40      0.41      0.38     59614



  'precision', 'predicted', average, warn_for)


In [None]:
# Classification inputs with the reduced feature set: the target columns, diff_cx_onprem
# and nine further features are left out.
# NOTE(review): this cell is saved without an execution count, and the shapes recorded after
# the following split still show 35 feature columns. The saved results of this section
# therefore appear to come from the full feature set -- re-run the notebook top to bottom.
excluded_columns = [
    'jobeffortcoeff', 'diff_cx_onprem', 'new_label',
    'hfwsindicator', 'bdvind', 'swapontind', 'onttype', 'premisetype',
    'migrateorderind', 'ismigrate', 'isnt', 'wbn_prod_type',
]
x = data[data.columns.difference(excluded_columns)]
y = data['new_label']
x.shape

In [52]:
# Re-split whatever x / y are currently in scope, using the same 67/33 ratio and seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [53]:
# Shapes of the four pieces produced by the split.
# NOTE(review): the recorded output lists 35 feature columns; with the reduced feature
# list applied it should differ from the full-feature split -- verify after a clean re-run.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((121033, 35), (59614, 35), (121033,), (59614,))

In [54]:
# Fresh gradient-boosting model with library defaults and a pinned seed.
clf = GradientBoostingClassifier(random_state=0)

In [55]:
# Train on the current training split.
clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [56]:
# Importance score per feature of the refitted model, ranked from highest to lowest.
unsorted_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X_train.columns,
)
feature_importances = unsorted_importances.sort_values(by='importance', ascending=False)

In [57]:
# Predicted label for every row of the held-out split.
y_pred = clf.predict(X_test)

In [58]:
# Fraction of held-out rows whose predicted label equals the true label.
# NOTE(review): the recorded value is identical, digit for digit, to the earlier
# gradient-boosting accuracy -- presumably both runs used the same features; confirm.
accuracy_score(y_test, y_pred)

0.41379541718388296

### Model KNN

In [64]:
from sklearn.neighbors import KNeighborsClassifier

In [65]:
# k-nearest-neighbours classifier that votes over the 60 closest training rows.
# NOTE(review): the features are raw integer codes and counts on different scales;
# distance-based voting is presumably sensitive to that -- consider scaling first.
neigh = KNeighborsClassifier(n_neighbors=60)

In [66]:
# Fit the neighbour model on the current training split.
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=60, p=2,
           weights='uniform')

In [67]:
# KNN predictions for the held-out split (kept separate from y_pred).
pred = neigh.predict(X_test)

In [68]:
# Fraction of held-out rows the KNN model labels correctly.
accuracy_score(y_test, pred)

0.39485691280571678

In [69]:
# Per-class precision / recall / F1 for the KNN predictions.
# The label set comes from the fitted model instead of a hand-typed list, so the row names
# always line up with the classes the classifier was trained on, even if the data changes.
knn_labels = neigh.classes_
print(classification_report(
    y_test,
    pred,
    labels=knn_labels,
    target_names=[str(label) for label in knn_labels],
))

             precision    recall  f1-score   support

          2       0.42      0.11      0.17      7598
          3       0.44      0.69      0.54     19537
          4       0.35      0.38      0.37     16025
          5       0.32      0.25      0.28      9799
          6       0.33      0.16      0.21      4842
          7       0.21      0.00      0.01      1704
          8       0.00      0.00      0.00       109

avg / total       0.38      0.39      0.36     59614



  'precision', 'predicted', average, warn_for)


Observation: accuracy increases with the number of neighbors, but stops increasing beyond about 40 neighbors.

### Model Random forest

In [70]:
# Random forest of 100 trees, each limited to depth 2, with a pinned seed.
# This rebinds `clf`, which previously held the gradient-boosting model.
# NOTE(review): the recorded report for this model shows zero recall for five of the seven
# classes, so such shallow trees look too restrictive here -- consider a larger max_depth.
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)

In [71]:
# Train the forest on the current training split.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [72]:
# Random-forest predictions for the held-out split (overwrites the earlier y_pred).
y_pred = clf.predict(X_test)

In [73]:
# Fraction of held-out rows the random forest labels correctly.
accuracy_score(y_test, y_pred)

0.36935954641527158

In [74]:
# Per-class precision / recall / F1 for the random-forest predictions.
# The label set comes from the fitted model instead of a hand-typed list, so the row names
# always line up with the classes the classifier was trained on, even if the data changes.
forest_labels = clf.classes_
print(classification_report(
    y_test,
    y_pred,
    labels=forest_labels,
    target_names=[str(label) for label in forest_labels],
))

             precision    recall  f1-score   support

          2       0.00      0.00      0.00      7598
          3       0.40      0.87      0.54     19537
          4       0.30      0.31      0.31     16025
          5       0.00      0.00      0.00      9799
          6       0.00      0.00      0.00      4842
          7       0.00      0.00      0.00      1704
          8       0.00      0.00      0.00       109

avg / total       0.21      0.37      0.26     59614



  'precision', 'predicted', average, warn_for)


In [75]:
# Random-forest importance per feature, ranked from highest to lowest.
forest_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X_train.columns,
)
feature_importances = forest_importances.sort_values(by='importance', ascending=False)

In [76]:
# Show the ranked importance table for the random forest.
feature_importances

Unnamed: 0,importance
wbn_prod_type,0.203861
drcregion,0.130693
stbcount,0.12137
fttpproducttype,0.10912
ont_status,0.082448
videoordertype,0.067634
wbn_ont_required,0.063584
ontrequiredind,0.054596
wirecentergroupid,0.031845
winbackind,0.028162


### Model Logistic regression

In [79]:
# Logistic regression with scikit-learn's default settings, fitted on the bucketed label.
model = LogisticRegression()
model.fit(X_train, y_train)
L_Pred = model.predict(X_test)
# Keep the confusion matrix in a variable: as a bare mid-cell expression it was computed and
# then discarded, because a notebook only displays the value of a cell's last expression.
L_confusion = confusion_matrix(y_test, L_Pred)
# Cell output: fraction of held-out rows labelled correctly.
accuracy_score(y_test, L_Pred)

0.40002348441641228