# DATAQUEST 

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

## Importing libraries 

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor , RandomForestClassifier
import re
import seaborn as sns

from sklearn import metrics
import io

## Getting the data in Google Colab

In [3]:
PATH = './data/'
!ls {PATH}

sample_submission.csv  test.csv  train.csv


In [4]:
train_raw = pd.read_csv(f'{PATH}train.csv',low_memory=False,parse_dates=['opened_at'])

In [5]:
test_raw = pd.read_csv(f'{PATH}test.csv',low_memory=False,parse_dates=['opened_at'])

## Setting some lists to measure progress 

In [6]:
train_map = []
test_map = []

##  Fuctions

In [7]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [8]:
def print_score(m):
    print(metrics.mean_absolute_error(m.predict(X_train),y_train),metrics.mean_absolute_error(m.predict(X_valid),y_valid))

In [9]:
def put_in_metrics(n):
    train_map.append(metrics.mean_absolute_error(n.predict(X_train),y_train))
    test_map.append(metrics.mean_absolute_error(n.predict(X_valid),y_valid))

In [10]:
def split_vals(a,n): return a[:n].copy() , a[n:].copy()

## EDA (DATA PREPROCESSING) 

In [11]:
train_raw.head().T

Unnamed: 0,0,1,2,3,4
Id,INC0000045,INC0000047,INC0000057,INC0000060,INC0000062
reassignment_count,0,0,0,0,0
reopen_count,0,0,0,0,0
update_count,0,0,0,0,0
made_sla,True,True,True,True,True
opened_by,Opened by 8,Opened by 397,Opened by 8,Opened by 180,Opened by 180
opened_at,2016-02-29 01:16:00,2016-02-29 04:40:00,2016-02-29 06:10:00,2016-02-29 06:38:00,2016-02-29 06:58:00
contact_type,Phone,Phone,Phone,Phone,Phone
location,Location 143,Location 165,Location 204,Location 204,Location 93
category,Category 55,Category 40,Category 20,Category 9,Category 53


In [12]:
train_raw.shape , test_raw.shape

((20000, 19), (4918, 18))

In [13]:
train_raw.dtypes

Id                            object
reassignment_count             int64
reopen_count                   int64
update_count                   int64
made_sla                        bool
opened_by                     object
opened_at             datetime64[ns]
contact_type                  object
location                      object
category                      object
subcategory                   object
impact                        object
urgency                       object
priority                      object
assigned_to                   object
knowledge                       bool
notify                        object
vendor                        object
target_days                    int64
dtype: object

##  Dropping columns (based on their interdependecy and not use)

In [14]:
train_raw.drop(columns=['reassignment_count','reopen_count','Id','vendor'],inplace=True)

## Removing outliers

#### Outliers in update_count, opened_by

In [15]:
train_raw.drop(train_raw[train_raw['update_count'] !=0].index,inplace=True)

In [16]:
train_raw.update_count.value_counts()

0    19995
Name: update_count, dtype: int64

####  The columns opened_by,location,category,subcategory can be made integer directly by stripping away the alphacharacters .  The columns impact,urgency,priority could be splitted.

In [17]:
add_datepart(train_raw,'opened_at')

## Splitting column data 

In [18]:
# train_raw['opened_by'] = train_raw['opened_by'].str.split(expand=True)[2]
#  train_raw.opened_by.astype('int')

In [19]:
# train_raw['opened_by'].isnull().sum()

In [20]:
# train_raw.dtypes

In [21]:
# train_raw.opened_by.astype('int',errors='ignore')

In [22]:
# train_raw['opened_by_na'] = train_raw['opened_by'].copy().isna()

In [23]:
# train_raw[train_raw['opened_by'].isna()] = 0 

In [24]:
# train_raw.opened_by_na.value_counts()

#### Now since we have many null values and we got to deal with that. We could do one hot encoding . Make a column of opened_by_na and then give 1 to value having NaN values and 0 to rest . And then in opened_by column we could make all of them as 0. Maybe this is all is pre implemented in a function in fastai . Got to check that. 

In [25]:
train_cats(train_raw)

In [26]:
train_raw.impact.cat.set_categories(['3 - Low','2 - Medium','1 - High'],ordered=True,inplace=True)
train_raw.urgency.cat.set_categories(['3 - Low','2 - Medium','1 - High'],ordered=True,inplace=True)
train_raw.priority.cat.set_categories(train_raw.priority.cat.categories[::-1],ordered=True,inplace=True)

In [27]:
train_raw.isnull().sum().sort_index()/len(train_raw)

assigned_to                  0.270368
category                     0.001300
contact_type                 0.000000
impact                       0.000000
knowledge                    0.000000
location                     0.002601
made_sla                     0.000000
notify                       0.000000
opened_atDay                 0.000000
opened_atDayofweek           0.000000
opened_atDayofyear           0.000000
opened_atElapsed             0.000000
opened_atIs_month_end        0.000000
opened_atIs_month_start      0.000000
opened_atIs_quarter_end      0.000000
opened_atIs_quarter_start    0.000000
opened_atIs_year_end         0.000000
opened_atIs_year_start       0.000000
opened_atMonth               0.000000
opened_atWeek                0.000000
opened_atYear                0.000000
opened_by                    0.035709
priority                     0.000000
subcategory                  0.002801
target_days                  0.000000
update_count                 0.000000
urgency     

# Preprocessing

In [28]:
def preprocessing(df_raw):
    df = df_raw.copy()
    df.drop(columns=['reassignment_count','reopen_count','Id','vendor'],inplace=True)
    add_datepart(df,'opened_at')
    df['opened_by'] = df['opened_by'].str.split(expand=True)[2]
    return df

## Data after preprocessing 

In [29]:
display_all(train_raw.head().T)

Unnamed: 0,0,1,2,3,4
update_count,0,0,0,0,0
made_sla,True,True,True,True,True
opened_by,Opened by 8,Opened by 397,Opened by 8,Opened by 180,Opened by 180
contact_type,Phone,Phone,Phone,Phone,Phone
location,Location 143,Location 165,Location 204,Location 204,Location 93
category,Category 55,Category 40,Category 20,Category 9,Category 53
subcategory,Subcategory 170,Subcategory 215,Subcategory 125,Subcategory 97,Subcategory 168
impact,2 - Medium,2 - Medium,2 - Medium,2 - Medium,2 - Medium
urgency,2 - Medium,2 - Medium,2 - Medium,2 - Medium,2 - Medium
priority,3 - Moderate,3 - Moderate,3 - Moderate,3 - Moderate,3 - Moderate


# Preparing model

In [30]:
train , y, nas = proc_df(train_raw,'target_days')

In [31]:
n_valid = 3000 
n_trn = len(train) - n_valid
X_train , X_valid = split_vals(train,n_trn)
y_train , y_valid = split_vals(y,n_trn)

X_train.shape , X_valid.shape , y_train.shape , y_valid.shape

((16995, 26), (3000, 26), (16995,), (3000,))

In [32]:
#m = RandomForestClassifier(n_estimators=10,n_jobs=-1,random_state=3,min_samples_leaf=1)
#m.fit(X_train,y_train)

In [33]:
#print_score(m)

In [34]:
from xgboost import XGBClassifier

In [35]:
model = XGBClassifier(n_estimators=100,learning_rate=0.055)

In [36]:
#model.fit(X_train,y_train)

In [37]:
# print_score(model)

In [38]:
import keras

Using TensorFlow backend.


In [39]:
from keras.models import Sequential # intitialize the ANN
from keras.layers import Dense

In [40]:
t = Sequential()

In [41]:
t.add(Dense(units = 26, kernel_initializer = 'uniform', activation = 'relu', input_dim = 17))
t.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu'))
t.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu'))
t.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.


In [42]:
t.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [43]:
X_train.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16990,16991,16992,16993,16994,16995,16996,16997,16998,16999
update_count,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
made_sla,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
opened_by,148,92,148,25,25,25,13,13,45,25,...,141,64,22,45,11,137,4,40,45,141
contact_type,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
location,36,50,87,87,189,189,36,5,48,36,...,87,5,87,36,189,36,182,48,36,87
category,36,24,5,44,34,5,34,28,36,44,...,26,7,10,10,7,17,7,21,10,10
subcategory,60,98,25,213,57,25,57,108,75,11,...,103,64,65,65,159,64,64,32,65,65
impact,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
urgency,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
priority,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [44]:
t.fit(X_train, y_train, batch_size = 32, epochs = 200)

ValueError: Error when checking input: expected dense_1_input to have shape (17,) but got array with shape (26,)

# TEST 

In [None]:
Id = test_raw['Id']

In [None]:
test_raw = preprocessing(test_raw)

In [None]:
apply_cats(test_raw,train_raw)

In [None]:
test,_,_=proc_df(test_raw)

In [None]:
test.shape , train.shape

In [None]:
y_test = model.predict(test)

In [None]:
dict = {
    'Id':Id,
    'target_days':y_test
}

In [None]:
submit = pd.DataFrame(dict)

In [None]:
submit.to_csv('submit2.csv',header=True,index=False)

In [None]:
#files.download('submit2.csv')